From 3c59694025387fda56474e882f3f55e6fc5d46b4 Mon Sep 17 00:00:00 2001 From: Kainan Cha Date: Wed, 23 Jun 2021 15:26:25 +0800 Subject: [PATCH] Update internal to 1.1.32 SHA: 9aa0b0f Signed-off-by: Kainan Cha --- src/tim/vx/internal/BUILD | 11 +- src/tim/vx/internal/CMakeLists.txt | 2 - src/tim/vx/internal/include/interface/ops.def | 9 + .../internal/include/kernel/vsi_nn_kernel.h | 6 + .../{client => libnnext}/vsi_nn_vxkernel.h | 0 .../internal/include/ops/vsi_nn_op_conv1d.h | 20 +- .../include/ops/vsi_nn_op_grouped_conv1d.h | 55 + .../include/ops/vsi_nn_op_groupnormalize.h | 53 + .../internal/include/ops/vsi_nn_op_moments.h | 6 +- .../{vsi_nn_post.h => ops/vsi_nn_op_nms.h} | 18 +- .../internal/include/ops/vsi_nn_op_one_hot.h | 42 + .../vx/internal/include/ops/vsi_nn_op_pool.h | 12 +- .../internal/include/ops/vsi_nn_op_repeat.h | 54 + .../include/ops/vsi_nn_op_sequence_mask.h | 43 + .../include/ops/vsi_nn_op_strided_slice.h | 19 + .../internal/include/ops/vsi_nn_op_upsample.h | 2 +- .../include/utils/vsi_nn_constraint_check.h | 2 +- .../include/utils/vsi_nn_dtype_util_prv.h | 1 + src/tim/vx/internal/include/vsi_nn_graph.h | 12 + .../vx/internal/include/vsi_nn_node_type.h | 12 + src/tim/vx/internal/include/vsi_nn_version.h | 2 +- .../ops/kernel/cl/custom_softmax.cl} | 4 +- .../ops/kernel/cpu/custom_softmax_cpu.c | 194 + .../evis/custom_softmax.vx} | 7 +- .../ops/kernel/evis/custom_softmax_evis.c | 202 + .../ops/kernel/vsi_nn_kernel_custom_softmax.c | 231 - .../src/custom/ops/op_custom_softmax.c | 102 + .../src/custom/ops/vsi_nn_op_custom_softmax.c | 299 - src/tim/vx/internal/src/kernel/cl/argmax_cl.c | 21 +- src/tim/vx/internal/src/kernel/cl/argmin_cl.c | 20 +- src/tim/vx/internal/src/kernel/cl/cast_cl.c | 2 +- .../internal/src/kernel/cl/comparisons_cl.c | 6 + .../internal/src/kernel/cl/eltwise_unary_cl.c | 13 +- src/tim/vx/internal/src/kernel/cl/erf_cl.c | 328 + .../vx/internal/src/kernel/cl/floordiv_cl.c | 5 +- .../src/kernel/cl/group_normalization_cl.c | 760 ++ .../vx/internal/src/kernel/cl/moments_cl.c | 88 +- .../vx/internal/src/kernel/cl/one_hot_cl.c | 332 + .../src/kernel/cl/reducemax_internal_cl.c | 8 + src/tim/vx/internal/src/kernel/cl/repeat_cl.c | 407 + .../internal/src/kernel/cl/sequence_mask_cl.c | 354 + src/tim/vx/internal/src/kernel/cl/slice_cl.c | 308 + .../vx/internal/src/kernel/cpu/argmax_cpu.c | 2 +- .../vx/internal/src/kernel/cpu/argmin_cpu.c | 2 +- .../cpu/axis_aligned_bbox_transform_cpu.c | 279 + .../src/kernel/cpu/batchnorm_single_cpu.c | 2 +- .../internal/src/kernel/cpu/comparisons_cpu.c | 6 +- .../src/kernel/cpu/conv1d_ovxlib_cpu.c | 264 + .../src/kernel/cpu/depth2space_internal_cpu.c | 2 +- .../src/kernel/cpu/eltwise_unary_cpu.c | 14 +- src/tim/vx/internal/src/kernel/cpu/erf_cpu.c | 229 + .../vx/internal/src/kernel/cpu/gather_cpu.c | 2 +- .../internal/src/kernel/cpu/gather_nd_cpu.c | 2 +- .../src/kernel/cpu/group_normalization_cpu.c | 315 + .../kernel/cpu/instance_normalization_cpu.c | 2 +- .../src/kernel/cpu/layer_normalization_cpu.c | 6 +- .../internal/src/kernel/cpu/log_softmax_cpu.c | 2 +- .../internal/src/kernel/cpu/matrixmul_cpu.c | 2 +- .../vx/internal/src/kernel/cpu/maximum_cpu.c | 2 +- .../vx/internal/src/kernel/cpu/minimum_cpu.c | 2 +- .../vx/internal/src/kernel/cpu/moments_cpu.c | 2 +- src/tim/vx/internal/src/kernel/cpu/nms_cpu.c | 441 + .../vx/internal/src/kernel/cpu/one_hot_cpu.c | 252 + src/tim/vx/internal/src/kernel/cpu/pow_cpu.c | 2 +- .../src/kernel/cpu/pre_process_bgra_cpu.c | 2 +- .../src/kernel/cpu/pre_process_gray_cpu.c | 2 +- 
.../src/kernel/cpu/pre_process_nv12_cpu.c | 2 +- .../src/kernel/cpu/pre_process_rgb_cpu.c | 2 +- .../src/kernel/cpu/pre_process_yuv420_cpu.c | 2 +- .../src/kernel/cpu/pre_process_yuv444_cpu.c | 2 +- .../vx/internal/src/kernel/cpu/prelu_cpu.c | 2 +- .../src/kernel/cpu/random_multinomial_cpu.c | 2 +- .../vx/internal/src/kernel/cpu/repeat_cpu.c | 286 + .../internal/src/kernel/cpu/scatter_nd_cpu.c | 2 +- .../src/kernel/cpu/sequence_mask_cpu.c | 248 + .../vx/internal/src/kernel/cpu/slice_cpu.c | 246 + .../src/kernel/cpu/space2depth_internal_cpu.c | 2 +- src/tim/vx/internal/src/kernel/cpu/tile_cpu.c | 2 +- src/tim/vx/internal/src/kernel/cpu/topk_cpu.c | 297 + .../src/kernel/evis/batchnorm_single_evis.c | 152 +- .../vx/internal/src/kernel/evis/cast_evis.c | 1 + .../src/kernel/evis/comparisons_evis.c | 1 + .../src/kernel/evis/conv1d_ovxlib_evis.c | 702 ++ .../kernel/evis/depth2space_internal_evis.c | 166 +- .../src/kernel/evis/depthwise_conv1d_evis.c | 24 +- .../src/kernel/evis/eltwise_unary_evis.c | 29 + .../vx/internal/src/kernel/evis/erf_evis.c | 428 + .../vx/internal/src/kernel/evis/gather_evis.c | 109 +- .../internal/src/kernel/evis/gather_nd_evis.c | 9 + .../kernel/evis/group_normalization_evis.c | 1219 +++ .../kernel/evis/instance_normalization_evis.c | 355 +- .../kernel/evis/layer_normalization_evis.c | 189 +- .../internal/src/kernel/evis/maximum_evis.c | 2 +- .../internal/src/kernel/evis/minimum_evis.c | 2 +- .../internal/src/kernel/evis/one_hot_evis.c | 460 + .../src/kernel/evis/pre_process_rgb_evis.c | 85 +- .../src/kernel/evis/pre_process_yuv420_evis.c | 11 +- .../src/kernel/evis/pre_process_yuv444_evis.c | 11 +- .../vx/internal/src/kernel/evis/repeat_evis.c | 609 ++ .../src/kernel/evis/resize_bilinear_evis.c | 232 +- .../src/kernel/evis/sequence_mask_evis.c | 393 + .../vx/internal/src/kernel/evis/slice_evis.c | 451 + .../vx/internal/src/kernel/evis/tile_evis.c | 96 +- .../internal/src/kernel/vsi_nn_kernel_param.c | 41 + .../src/kernel/vsi_nn_kernel_selector.c | 16 + src/tim/vx/internal/src/kernel/vx/erf_vx.c | 216 + .../src/libnnext/ops/cl/eltwise_ops_helper.cl | 56 + .../src/libnnext/ops/cl/eltwise_unary.cl | 10 +- .../vx/internal/src/libnnext/ops/cl/erf.cl | 113 + .../internal/src/libnnext/ops/cl/floordiv.cl | 84 + .../ops/cl/group_normalization_f32.cl | 248 + .../ops/cl/group_normalization_i32.cl | 278 + .../libnnext/ops/cl/group_normalization_u8.cl | 287 + .../internal/src/libnnext/ops/cl/matrixmul.cl | 4 +- .../internal/src/libnnext/ops/cl/one_hot.cl | 130 + .../vx/internal/src/libnnext/ops/cl/repeat.cl | 176 + .../src/libnnext/ops/cl/sequence_mask.cl | 72 + .../vx/internal/src/libnnext/ops/cl/slice.cl | 144 + ...si_nn_kernel_axis_aligned_bbox_transform.c | 275 - .../kernel/vsi_nn_kernel_box_with_nms_limit.c | 2 +- .../ops/kernel/vsi_nn_kernel_extra_ending.c | 2 +- .../kernel/vsi_nn_kernel_generate_proposals.c | 483 - .../vsi_nn_kernel_heatmap_max_keypoint.c | 2 +- .../ops/kernel/vsi_nn_kernel_imageprocess.c | 2 +- .../ops/kernel/vsi_nn_kernel_signalframe.c | 2 +- .../vsi_nn_kernel_spatial_transformer.c | 2 +- .../ops/kernel/vsi_nn_kernel_sync_host.c | 2 +- .../kernel/vsi_nn_kernel_tensorstackconcat.c | 2 +- .../libnnext/ops/kernel/vsi_nn_kernel_topk.c | 266 - .../src/libnnext/ops/vx/batchnorm_single.vx | 8 +- .../libnnext/ops/vx/batchnorm_single_f32.vx | 267 + .../src/libnnext/ops/vx/conv1d_ovxlib.vx | 151 + .../libnnext/ops/vx/conv1d_ovxlib_k1024.vx | 167 + .../src/libnnext/ops/vx/depth2space_crd.vx | 242 +- .../src/libnnext/ops/vx/eltwise_unary_2d.vx | 19 +- 
.../src/libnnext/ops/vx/eltwise_unary_3d.vx | 20 +- .../vx/internal/src/libnnext/ops/vx/erf.vx | 174 + .../src/libnnext/ops/vx/gather_array.vx | 157 + .../internal/src/libnnext/ops/vx/gather_nd.vx | 20 +- .../src/libnnext/ops/vx/gather_nd_2d.vx | 20 +- .../src/libnnext/ops/vx/gather_nd_2d_mix.vx | 10 +- .../src/libnnext/ops/vx/gather_nd_3d.vx | 22 +- .../src/libnnext/ops/vx/gather_nd_3d_mix.vx | 10 +- .../src/libnnext/ops/vx/gather_nd_mix.vx | 10 +- .../ops/vx/group_normalization_f16.vx | 306 + .../ops/vx/group_normalization_i16.vx | 339 + .../libnnext/ops/vx/group_normalization_i8.vx | 317 + .../libnnext/ops/vx/group_normalization_u8.vx | 261 + .../ops/vx/group_normalization_u8_f16.vx | 114 + .../ops/vx/instance_normalization_f16.vx | 65 +- .../ops/vx/instance_normalization_i16.vx | 72 +- .../ops/vx/instance_normalization_i8.vx | 234 +- .../vx/instance_normalization_scale_f32.vx | 285 + .../instance_normalization_scale_f32_bf16.vx | 253 + .../instance_normalization_scale_f32_f16.vx | 143 + .../ops/vx/instance_normalization_u8.vx | 287 +- .../ops/vx/instance_normalization_u8_f16.vx | 147 + .../ops/vx/layer_normalization_scale_f32.vx | 275 + .../vx/layer_normalization_scale_f32_2d.vx | 237 + .../vx/layer_normalization_scale_f32_bf16.vx | 159 + .../internal/src/libnnext/ops/vx/one_hot.vx | 205 + .../libnnext/ops/vx/pre_process_rgb_copy.vx | 65 +- .../ops/vx/pre_process_yuv420_copy_u8.vx | 108 +- .../ops/vx/pre_process_yuv444_copy_u8.vx | 121 +- .../vx/internal/src/libnnext/ops/vx/repeat.vx | 224 + .../src/libnnext/ops/vx/repeat_axis1.vx | 232 + .../ops/vx/resize_bilinear_U8_UP_2X.vx | 65 - .../resize_bilinear_U8_half_pixel_centers.vx | 229 + .../src/libnnext/ops/vx/sequence_mask.vx | 150 + .../vx/internal/src/libnnext/ops/vx/slice.vx | 239 + .../vx/internal/src/libnnext/ops/vx/tile.vx | 37 + ...i_nn_kernel_axis_aligned_bbox_transform.vx | 8 - .../vx/vsi_nn_kernel_generate_proposals.vx | 8 - .../libnnext/ops/vx/vsi_nn_kernel_header.vx | 56 + .../src/libnnext/vsi_nn_libnnext_resource.c | 8145 +++++++++++++++-- .../{client => libnnext}/vsi_nn_vxkernel.c | 5 +- src/tim/vx/internal/src/makefile.linux | 16 +- .../vx/internal/src/ops/vsi_nn_op_argmaxmin.c | 1 + .../vsi_nn_op_axis_aligned_bbox_transform.c | 192 +- .../internal/src/ops/vsi_nn_op_batch_norm.c | 202 +- .../src/ops/vsi_nn_op_batchnorm_single.c | 26 +- .../vsi_nn_op_bidirectional_sequence_lstm.c | 10 +- .../vsi_nn_op_bidirectional_sequence_rnn.c | 2 +- .../src/ops/vsi_nn_op_box_with_nms_limit.c | 2 +- src/tim/vx/internal/src/ops/vsi_nn_op_cast.c | 99 +- src/tim/vx/internal/src/ops/vsi_nn_op_ceil.c | 110 + src/tim/vx/internal/src/ops/vsi_nn_op_clip.c | 3 +- .../vx/internal/src/ops/vsi_nn_op_conv1d.c | 183 +- .../vx/internal/src/ops/vsi_nn_op_conv2d.c | 1 + src/tim/vx/internal/src/ops/vsi_nn_op_crop.c | 2 +- .../internal/src/ops/vsi_nn_op_dataconvert.c | 131 +- .../src/ops/vsi_nn_op_deconvolution.c | 10 +- .../src/ops/vsi_nn_op_deconvolution1d.c | 28 +- .../vx/internal/src/ops/vsi_nn_op_dropout.c | 2 +- .../vx/internal/src/ops/vsi_nn_op_eltwise.c | 114 +- .../src/ops/vsi_nn_op_eltwise_unary.c | 1 + src/tim/vx/internal/src/ops/vsi_nn_op_erf.c | 128 + .../internal/src/ops/vsi_nn_op_extra_ending.c | 2 +- .../vx/internal/src/ops/vsi_nn_op_floordiv.c | 6 +- .../internal/src/ops/vsi_nn_op_fullconnect.c | 7 +- .../internal/src/ops/vsi_nn_op_fullconnect2.c | 2 +- .../vx/internal/src/ops/vsi_nn_op_gather.c | 14 +- .../vx/internal/src/ops/vsi_nn_op_gather_nd.c | 9 +- .../src/ops/vsi_nn_op_generate_proposals.c | 205 +- 
.../src/ops/vsi_nn_op_grouped_conv1d.c | 207 + .../src/ops/vsi_nn_op_grouped_conv2d.c | 8 +- .../src/ops/vsi_nn_op_groupnormalize.c | 297 + .../internal/src/ops/vsi_nn_op_gru_ovxlib.c | 13 +- .../src/ops/vsi_nn_op_grucell_ovxlib.c | 93 +- .../src/ops/vsi_nn_op_heatmap_max_keypoint.c | 2 +- .../internal/src/ops/vsi_nn_op_imageprocess.c | 2 +- .../src/ops/vsi_nn_op_instancenormalize.c | 9 +- .../vx/internal/src/ops/vsi_nn_op_interp.c | 1 + .../src/ops/vsi_nn_op_l2normalizescale.c | 133 +- .../src/ops/vsi_nn_op_layernormalize.c | 27 +- .../internal/src/ops/vsi_nn_op_lstm_ovxlib.c | 2 +- .../src/ops/vsi_nn_op_lstmunit_activation.c | 8 +- .../src/ops/vsi_nn_op_lstmunit_ovxlib.c | 8 +- .../vx/internal/src/ops/vsi_nn_op_matrixmul.c | 3 + .../vx/internal/src/ops/vsi_nn_op_moments.c | 118 +- src/tim/vx/internal/src/ops/vsi_nn_op_nms.c | 136 + .../vx/internal/src/ops/vsi_nn_op_one_hot.c | 176 + src/tim/vx/internal/src/ops/vsi_nn_op_pool.c | 276 +- .../src/ops/vsi_nn_op_poolwithargmax.c | 14 +- .../internal/src/ops/vsi_nn_op_post_process.c | 2 +- .../internal/src/ops/vsi_nn_op_pre_process.c | 2 +- .../src/ops/vsi_nn_op_pre_process_bgra.c | 3 +- .../src/ops/vsi_nn_op_pre_process_gray.c | 6 +- .../src/ops/vsi_nn_op_pre_process_nv12.c | 4 + .../src/ops/vsi_nn_op_pre_process_rgb.c | 6 +- .../src/ops/vsi_nn_op_pre_process_tensor.c | 2 +- .../src/ops/vsi_nn_op_pre_process_yuv420.c | 4 + .../src/ops/vsi_nn_op_pre_process_yuv444.c | 4 + src/tim/vx/internal/src/ops/vsi_nn_op_prelu.c | 2 +- .../src/ops/vsi_nn_op_quantized_16bit_lstm.c | 2 +- .../vx/internal/src/ops/vsi_nn_op_reduce.c | 2 +- .../src/ops/vsi_nn_op_reduce_internal.c | 2 + .../src/ops/vsi_nn_op_reducesum_internal.c | 2 +- .../src/ops/vsi_nn_op_relational_ops.c | 15 + .../internal/src/ops/vsi_nn_op_relu_keras.c | 2 +- .../src/ops/vsi_nn_op_relu_keras_internal.c | 2 +- .../vx/internal/src/ops/vsi_nn_op_repeat.c | 340 + .../vx/internal/src/ops/vsi_nn_op_resize.c | 2 +- .../src/ops/vsi_nn_op_resize_internal.c | 2 +- .../ops/vsi_nn_op_resize_nearest_internal.c | 2 +- .../vx/internal/src/ops/vsi_nn_op_reverse.c | 336 +- .../src/ops/vsi_nn_op_rnncell_ovxlib.c | 2 +- .../vx/internal/src/ops/vsi_nn_op_roi_align.c | 2 +- src/tim/vx/internal/src/ops/vsi_nn_op_scale.c | 2 +- .../src/ops/vsi_nn_op_sequence_mask.c | 176 + .../src/ops/vsi_nn_op_shufflechannel.c | 2 +- .../internal/src/ops/vsi_nn_op_signalframe.c | 2 +- src/tim/vx/internal/src/ops/vsi_nn_op_slice.c | 107 +- .../src/ops/vsi_nn_op_softmax_internal.c | 2 +- .../internal/src/ops/vsi_nn_op_space2depth.c | 2 +- .../src/ops/vsi_nn_op_spatial_transformer.c | 2 +- src/tim/vx/internal/src/ops/vsi_nn_op_split.c | 28 +- src/tim/vx/internal/src/ops/vsi_nn_op_stack.c | 2 +- .../src/ops/vsi_nn_op_strided_slice.c | 236 +- src/tim/vx/internal/src/ops/vsi_nn_op_svdf.c | 18 +- .../vx/internal/src/ops/vsi_nn_op_sync_host.c | 2 +- .../vsi_nn_op_tensor_add_mean_stddev_norm.c | 2 +- .../src/ops/vsi_nn_op_tensorstackconcat.c | 2 +- src/tim/vx/internal/src/ops/vsi_nn_op_topk.c | 203 +- .../vsi_nn_op_unidirectional_sequence_rnn.c | 2 +- .../vx/internal/src/ops/vsi_nn_op_unstack.c | 2 +- .../vx/internal/src/ops/vsi_nn_op_upsample.c | 2 +- .../src/utils/vsi_nn_code_generator.c | 9 + .../src/utils/vsi_nn_constraint_check.c | 19 +- src/tim/vx/internal/src/utils/vsi_nn_dtype.c | 3 + src/tim/vx/internal/src/vsi_nn_graph.c | 155 +- .../internal/src/vsi_nn_graph_optimization.c | 21 +- .../vx/internal/src/vsi_nn_internal_node.c | 3 +- src/tim/vx/internal/src/vsi_nn_node.c | 2 +- .../internal/src/vsi_nn_node_attr_template.c | 3 + 
.../vx/internal/src/vsi_nn_pre_post_process.c | 32 +- src/tim/vx/internal/src/vsi_nn_tensor.c | 55 +- 277 files changed, 30752 insertions(+), 4475 deletions(-) rename src/tim/vx/internal/include/{client => libnnext}/vsi_nn_vxkernel.h (100%) create mode 100644 src/tim/vx/internal/include/ops/vsi_nn_op_grouped_conv1d.h create mode 100644 src/tim/vx/internal/include/ops/vsi_nn_op_groupnormalize.h rename src/tim/vx/internal/include/{vsi_nn_post.h => ops/vsi_nn_op_nms.h} (84%) create mode 100644 src/tim/vx/internal/include/ops/vsi_nn_op_one_hot.h create mode 100644 src/tim/vx/internal/include/ops/vsi_nn_op_repeat.h create mode 100644 src/tim/vx/internal/include/ops/vsi_nn_op_sequence_mask.h rename src/tim/vx/internal/src/{libnnext/ops/vx/vsi_nn_kernel_topk.vx => custom/ops/kernel/cl/custom_softmax.cl} (63%) create mode 100644 src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_softmax_cpu.c rename src/tim/vx/internal/src/custom/ops/{vx/vsi_nn_kernel_custom_softmax.vx => kernel/evis/custom_softmax.vx} (87%) create mode 100644 src/tim/vx/internal/src/custom/ops/kernel/evis/custom_softmax_evis.c delete mode 100644 src/tim/vx/internal/src/custom/ops/kernel/vsi_nn_kernel_custom_softmax.c create mode 100644 src/tim/vx/internal/src/custom/ops/op_custom_softmax.c delete mode 100644 src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_softmax.c create mode 100644 src/tim/vx/internal/src/kernel/cl/erf_cl.c create mode 100644 src/tim/vx/internal/src/kernel/cl/group_normalization_cl.c create mode 100644 src/tim/vx/internal/src/kernel/cl/one_hot_cl.c create mode 100644 src/tim/vx/internal/src/kernel/cl/repeat_cl.c create mode 100644 src/tim/vx/internal/src/kernel/cl/sequence_mask_cl.c create mode 100644 src/tim/vx/internal/src/kernel/cl/slice_cl.c create mode 100644 src/tim/vx/internal/src/kernel/cpu/axis_aligned_bbox_transform_cpu.c create mode 100644 src/tim/vx/internal/src/kernel/cpu/conv1d_ovxlib_cpu.c create mode 100644 src/tim/vx/internal/src/kernel/cpu/erf_cpu.c create mode 100644 src/tim/vx/internal/src/kernel/cpu/group_normalization_cpu.c create mode 100644 src/tim/vx/internal/src/kernel/cpu/nms_cpu.c create mode 100644 src/tim/vx/internal/src/kernel/cpu/one_hot_cpu.c create mode 100644 src/tim/vx/internal/src/kernel/cpu/repeat_cpu.c create mode 100644 src/tim/vx/internal/src/kernel/cpu/sequence_mask_cpu.c create mode 100644 src/tim/vx/internal/src/kernel/cpu/slice_cpu.c create mode 100644 src/tim/vx/internal/src/kernel/cpu/topk_cpu.c create mode 100644 src/tim/vx/internal/src/kernel/evis/conv1d_ovxlib_evis.c create mode 100644 src/tim/vx/internal/src/kernel/evis/erf_evis.c create mode 100644 src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c create mode 100644 src/tim/vx/internal/src/kernel/evis/one_hot_evis.c create mode 100644 src/tim/vx/internal/src/kernel/evis/repeat_evis.c create mode 100644 src/tim/vx/internal/src/kernel/evis/sequence_mask_evis.c create mode 100644 src/tim/vx/internal/src/kernel/evis/slice_evis.c create mode 100644 src/tim/vx/internal/src/kernel/vx/erf_vx.c create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/erf.cl create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/group_normalization_f32.cl create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/group_normalization_i32.cl create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/group_normalization_u8.cl create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/one_hot.cl create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/repeat.cl create mode 100644 
src/tim/vx/internal/src/libnnext/ops/cl/sequence_mask.cl create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/slice.cl delete mode 100644 src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_axis_aligned_bbox_transform.c delete mode 100644 src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_generate_proposals.c delete mode 100644 src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_topk.c create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/batchnorm_single_f32.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/conv1d_ovxlib.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/conv1d_ovxlib_k1024.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/erf.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/gather_array.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_f16.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_i16.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_i8.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_u8.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_u8_f16.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32_bf16.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32_f16.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_u8_f16.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_scale_f32.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_scale_f32_2d.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_scale_f32_bf16.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/one_hot.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/repeat.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/repeat_axis1.vx delete mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_UP_2X.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_half_pixel_centers.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/sequence_mask.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/slice.vx delete mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_axis_aligned_bbox_transform.vx delete mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_generate_proposals.vx rename src/tim/vx/internal/src/{client => libnnext}/vsi_nn_vxkernel.c (98%) create mode 100644 src/tim/vx/internal/src/ops/vsi_nn_op_ceil.c create mode 100644 src/tim/vx/internal/src/ops/vsi_nn_op_erf.c create mode 100644 src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv1d.c create mode 100644 src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c create mode 100644 src/tim/vx/internal/src/ops/vsi_nn_op_nms.c create mode 100644 src/tim/vx/internal/src/ops/vsi_nn_op_one_hot.c create mode 100644 src/tim/vx/internal/src/ops/vsi_nn_op_repeat.c create mode 100644 src/tim/vx/internal/src/ops/vsi_nn_op_sequence_mask.c diff --git a/src/tim/vx/internal/BUILD b/src/tim/vx/internal/BUILD index 1803e76..97e4591 100644 --- a/src/tim/vx/internal/BUILD +++ b/src/tim/vx/internal/BUILD @@ -69,7 +69,9 @@ filegroup( name = "custom_srcs", srcs = glob([ "src/custom/ops/*.c", - "src/custom/ops/kernel/*.c", + 
"src/custom/ops/kernel/evis/*.c", + "src/custom/ops/kernel/cl/*.c", + "src/custom/ops/kernel/cpu/*.c", ]) ) @@ -128,7 +130,6 @@ cc_library( "include/quantization/vsi_nn_asymmetric_affine.h", "include/quantization/vsi_nn_dynamic_fixed_point.h", "include/quantization/vsi_nn_perchannel_symmetric_affine.h", - "include/client/vsi_nn_vxkernel.h", "include/interface/ops.def", "include/kernel/vsi_nn_kernel.h", "include/kernel/vsi_nn_gpu.h", @@ -139,6 +140,7 @@ cc_library( "include/vsi_nn_error.h", # libnnext + "include/libnnext/vsi_nn_vxkernel.h", "include/libnnext/vx_lib_nnext.h", "include/libnnext/vsi_nn_libnnext_resource.h", @@ -167,7 +169,6 @@ cc_library( "src/vsi_nn_daemon.c", "src/vsi_nn_graph_optimization.c", "src/vsi_nn_pre_post_process.c", - "src/client/vsi_nn_vxkernel.c", "src/utils/vsi_nn_link_list.c", "src/utils/vsi_nn_util.c", "src/utils/vsi_nn_math.c", @@ -200,12 +201,10 @@ cc_library( "src/libnnext/ops/kernel/vsi_nn_kernel_sync_host.c", "src/libnnext/ops/kernel/vsi_nn_kernel_spatial_transformer.c", "src/libnnext/ops/kernel/vsi_nn_kernel_extra_ending.c", - "src/libnnext/ops/kernel/vsi_nn_kernel_topk.c", "src/libnnext/ops/kernel/vsi_nn_kernel_heatmap_max_keypoint.c", - "src/libnnext/ops/kernel/vsi_nn_kernel_axis_aligned_bbox_transform.c", "src/libnnext/ops/kernel/vsi_nn_kernel_box_with_nms_limit.c", - "src/libnnext/ops/kernel/vsi_nn_kernel_generate_proposals.c", "src/libnnext/vsi_nn_libnnext_resource.c", + "src/libnnext/vsi_nn_vxkernel.c", ] + [":kernel_srcs"] + [":operation_srcs"] + [":custom_srcs"], diff --git a/src/tim/vx/internal/CMakeLists.txt b/src/tim/vx/internal/CMakeLists.txt index f09bb9a..ee9fc3f 100644 --- a/src/tim/vx/internal/CMakeLists.txt +++ b/src/tim/vx/internal/CMakeLists.txt @@ -12,7 +12,6 @@ aux_source_directory(src/kernel/cpu INTERNAL_KERNEL_CPU) aux_source_directory(src/kernel/evis INTERNAL_KERNEL_EVIS) aux_source_directory(src/kernel/vx INTERNAL_KERNEL_VX) aux_source_directory(src/ops INTERNAL_OPS) -aux_source_directory(src/client INTERNAL_CLIENT) aux_source_directory(src/libnnext INTERNAL_LIBNNEXT) aux_source_directory(src/libnnext/ops/kernel INTERNAL_LIBNNEXT_OPS_KERNEL) aux_source_directory(src/quantization INTERNAL_QUANTIZATION) @@ -29,7 +28,6 @@ list(APPEND SRC ${INTERNAL_KERNEL_EVIS} ${INTERNAL_KERNEL_VX} ${INTERNAL_OPS} - ${INTERNAL_CLIENT} ${INTERNAL_LIBNNEXT} ${INTERNAL_LIBNNEXT_OPS_KERNEL} ${INTERNAL_QUANTIZATION} diff --git a/src/tim/vx/internal/include/interface/ops.def b/src/tim/vx/internal/include/interface/ops.def index 523f299..6315513 100644 --- a/src/tim/vx/internal/include/interface/ops.def +++ b/src/tim/vx/internal/include/interface/ops.def @@ -147,3 +147,12 @@ DEF_OP(DECONVOLUTION1D) DEF_OP(INTERP) DEF_OP(RESIZE_1D) DEF_OP(UPSAMPLESCALE) +DEF_OP(GROUP_NORM) +DEF_OP(ROUND) +DEF_OP(CEIL) +DEF_OP(SEQUENCE_MASK) +DEF_OP(REPEAT) +DEF_OP(ERF) +DEF_OP(ONE_HOT) +DEF_OP(NMS) +DEF_OP(GROUPED_CONV1D) diff --git a/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h b/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h index c5c8b2c..9d89a4a 100644 --- a/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h +++ b/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h @@ -244,6 +244,12 @@ vsi_bool vsi_nn_kernel_param_add_buffer void * vsi_nn_kernel_param_get_buffer ( const vsi_nn_kernel_param_t * params, const char * key, size_t * size); +vsi_bool vsi_nn_kernel_param_add_const_buffer + ( vsi_nn_kernel_param_t * params, const char * key, const void * buf, size_t size); + +const void * vsi_nn_kernel_param_get_const_buffer + ( const vsi_nn_kernel_param_t * params, 
const char * key, size_t * size); + /** Kernel register */ #define REGISTER_KERNEL_BACKEND(kernel_name, kernel_type, func) \ _INITIALIZER(_register_kernel_##kernel_name##_##kernel_type) \ diff --git a/src/tim/vx/internal/include/client/vsi_nn_vxkernel.h b/src/tim/vx/internal/include/libnnext/vsi_nn_vxkernel.h similarity index 100% rename from src/tim/vx/internal/include/client/vsi_nn_vxkernel.h rename to src/tim/vx/internal/include/libnnext/vsi_nn_vxkernel.h diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_conv1d.h b/src/tim/vx/internal/include/ops/vsi_nn_op_conv1d.h index 9d216ff..5fa5041 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_conv1d.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_conv1d.h @@ -30,17 +30,19 @@ extern "C" { #endif -typedef struct _vsi_nn_conv1d_lcl_data_t -{ - vx_tensor input_tensor; - vx_tensor weight_tensor; - vx_tensor output_tensor; -} vsi_nn_conv1d_lcl_data_t; - typedef struct _vsi_nn_conv1d_param { /* local data must be the first. */ - vsi_nn_conv1d_lcl_data_t local; + union + { + struct _conv1d_local_data_t *local; + + struct { + vx_tensor input_tensor; + vx_tensor weight_tensor; + vx_tensor output_tensor; + } reserved; + }; uint32_t ksize; uint32_t stride; @@ -53,6 +55,8 @@ typedef struct _vsi_nn_conv1d_param uint32_t dilation; int32_t multiplier; } vsi_nn_conv1d_param; +_compiler_assert(offsetof(vsi_nn_conv1d_param, local) == 0, \ + vsi_nn_vsi_nn_conv1d_h ); #ifdef __cplusplus } diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_grouped_conv1d.h b/src/tim/vx/internal/include/ops/vsi_nn_op_grouped_conv1d.h new file mode 100644 index 0000000..f9470ee --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_grouped_conv1d.h @@ -0,0 +1,55 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_GROUPED_CONV1D_H +#define _VSI_NN_OP_GROUPED_CONV1D_H + +#include "vsi_nn_types.h" + +typedef struct _grouped_conv1d_local_data_t { + vsi_nn_tensor_t* input; + vsi_nn_tensor_t* weight; + vsi_nn_tensor_t* output; + +} grouped_conv1d_local_data_t; + +typedef struct _vsi_nn_grouped_conv1d_param +{ + grouped_conv1d_local_data_t *local; + + uint32_t ksize; + uint32_t stride; + /* Pad left, right, top, bottom */ + uint32_t pad[2]; + /* Pad type default value shall be AUTO */ + vsi_nn_pad_e pad_type; + uint32_t weights; + uint32_t group; + uint32_t dilation; + int32_t multiplier; +} vsi_nn_grouped_conv1d_param; + + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_groupnormalize.h b/src/tim/vx/internal/include/ops/vsi_nn_op_groupnormalize.h new file mode 100644 index 0000000..417a4cf --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_groupnormalize.h @@ -0,0 +1,53 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#ifndef _VSI_NN_OP_CLIENT_GROUPNORMALIZE_H +#define _VSI_NN_OP_CLIENT_GROUPNORMALIZE_H + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_groupnorm_lcl_data +{ + /* handle 3D group norm */ + vsi_nn_tensor_t *reshaped_input; + vsi_nn_tensor_t *reshaped_output; +} vsi_nn_groupnorm_lcl_data; + +typedef struct _vsi_nn_groupnormalize_param +{ + /* local data must be the first. 
*/ + vsi_nn_groupnorm_lcl_data* lcl_data; + float eps; + int32_t group_num; +} vsi_nn_groupnormalize_param; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_moments.h b/src/tim/vx/internal/include/ops/vsi_nn_op_moments.h index c9f39ed..fd6427a 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_moments.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_moments.h @@ -32,9 +32,9 @@ extern "C" { typedef struct _vsi_nn_moments_param { - int32_t* axis; - int32_t axis_num; - vsi_bool keep_dim; + const int32_t* axis; + int32_t axis_num; + vsi_bool keep_dim; } vsi_nn_moments_param; #ifdef __cplusplus diff --git a/src/tim/vx/internal/include/vsi_nn_post.h b/src/tim/vx/internal/include/ops/vsi_nn_op_nms.h similarity index 84% rename from src/tim/vx/internal/include/vsi_nn_post.h rename to src/tim/vx/internal/include/ops/vsi_nn_op_nms.h index 61fe75f..174bb10 100644 --- a/src/tim/vx/internal/include/vsi_nn_post.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_nms.h @@ -21,10 +21,18 @@ * DEALINGS IN THE SOFTWARE. * *****************************************************************************/ -#ifndef _VSI_NN_POST_H -#define _VSI_NN_POST_H -#include "post/vsi_nn_post_fasterrcnn.h" -#include "post/vsi_nn_post_cmupose.h" +#ifndef _VSI_NN_OP_NMS_H +#define _VSI_NN_OP_NMS_H -#endif \ No newline at end of file +#include "vsi_nn_types.h" + +typedef struct _vsi_nn_nms_param +{ + int32_t max_output_size; + float iou_threshold; + float score_threshold; + float soft_nms_sigma; +} vsi_nn_nms_param; + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_one_hot.h b/src/tim/vx/internal/include/ops/vsi_nn_op_one_hot.h new file mode 100644 index 0000000..5cad574 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_one_hot.h @@ -0,0 +1,42 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_ONE_HOT_H +#define _VSI_NN_OP_ONE_HOT_H + +#include "vsi_nn_types.h" + +typedef struct _vsi_nn_one_hot_param +{ + struct _one_hot_local_data_t* local; + + int32_t depth; + float on_value; + float off_value; + int32_t axis; +} vsi_nn_one_hot_param; +_compiler_assert(offsetof(vsi_nn_one_hot_param, local) == 0, \ + vsi_nn_one_hot_h ); + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pool.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pool.h index 979d22c..ee32df3 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_pool.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pool.h @@ -30,12 +30,12 @@ extern "C" { #endif -#define _VSI_NN_POOLWITHARGMAX_LOCAL_TENSOR_NUM 3 - -typedef struct _vsi_nn_poolwithargmax_lcl_data +typedef struct _vsi_nn_pool_lcl_data { - vx_tensor local_tensor[_VSI_NN_POOLWITHARGMAX_LOCAL_TENSOR_NUM]; -} vsi_nn_poolwithargmax_lcl_data; + /* handle pool1d */ + vsi_nn_tensor_t *reshaped_input; + vsi_nn_tensor_t *reshaped_output; +} vsi_nn_pool_lcl_data; typedef struct _vsi_nn_pool_param { @@ -49,7 +49,7 @@ typedef struct _vsi_nn_pool_param /* Pad type default value shall be AUTO */ vsi_nn_pad_e pad_type; /* poolwithargmax layer local data structure */ - vsi_nn_poolwithargmax_lcl_data local; + vsi_nn_pool_lcl_data *local; } vsi_nn_pool_param; #ifdef __cplusplus diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_repeat.h b/src/tim/vx/internal/include/ops/vsi_nn_op_repeat.h new file mode 100644 index 0000000..973570e --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_repeat.h @@ -0,0 +1,54 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_REPEAT_H +#define _VSI_NN_OP_REPEAT_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_repeat_lcl_data +{ + vsi_nn_tensor_t *repeat_tensor; + vsi_nn_tensor_t *reshaped_input; + vsi_nn_tensor_t *reshaped_output; +} vsi_nn_repeat_lcl_data; + +typedef struct _vsi_nn_repeat__param +{ + vsi_nn_repeat_lcl_data* local; + int32_t axis; + int32_t maxlen; // default max repeat number + int32_t* repeat_host; // host repeat array + int32_t repeat_len; // length of host repeat array +} vsi_nn_repeat_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_sequence_mask.h b/src/tim/vx/internal/include/ops/vsi_nn_op_sequence_mask.h new file mode 100644 index 0000000..e8bb2f1 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_sequence_mask.h @@ -0,0 +1,43 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_SEQUENCE_MASK_H +#define _VSI_NN_OP_SEQUENCE_MASK_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_sequence_mask__param +{ + int32_t maxlen; +} vsi_nn_sequence_mask_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_strided_slice.h b/src/tim/vx/internal/include/ops/vsi_nn_op_strided_slice.h index ec5b6c9..d7bb3c7 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_strided_slice.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_strided_slice.h @@ -32,6 +32,22 @@ extern "C" { #endif +typedef struct _strided_slice_param +{ + int32_t *begin_dims; + int32_t begin_dims_num; + int32_t *end_dims; + int32_t end_dims_num; + int32_t *stride_dims; + int32_t stride_dims_num; + int32_t begin_mask; + int32_t end_mask; + int32_t shrink_axis_mask; + int32_t new_axis_mask; + + int32_t num_add_axis; +} strided_slice_param; + typedef struct _vsi_nn_strided_slice_lcl_data2 { vsi_nn_link_list_t link_list; @@ -55,6 +71,8 @@ typedef struct _vsi_nn_strided_slice_lcl_data2 vsi_bool is_dataconvert_op; vsi_bool is_optimized; + + strided_slice_param params; } vsi_nn_strided_slice_lcl_data2; typedef struct _vsi_nn_strided_slice_lcl_data_t @@ -78,6 +96,7 @@ typedef struct _vsi_nn_strided_slice_param vx_int32 begin_mask; vx_int32 end_mask; vx_int32 shrink_axis_mask; + int32_t new_axis_mask; vsi_nn_strided_slice_lcl_data2 * lcl2_data; } vsi_nn_strided_slice_param; diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_upsample.h b/src/tim/vx/internal/include/ops/vsi_nn_op_upsample.h index 112f633..ef191bf 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_upsample.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_upsample.h @@ -34,7 +34,7 @@ extern "C" { typedef struct _vsi_nn_upsample_lcl_data { - vx_tensor local_tensor[_VSI_NN_POOLWITHARGMAX_LOCAL_TENSOR_NUM]; + vx_tensor local_tensor[_VSI_NN_UPSAMPLE_LOCAL_TENSOR_NUM]; } vsi_nn_upsample_lcl_data; typedef struct _vsi_nn_upsample_param diff --git a/src/tim/vx/internal/include/utils/vsi_nn_constraint_check.h b/src/tim/vx/internal/include/utils/vsi_nn_constraint_check.h index 3bb7c5d..a491adc 100644 --- a/src/tim/vx/internal/include/utils/vsi_nn_constraint_check.h +++ b/src/tim/vx/internal/include/utils/vsi_nn_constraint_check.h @@ -119,7 +119,7 @@ vsi_bool is_item_in_array enum { NAME##_INPUT_COUNT = INPUT_COUNT, \ NAME##_OUTPUT_COUNT = OUTPUT_COUNT, \ NAME##_IO_COUNT = NAME##_INPUT_COUNT + NAME##_OUTPUT_COUNT}; \ -static const struct {vsi_nn_type_e types[NAME##_IO_COUNT];} \ +static const struct {int types[NAME##_IO_COUNT];} \ NAME##_supported_io_types[] = { #define DECL_OP_CONSTRAINT_REG(NAME) \ diff --git a/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h b/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h index d85fafd..334c7a0 100644 --- a/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h +++ b/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h @@ -438,6 +438,7 @@ static inline vsi_status float32_to_dtype case VSI_NN_TYPE_UINT8: case VSI_NN_TYPE_INT16: case VSI_NN_TYPE_INT32: + case VSI_NN_TYPE_UINT32: { int32_t dst_value = 0; switch( dst_dtype->qnt_type ) diff --git a/src/tim/vx/internal/include/vsi_nn_graph.h b/src/tim/vx/internal/include/vsi_nn_graph.h index 584bdd8..ffb5dd0 100644 --- a/src/tim/vx/internal/include/vsi_nn_graph.h +++ b/src/tim/vx/internal/include/vsi_nn_graph.h @@ -165,6 +165,8 @@ struct _vsi_nn_graph 
* so please keep it NULL.*/ vsi_nn_tensor_t* tensor; } complete_signal; + + vsi_bool isAllowFastMode; }; /** @@ -716,6 +718,16 @@ OVXLIB_API vsi_status vsi_nn_SetGraphPriority uint32_t priority ); +OVXLIB_API vsi_status vsi_nn_SetGraphFastMode + ( + vsi_nn_graph_t* graph, + vsi_bool fastmode + ); + +OVXLIB_API vsi_bool vsi_nn_IsGraphFastMode + ( + const vsi_nn_graph_t* graph + ); #ifdef __cplusplus } #endif diff --git a/src/tim/vx/internal/include/vsi_nn_node_type.h b/src/tim/vx/internal/include/vsi_nn_node_type.h index 89cd104..f9a4606 100644 --- a/src/tim/vx/internal/include/vsi_nn_node_type.h +++ b/src/tim/vx/internal/include/vsi_nn_node_type.h @@ -164,6 +164,12 @@ #include "ops/vsi_nn_op_resize_1d_bilinear_internal.h" #include "ops/vsi_nn_op_resize_1d_nearest_internal.h" #include "ops/vsi_nn_op_upsamplescale.h" +#include "ops/vsi_nn_op_groupnormalize.h" +#include "ops/vsi_nn_op_sequence_mask.h" +#include "ops/vsi_nn_op_repeat.h" +#include "ops/vsi_nn_op_one_hot.h" +#include "ops/vsi_nn_op_nms.h" +#include "ops/vsi_nn_op_grouped_conv1d.h" /* custom node head define define */ #include "custom/vsi_nn_custom_node_type.h" @@ -314,6 +320,12 @@ typedef union _vsi_nn_nn_param vsi_nn_resize_1d_bilinear_internal_param resize_1d_bilinear_internal; vsi_nn_resize_1d_nearest_internal_param resize_1d_nearest_internal; vsi_nn_upsamplescale_param upsamplescale; + vsi_nn_groupnormalize_param groupnorm; + vsi_nn_sequence_mask_param sequence_mask; + vsi_nn_repeat_param repeat; + vsi_nn_one_hot_param one_hot; + vsi_nn_nms_param nms; + vsi_nn_grouped_conv1d_param grouped_conv1d; uint8_t client_param[128]; /* custom node data struct define */ diff --git a/src/tim/vx/internal/include/vsi_nn_version.h b/src/tim/vx/internal/include/vsi_nn_version.h index da62e48..5e544c2 100644 --- a/src/tim/vx/internal/include/vsi_nn_version.h +++ b/src/tim/vx/internal/include/vsi_nn_version.h @@ -33,7 +33,7 @@ extern "C"{ #define VSI_NN_VERSION_MAJOR 1 #define VSI_NN_VERSION_MINOR 1 -#define VSI_NN_VERSION_PATCH 30 +#define VSI_NN_VERSION_PATCH 32 #define VSI_NN_VERSION \ (VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_topk.vx b/src/tim/vx/internal/src/custom/ops/kernel/cl/custom_softmax.cl similarity index 63% rename from src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_topk.vx rename to src/tim/vx/internal/src/custom/ops/kernel/cl/custom_softmax.cl index fdacd41..05587c8 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_topk.vx +++ b/src/tim/vx/internal/src/custom/ops/kernel/cl/custom_softmax.cl @@ -1,6 +1,4 @@ -#include "cl_viv_vx_ext.h" - -__kernel void vxcTopk( +__kernel void testop( __read_only image2d_array_t input, __write_only image2d_array_t output) { diff --git a/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_softmax_cpu.c b/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_softmax_cpu.c new file mode 100644 index 0000000..abedba1 --- /dev/null +++ b/src/tim/vx/internal/src/custom/ops/kernel/cpu/custom_softmax_cpu.c @@ -0,0 +1,194 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of 
the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_test.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vsi_nn_vxkernel.h" +//#include "libnnext/vx_lib_nnext.h" + +#define _CPU_ARG_NUM (1) +#define _CPU_INPUT_NUM (1) +#define _CPU_OUTPUT_NUM (1) +#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) +#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) +#define _KERNEL_NAME ("com.vivantecorp.extension.CustomSoftmaxVXC") + +#define SCALAR_INPUT_AXIS (2) + +__BEGIN_DECLS + +DEF_KERNEL_EXECUTOR(_softmax_exec) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t* param, + size_t param_size + ) +{ + vsi_status status = VX_SUCCESS; + float* buffer[_CPU_IO_NUM] = { NULL }; + vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; + vsi_nn_kernel_tensor_attr_t* attr[_CPU_IO_NUM] = { NULL }; + uint32_t i = 0; + uint32_t out_elements; + int32_t sf_axis; + float fMax = 0.0; + float fProbSum = 0.0f; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &sf_axis); + CHECK_STATUS_FAIL_GOTO(status, final ); + + out_elements = (uint32_t)vsi_nn_kernel_tensor_attr_get_size( attr[1] ); + + /* alloc the float32 data buffer */ + buffer[1] = (float *)malloc(out_elements * sizeof(float)); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final ); + memset(buffer[1], 0, out_elements * sizeof(float)); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final ); + + /* Softmax implement */ + for ( i = 0; i < out_elements; i++) + { + fMax = buffer[0][i] > fMax ? 
buffer[0][i] : fMax; + } + + for ( i = 0; i < out_elements; i++) + { + buffer[1][i] = (float)expf(buffer[0][i] - fMax); + fProbSum += buffer[1][i]; + } + for ( i = 0; i < out_elements; i++) + { + buffer[1][i] = buffer[1][i] / fProbSum; + } + status = vsi_nn_kernel_tensor_write_from_float( + tensors[1], attr[1], buffer[1], out_elements ); + +final: + for( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if( buffer[i] ) + { + free( buffer[i] ); + } + vsi_nn_kernel_tensor_attr_release( &attr[i] ); + } + return status; +} + +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED} +}; + +static const vx_kernel_description_t _kernel_info = +{ + KERNEL_ID_PLACEHOLDER, + _KERNEL_NAME, + _softmax_exec, + kernel_param_def, + _cnt_of_array( kernel_param_def ), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel + ) +{ + memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) ); + return VSI_SUCCESS; +} + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_SUCCESS; + vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t axis = 0; + + axis = vsi_nn_kernel_param_get_int32(params, "axis"); + + status = _query_kernel( inputs, outputs, kernel ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, + inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); + backend_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create( + graph, I32, &axis ); + + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &backend_params[SCALAR_INPUT_AXIS] ); + } + else + { + status = VSI_FAILURE; + } + } + return node; +} /* _setup() */ + + +__END_DECLS + +REGISTER_BACKEND_CPU( custom_softmax, _setup ) diff --git a/src/tim/vx/internal/src/custom/ops/vx/vsi_nn_kernel_custom_softmax.vx b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_softmax.vx similarity index 87% rename from src/tim/vx/internal/src/custom/ops/vx/vsi_nn_kernel_custom_softmax.vx rename to src/tim/vx/internal/src/custom/ops/kernel/evis/custom_softmax.vx index fce529d..305f666 100644 --- a/src/tim/vx/internal/src/custom/ops/vx/vsi_nn_kernel_custom_softmax.vx +++ b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_softmax.vx @@ -34,6 +34,7 @@ __kernel void Softmax2VXC } float fProbSum = 0.0f; + vxc_short8 dst; for (int i = 0; i < sf_size; i++) { vxc_char8 val; @@ -47,7 +48,8 @@ __kernel void Softmax2VXC fProbSum += fOut; half hVal; _viv_asm(CONV,hVal,fOut); - VXC_WriteImage2DArray(output, coord_in, hVal, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY,dst,hVal, 4); + VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); } for (int i = 0; i < sf_size; i++) @@ -63,7 +65,8 @@ __kernel void Softmax2VXC float fOut =fval/fProbSum; half hVal; _viv_asm(CONV,hVal,fOut); - VXC_WriteImage2DArray(output, coord_in, hVal, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY,dst,hVal, 4); + VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); } } diff --git a/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_softmax_evis.c b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_softmax_evis.c new file mode 100644 index 0000000..34d679b --- /dev/null +++ b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_softmax_evis.c @@ -0,0 +1,202 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_test.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vsi_nn_vxkernel.h" +//#include "libnnext/vx_lib_nnext.h" + +#define _CPU_ARG_NUM (1) +#define _CPU_INPUT_NUM (1) +#define _CPU_OUTPUT_NUM (1) +#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) +#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) +#define _KERNEL_NAME ("com.vivantecorp.extension.Softmax2VXC") + +#define SCALAR_INPUT_AXIS (2) + +__BEGIN_DECLS + +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED} +}; +#define _EVIS_PARAM_NUM _cnt_of_array(kernel_param_def) + +DEF_KERNEL_INITIALIZER(_softmax_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t* param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + int sf_size = 0; + vsi_nn_kernel_tensor_attr_t* attr = NULL; + // Alignment with a power of two value. + gpu_param_t gpu_param = { + 2, // workdim + {0, 0, 0}, // global_offset: control the start location be processed in the image + {0, 0, 0}, // global_scale: how many pixels could be processed by a single thread + {0, 0, 0}, // local_size: local group size in thread + {0, 0, 0}}; // global_size: image size in thread + + attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + if (!attr) + { + VSILOGE("Query failure! 
at line"); + return status; + } + + sf_size = attr->shape->data[0]; + + gpu_param.global_offset[0] = 0; + gpu_param.global_offset[1] = 0; + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.local_size[0] = 1; + gpu_param.local_size[1] = 1; + gpu_param.global_size[0] = + gpu_align_p2((1 + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], + gpu_param.local_size[0]); + gpu_param.global_size[1] = + gpu_align_p2((1 + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1], + gpu_param.local_size[1]); + { + gpu_dp_inst_t Uni4x4_Fp16ToFp32 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param( node, + "Uni4x4_Fp16ToFp32", &Uni4x4_Fp16ToFp32 ); + vsi_nn_kernel_gpu_add_param(node, + "sf_size", &sf_size); + } + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + + if(status != VSI_SUCCESS) + { + VSILOGE("Initializer failure!"); + } + if (attr) vsi_nn_kernel_tensor_attr_release( &attr ); + + return status; +} + +static const vx_kernel_description_t _kernel_info = +{ + KERNEL_ID_PLACEHOLDER, + _KERNEL_NAME, + NULL, + kernel_param_def, + _cnt_of_array( kernel_param_def ), + vsi_nn_KernelValidator, + NULL, + NULL, + _softmax_initializer, + vsi_nn_KernelDeinitializer +}; + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel + ) +{ + memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) ); + + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + "custom_softmax" ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + "custom_softmax" ); + return VSI_SUCCESS; +} + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_SUCCESS; + vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t axis = 0; + + axis = vsi_nn_kernel_param_get_int32(params, "axis"); + + status = _query_kernel( inputs, outputs, kernel ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, + inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); + backend_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create( + graph, I32, &axis ); + + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &backend_params[SCALAR_INPUT_AXIS] ); + } + else + { + status = VSI_FAILURE; + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( custom_softmax, _setup ) diff --git a/src/tim/vx/internal/src/custom/ops/kernel/vsi_nn_kernel_custom_softmax.c b/src/tim/vx/internal/src/custom/ops/kernel/vsi_nn_kernel_custom_softmax.c deleted file mode 100644 index 0230420..0000000 --- a/src/tim/vx/internal/src/custom/ops/kernel/vsi_nn_kernel_custom_softmax.c +++ /dev/null @@ -1,231 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. 
-* -*****************************************************************************/ -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_platform.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_node.h" -#include "vsi_nn_log.h" -#include "vsi_nn_test.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "utils/vsi_nn_dtype_util.h" -#include "client/vsi_nn_vxkernel.h" -#include "libnnext/vx_lib_nnext.h" - -#define _VX_KERNEL_ID VX_KERNEL_ID(CUSTOM_SOFTMAX) -#define _VX_KERNEL_VAR_CPU (vx_client_kernel_CUSTOM_SOFTMAX_CPU) -#define _VX_KERNEL_VAR_VX (vx_client_kernel_CUSTOM_SOFTMAX_VX) -#define _VX_KERNEL_NAME ("com.vivantecorp.extension.CustomSoftmaxVXC") -#define _VX_KERNEL_FUNC_KERNEL (vxCustomSoftmaxKernel) - -static vsi_status VX_CALLBACK vxCustomSoftmaxKernel - ( - vx_node node, - const vx_reference* paramObj, - uint32_t paramNum - ) -{ - vsi_status status = VX_SUCCESS; - vx_tensor input = NULL,output = NULL; - float *f32_in_buffer = NULL,*f32_out_buffer=NULL; - vx_context context = NULL; - vsi_nn_tensor_attr_t in_attr,out_attr; - uint32_t i,in_elements,out_elements; - int32_t sf_axis; - float fMax = 0.0; - float fProbSum = 0.0f; - - context = vxGetContext((vx_reference)node); - input = (vx_tensor)paramObj[0]; - output = (vx_tensor)paramObj[1]; - vxCopyScalar((vx_scalar)paramObj[2], &(sf_axis),VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - - /* Fill input & output attribute data struct */ - status = vsi_nn_vxGetTensorAttr(input, &in_attr); - TEST_CHECK_STATUS(status, final); - status = vsi_nn_vxGetTensorAttr(output, &out_attr); - TEST_CHECK_STATUS(status, final); - - in_elements = vsi_nn_vxGetTensorElementNum(&in_attr); - out_elements = vsi_nn_vxGetTensorElementNum(&out_attr); - - /* alloc the float32 data buffer */ - f32_in_buffer = (float *)malloc(in_elements * sizeof(float)); - f32_out_buffer= (float *)malloc(out_elements * sizeof(float)); - memset(f32_in_buffer, 0, in_elements * sizeof(float)); - memset(f32_out_buffer, 0, out_elements * sizeof(float)); - - /* Copy tensor to buffer, and convert bufer to float32 format */ - status = vsi_nn_vxConvertTensorToFloat32Data( - context, input, &in_attr, f32_in_buffer, in_elements * sizeof(float)); - TEST_CHECK_STATUS(status, final); - - /* Softmax implement */ - for ( i = 0; i < out_elements; i++) - { - fMax = f32_in_buffer[i] > fMax ? f32_in_buffer[i] : fMax; - } - - for ( i = 0; i < out_elements; i++) - { - f32_out_buffer[i] = (float)expf(f32_in_buffer[i] - fMax); - fProbSum += f32_out_buffer[i]; - } - for ( i = 0; i < out_elements; i++) - { - f32_out_buffer[i] = f32_out_buffer[i]/ fProbSum; - } - status = vsi_nn_vxConvertFloat32DataToTensor( - context, output, &out_attr, f32_out_buffer, out_elements * sizeof(float)); - -final: - if(f32_in_buffer)free(f32_in_buffer); - if(f32_out_buffer)free(f32_out_buffer); - return status; -} - -static vx_status VX_CALLBACK vxCustomSoftmaxInitializer - ( - vx_node nodObj, - const vx_reference *paramObj, - vx_uint32 paraNum - ) -{ - vx_status status = VX_SUCCESS; - /*TODO: Add initial code for VX program*/ - // Alignment with a power of two value. 
-#define gcmALIGN(n, align) ((n) + ((align) - 1)) & ~((align) - 1) - vx_kernel_execution_parameters_t shaderParam = { - 2, // workdim - {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image - {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread - {0, 0, 0}, // localWorkSize: local group size in thread - {0, 0, 0}}; // globalWorkSize: image size in thread - int input_size[6] = {1, 1, 1, 1, 1, 1}; - int sf_size; - uint32_t input_dims; - uint32_t i; - vsi_nn_tensor_attr_t input_attr; - - memset(&input_attr, 0, sizeof(vsi_nn_tensor_attr_t)); - - status = vsi_nn_vxGetTensorAttr((vx_tensor)paramObj[0], &input_attr); - if (status != VX_SUCCESS) - { - VSILOGE("vsi_nn_vxGetTensorAttr failure! at line %d\n", __LINE__); - return status; - } - - input_dims = input_attr.dim_num; - for (i = 0; i < input_dims; i++) - { - input_size[i] = input_attr.size[i]; - } - - sf_size = input_size[0]; - - shaderParam.globalWorkOffset[0] = 0; - shaderParam.globalWorkOffset[1] = 0; - shaderParam.globalWorkScale[0] = 1; - shaderParam.globalWorkScale[1] = 1; - shaderParam.localWorkSize[0] = 1; - shaderParam.localWorkSize[1] = 1; - shaderParam.globalWorkSize[0] = - gcmALIGN((1 + shaderParam.globalWorkScale[0] - 1) / shaderParam.globalWorkScale[0], shaderParam.localWorkSize[0]); - shaderParam.globalWorkSize[1] = - gcmALIGN((1 + shaderParam.globalWorkScale[1] - 1) / shaderParam.globalWorkScale[1], shaderParam.localWorkSize[1]); - { - vx_uint32 Uni4x4_Fp16ToFp32[16] = { - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00010000, 0x00030002, // ABin - 0x02020202, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant - }; - - vxSetNodeUniform(nodObj, "Uni4x4_Fp16ToFp32", 1, Uni4x4_Fp16ToFp32); - vxSetNodeUniform(nodObj, "sf_size", 1, &sf_size); - } - status |= vxSetNodeAttribute(nodObj, VX_NODE_ATTRIBUTE_KERNEL_EXECUTION_PARAMETERS, - &shaderParam, sizeof(vx_kernel_execution_parameters_t)); - - if(status < 0) - { - VSILOGE("Initializer failure!"); - } - - return status; -} - -static vx_param_description_t s_params[] = -{ - { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, - { VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, - { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, -}; - -#ifdef __cplusplus -extern "C" { -#endif -vx_kernel_description_t _VX_KERNEL_VAR_CPU = -{ - _VX_KERNEL_ID, - _VX_KERNEL_NAME, - _VX_KERNEL_FUNC_KERNEL, - s_params, - _cnt_of_array( s_params ), - vsi_nn_KernelValidator, - NULL, - NULL, - vsi_nn_KernelInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t _VX_KERNEL_VAR_VX = -{ - _VX_KERNEL_ID, - _VX_KERNEL_NAME, - NULL, - s_params, - _cnt_of_array( s_params ), - vsi_nn_KernelValidator, - NULL, - NULL, - vxCustomSoftmaxInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t * vx_kernel_CUSTOM_SOFTMAX_list[] = -{ - &_VX_KERNEL_VAR_CPU, - &_VX_KERNEL_VAR_VX, - NULL -}; -#ifdef __cplusplus -} -#endif diff --git a/src/tim/vx/internal/src/custom/ops/op_custom_softmax.c b/src/tim/vx/internal/src/custom/ops/op_custom_softmax.c new file mode 100644 index 0000000..3aa9835 --- /dev/null +++ b/src/tim/vx/internal/src/custom/ops/op_custom_softmax.c @@ -0,0 +1,102 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, 
to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_log.h" +#include "kernel/vsi_nn_kernel.h" + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_kernel_param_t * param = NULL; + vsi_nn_custom_softmax_param * p; + p = &(self->nn_param.custom_softmax); + + param = vsi_nn_kernel_param_create(); + vsi_nn_kernel_param_add_int32( param, "axis", p->axis ); + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + "custom_softmax", + inputs, 1, + outputs, 1, param ); + + vsi_nn_kernel_param_release( ¶m ); + + return VSI_SUCCESS; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check params. 
*/ + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * node, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + memmove(outputs[0]->attr.size, inputs[0]->attr.size, + inputs[0]->attr.dim_num * sizeof(uint32_t)); + } + return TRUE; +} /* op_setup() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ CUSTOM_SOFTMAX, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ 1, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_softmax.c b/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_softmax.c deleted file mode 100644 index 215334e..0000000 --- a/src/tim/vx/internal/src/custom/ops/vsi_nn_op_custom_softmax.c +++ /dev/null @@ -1,299 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. 
-* -*****************************************************************************/ -#include -#include "vsi_nn_types.h" -#include "vsi_nn_platform.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_node.h" -#include "vsi_nn_ops.h" -#include "vsi_nn_log.h" -#include "client/vsi_nn_vxkernel.h" - -#define _ARG_NUM (1) -#define _INPUT_NUM (1) -#define _OUTPUT_NUM (1) -#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) -#define _PARAM_NUM (_ARG_NUM + _IO_NUM) - -extern vx_kernel_description_t * vx_kernel_CUSTOM_SOFTMAX_list[]; - -static void _set_inputs_outputs - ( - vx_reference * params, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - uint32_t i; - uint32_t cnt; - - /* Set inputs */ - cnt = 0; - for( i = 0; i < _INPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)inputs[i]->t; - } - - /* Set outputs */ - for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)outputs[i]->t; - } -} /* _set_inputs_outputs() */ - -static vsi_status _create_params - ( - vsi_nn_node_t * node, - vx_reference * params, - uint32_t num - ) -{ - vsi_status status; - vx_context ctx; - vsi_nn_custom_softmax_param * p; - if( 0 == num ) - { - return VSI_SUCCESS; - } - memset( params, 0, sizeof( vx_reference * ) * num ); - p = &(node->nn_param.custom_softmax); - ctx = vxGetContext( (vx_reference)node->graph->g ); - /* Init parameters */ -#define _SET_PARAM( i, type, arg ) do{ \ - params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \ - status = vxGetStatus( params[i] ); \ - if( VSI_SUCCESS != status ) { \ - goto set_param_error; \ - } \ - } while(0) - _SET_PARAM( 0, VX_TYPE_INT32, axis ); -#undef _SET_PARAM -set_param_error: - - return status; -} /* _create_params */ - -static void _release_params - ( - vx_reference * params, - uint32_t num - ) -{ - uint32_t i; - vx_scalar scalar; - for( i = 0; i < num; i ++ ) - { - scalar = (vx_scalar)params[i]; - vxReleaseScalar( &scalar ); - } -} /* _release_params() */ - -static vsi_status cpu_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_reference * args; - - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( params, inputs, outputs ); - - /* Init parameters. */ - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. */ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); - - return status; -} - -static vsi_status vx_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_reference * args; - //vsi_nn_tensor_attr_t attr; - - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( params, inputs, outputs ); - - /*TODO: Add code if need to change your parameter*/ - /* Init parameters. */ - _create_params( self, args, _ARG_NUM ); -#if 0 - memcpy(&attr, &(inputs[0]->attr), sizeof(vsi_nn_tensor_attr_t)); - attr.size[0] = attr.size[0]; - attr.size[1] = 1; - attr.dim_num = 2; - params[0] = (vx_reference)vxReshapeTensor(inputs[0]->t, (int32_t*)(attr.size), attr.dim_num); - params[1] = (vx_reference)vxReshapeTensor(outputs[0]->t, (int32_t*)(attr.size), attr.dim_num); -#endif - /* Init parameters. 
*/ - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. */ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); -#if 0 - vxReleaseTensor((vx_tensor*)¶ms[0]); - vxReleaseTensor((vx_tensor*)¶ms[1]); -#endif - return status; -} - -static vsi_nn_op_compute_t op_compute_list[] = -{ - cpu_op_compute, - vx_op_compute, - NULL -}; - -static vsi_status op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status; - vsi_nn_kernel_info_t kernel_info; - char *path = NULL; - - memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); - status = VSI_FAILURE; - kernel_info.type = VX_KERNEL_TYPE_CPU; - kernel_info.kernel = vx_kernel_CUSTOM_SOFTMAX_list; - kernel_info.resource_num = 1; - kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); - kernel_info.resource_name[0] = "vsi_nn_kernel_custom_softmax"; - path = getenv("USER_VX_SOURCE_PATH"); - if(path) - { - vsi_nn_VxResourceSetPath(path); - } - - if( kernel_info.type == VX_KERNEL_TYPE_VX) - { - kernel_info.kernel_index = 1; - kernel_info.init_index = 1; - } - else /*kernel_info.type = VX_KERNEL_TYPE_CPU;*/ - { - kernel_info.kernel_index = 0; - kernel_info.init_index = 0; - } - - self->n = vsi_nn_RegisterClientKernelAndNewNode( - self->graph, &kernel_info); - if (kernel_info.resource_name) - { - free(kernel_info.resource_name); - } - if( NULL == self->n ) - { - return VSI_FAILURE; - } - if (NULL != op_compute_list[kernel_info.init_index]) - { - status = op_compute_list[kernel_info.init_index](self, inputs, outputs); - } - - return status; -} /* op_compute() */ - -static vsi_bool op_check - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - /*TODO: Check input tensor shapes. */ - return TRUE; -} /* op_check() */ - -static vsi_bool op_setup - ( - vsi_nn_node_t * node, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - /* TODO: Compute output tensor shape. 
*/ - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) - { - outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; - outputs[0]->attr.size[0] = inputs[0]->attr.size[0]; - outputs[0]->attr.size[1] = inputs[0]->attr.size[1]; - outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; - outputs[0]->attr.size[3] = inputs[0]->attr.size[3]; - } - return TRUE; -} /* op_setup() */ - -#ifdef __cplusplus -extern "C" { -#endif -/* Registrar */ -DEF_OP_REG - ( - /* op_name */ CUSTOM_SOFTMAX, - /* init */ NULL, - /* compute */ op_compute, - /* deinit */ vsi_nn_op_common_deinit, - /* check */ op_check, - /* setup */ op_setup, - /* optimize */ NULL, - /* input_num */ _INPUT_NUM, - /* output_num */ _OUTPUT_NUM - ); -#ifdef __cplusplus -} -#endif diff --git a/src/tim/vx/internal/src/kernel/cl/argmax_cl.c b/src/tim/vx/internal/src/kernel/cl/argmax_cl.c index 6311201..5855db8 100644 --- a/src/tim/vx/internal/src/kernel/cl/argmax_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/argmax_cl.c @@ -183,26 +183,31 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype; vsi_status status = VSI_FAILURE; uint32_t key; - int i; + int32_t i; input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - if(input_dtype == I8) + if (input_dtype == I8) { input_dtype = I32; } + if (output_dtype == I16) + { + output_dtype = I32; + } + key = HASH_ARGMAX_KEY( axis, input_dtype, output_dtype, image_2d ); - for( i = 0; i < _cnt_of_array(kernel_map); i ++ ) + for ( i = 0; i < _cnt_of_array(kernel_map); i ++ ) { - if( kernel_map[i].key == key ) + if ( kernel_map[i].key == key ) { break; } } - if( i < _cnt_of_array(kernel_map) ) + if ( i < _cnt_of_array(kernel_map) ) { snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); kernel->info.parameters = kernel_param_def; @@ -237,7 +242,7 @@ static vsi_nn_kernel_node_t _setup axis = vsi_nn_kernel_param_get_int32(params, "axis"); - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num ) || !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num ) @@ -250,11 +255,11 @@ static vsi_nn_kernel_node_t _setup image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1); status = _query_kernel( inputs, outputs, axis, image_2d, kernel ); - if( VSI_SUCCESS == status) + if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) + if ( node ) { vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM, inputs, 1, outputs, 1 ); diff --git a/src/tim/vx/internal/src/kernel/cl/argmin_cl.c b/src/tim/vx/internal/src/kernel/cl/argmin_cl.c index 7afa3b6..399e496 100644 --- a/src/tim/vx/internal/src/kernel/cl/argmin_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/argmin_cl.c @@ -183,20 +183,26 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype; vsi_status status = VSI_FAILURE; uint32_t key; - int i; + int32_t i; input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (output_dtype == I16) + { + output_dtype = I32; + } + key = HASH_ARGMIN_KEY( axis, input_dtype, output_dtype, image_2d ); - for( i = 0; i < _cnt_of_array(kernel_map); i ++ ) + for ( i = 0; i < _cnt_of_array(kernel_map); i ++ ) { - if( kernel_map[i].key == key ) + if ( kernel_map[i].key == key ) { break; } } - if( i < 
_cnt_of_array(kernel_map) ) + if ( i < _cnt_of_array(kernel_map) ) { snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); kernel->info.parameters = kernel_param_def; @@ -231,7 +237,7 @@ static vsi_nn_kernel_node_t _setup axis = vsi_nn_kernel_param_get_int32(params, "axis"); - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num ) || !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num ) @@ -244,11 +250,11 @@ static vsi_nn_kernel_node_t _setup image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1); status = _query_kernel( inputs, outputs, axis, image_2d, kernel ); - if( VSI_SUCCESS == status) + if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) + if ( node ) { vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM, inputs, 1, outputs, 1 ); diff --git a/src/tim/vx/internal/src/kernel/cl/cast_cl.c b/src/tim/vx/internal/src/kernel/cl/cast_cl.c index d89849a..112cfca 100644 --- a/src/tim/vx/internal/src/kernel/cl/cast_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/cast_cl.c @@ -186,7 +186,7 @@ static vsi_status _query_kernel { in_dtype = F32; } - else if ((I8 == in_dtype) || (I16 == in_dtype)) + else if ((I8 == in_dtype) || (BOOL8 == in_dtype) || (I16 == in_dtype)) { in_dtype = I32; } diff --git a/src/tim/vx/internal/src/kernel/cl/comparisons_cl.c b/src/tim/vx/internal/src/kernel/cl/comparisons_cl.c index 856042d..62bb0f4 100644 --- a/src/tim/vx/internal/src/kernel/cl/comparisons_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/comparisons_cl.c @@ -289,6 +289,12 @@ static vsi_status _query_kernel input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); input1_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_NONE && output_dtype == I8) + { + output_dtype = BOOL8; + } + key = HASH_COMPARISONS_KEY( operation, input0_dtype, input1_dtype, output_dtype, image_2d ); for( i = 0; i < _cnt_of_array(_comparisons_cl_kernel_map); i ++ ) diff --git a/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c b/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c index c0de129..6b0d6d5 100644 --- a/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c @@ -48,6 +48,7 @@ typedef enum UNARY_NEG, UNARY_HSIGMOID, UNARY_MISH, + UNARY_ROUND, } unary_type_e; /* @@ -91,7 +92,8 @@ typedef enum #define ELU_OPERATION elu #define NEG_OPERATION neg #define HSIGMOID_OPERATION hard_sigmoid -#define MISH_OPERATION mish +#define MISH_OPERATION mish +#define ROUND_OPERATION round static const struct { uint32_t key; @@ -113,6 +115,8 @@ static const struct { TENSOR_UNARY_KERNELS_FLOAT(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, F16) TENSOR_UNARY_KERNELS_FLOAT(MISH_OPERATION, UNARY_MISH, F32, F32) TENSOR_UNARY_KERNELS_FLOAT(MISH_OPERATION, UNARY_MISH, F16, F16) + TENSOR_UNARY_KERNELS_FLOAT(ROUND_OPERATION, UNARY_ROUND, F32, F32) + TENSOR_UNARY_KERNELS_FLOAT(ROUND_OPERATION, UNARY_ROUND, F16, F16) TENSOR_UNARY_KERNELS_FLOAT_2D(SIN_OPERATION, UNARY_SIN, F32, F32) TENSOR_UNARY_KERNELS_FLOAT_2D(SIN_OPERATION, UNARY_SIN, F16, F16) @@ -128,6 +132,8 @@ static const struct { TENSOR_UNARY_KERNELS_FLOAT_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, F16, F16) TENSOR_UNARY_KERNELS_FLOAT_2D(MISH_OPERATION, 
UNARY_MISH, F32, F32) TENSOR_UNARY_KERNELS_FLOAT_2D(MISH_OPERATION, UNARY_MISH, F16, F16) + TENSOR_UNARY_KERNELS_FLOAT_2D(ROUND_OPERATION, UNARY_ROUND, F32, F32) + TENSOR_UNARY_KERNELS_FLOAT_2D(ROUND_OPERATION, UNARY_ROUND, F16, F16) TENSOR_UNARY_KERNELS(SIN_OPERATION, UNARY_SIN, U8, U8) TENSOR_UNARY_KERNELS(EXP_OPERATION, UNARY_EXP, U8, U8) @@ -136,6 +142,7 @@ static const struct { TENSOR_UNARY_KERNELS(NEG_OPERATION, UNARY_NEG, U8, U8) TENSOR_UNARY_KERNELS(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, U8) TENSOR_UNARY_KERNELS(MISH_OPERATION, UNARY_MISH, U8, U8) + TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, U8, U8) TENSOR_UNARY_KERNELS_2D(SIN_OPERATION, UNARY_SIN, U8, U8) TENSOR_UNARY_KERNELS_2D(EXP_OPERATION, UNARY_EXP, U8, U8) @@ -144,6 +151,7 @@ static const struct { TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, U8, U8) TENSOR_UNARY_KERNELS_2D(HSIGMOID_OPERATION, UNARY_HSIGMOID, U8, U8) TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, U8, U8) + TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, U8, U8) TENSOR_UNARY_KERNELS(NEG_OPERATION, UNARY_NEG, I32, I32) @@ -157,6 +165,7 @@ static const struct { #undef NEG_OPERATION #undef HSIGMOID_OPERATION #undef MISH_OPERATION +#undef ROUND_OPERATION /* * Kernel params */ @@ -407,5 +416,5 @@ REGISTER_ELTWISE_UNARY_BACKEND_CL( elu, UNARY_ELU ) REGISTER_ELTWISE_UNARY_BACKEND_CL( neg, UNARY_NEG ) REGISTER_ELTWISE_UNARY_BACKEND_CL( hard_sigmoid, UNARY_HSIGMOID ) REGISTER_ELTWISE_UNARY_BACKEND_CL( mish, UNARY_MISH ) - +REGISTER_ELTWISE_UNARY_BACKEND_CL( round, UNARY_ROUND ) __END_DECLS diff --git a/src/tim/vx/internal/src/kernel/cl/erf_cl.c b/src/tim/vx/internal/src/kernel/cl/erf_cl.c new file mode 100644 index 0000000..e817d19 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/erf_cl.c @@ -0,0 +1,328 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +#define HASH_UNARY_KEY(_input_type, _output_type, _image_2d) \ + ( (_input_type << 12) | (_output_type << 4) | (_image_2d)) + + #define VSI_NN_GEN_UNARY_KERNEL_SOURCE_NAME() \ + "erf" + +#define HASH_UNARY_SH_KERNEL_NAME( SRC_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("cl.erf_"#SRC_TYPE"to"#DST_TYPE) + +#define TENSOR_UNARY_KERNELS(SRC_TYPE, OUT_TYPE) \ + { HASH_UNARY_KEY(SRC_TYPE, OUT_TYPE, 0), \ + HASH_UNARY_SH_KERNEL_NAME(SRC_TYPE, OUT_TYPE), \ + VSI_NN_GEN_UNARY_KERNEL_SOURCE_NAME() }, + +#define HASH_UNARY_SH_KERNEL_2D_NAME(SRC_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("cl.erf_"#SRC_TYPE"to"#DST_TYPE"_2D") + +#define TENSOR_UNARY_KERNELS_2D(SRC_TYPE, OUT_TYPE) \ + { HASH_UNARY_KEY(SRC_TYPE, OUT_TYPE, 1), \ + HASH_UNARY_SH_KERNEL_2D_NAME(SRC_TYPE, OUT_TYPE), \ + VSI_NN_GEN_UNARY_KERNEL_SOURCE_NAME() }, + +#define TENSOR_UNARY_KERNELS_FLOAT(SRC_TYPE, OUT_TYPE) \ + { HASH_UNARY_KEY(SRC_TYPE, OUT_TYPE, 0), \ + HASH_UNARY_SH_KERNEL_NAME(F32, F32), \ + VSI_NN_GEN_UNARY_KERNEL_SOURCE_NAME() }, + +#define TENSOR_UNARY_KERNELS_FLOAT_2D(SRC_TYPE, OUT_TYPE) \ + { HASH_UNARY_KEY(SRC_TYPE, OUT_TYPE, 1), \ + HASH_UNARY_SH_KERNEL_2D_NAME(F32, F32), \ + VSI_NN_GEN_UNARY_KERNEL_SOURCE_NAME() }, + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _erf_kernel_map[] = +{ + // Register kernel here + TENSOR_UNARY_KERNELS_FLOAT(F32, F32) + TENSOR_UNARY_KERNELS_FLOAT(F16, F16) + + TENSOR_UNARY_KERNELS_FLOAT_2D(F32, F32) + TENSOR_UNARY_KERNELS_FLOAT_2D(F16, F16) + + TENSOR_UNARY_KERNELS(U8, U8) + + TENSOR_UNARY_KERNELS_2D(U8, U8) +}; + +/* + * Kernel params + */ +static vx_param_description_t _erf_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define SCALAR_INPUT_SCALE (2) +#define SCALAR_INPUT_TAIL (3) +#define SCALAR_OUTPUT_SCALE (4) +#define SCALAR_OUTPUT_ZP (5) +#define _ERF_PARAM_NUM _cnt_of_array( _erf_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_erf_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0} // globalWorkSize: image size in thread + }; + + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; + vsi_int_array_t * out_shape = NULL; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + out_shape = attr[1]->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.global_size[0] = gpu_align_p2( + (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (out_shape->data[1] + 
gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if ( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(attr[0]); + SAFE_FREE_TENSOR_ATTR(attr[1]); +#undef SAFE_FREE_TENSOR_ATTR + + return status; +} /* _erf_initializer() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool image_2d + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _erf_kernel_map; + size_t kernel_map_size = _cnt_of_array( _erf_kernel_map ); + vx_param_description_t * param_def = _erf_kernel_param_def; + vx_kernel_initialize_f initializer = _erf_initializer; + + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = HASH_UNARY_KEY( in_dtype, out_dtype, image_2d ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _erf_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_ERF_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_tensor_t* rs_tensors[2] = { NULL }; + int32_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + int32_t new_rank = 0; + vsi_bool ret = FALSE; + vsi_bool image_2d = FALSE; + + float inputScale = inputs[0]->attr.dtype.scale; + float inputTail = (float)inputs[0]->attr.dtype.zero_point * inputScale; + float outputScale = outputs[0]->attr.dtype.scale; + float outputZP = (float)outputs[0]->attr.dtype.zero_point + 0.5f; + + ret = vsi_nn_kernel_optimize_element_shape( + (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, + shape, &new_rank ); + if ( ret ) + { + rs_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], (uint32_t*)shape, new_rank ); + rs_tensors[1] = vsi_nn_reshape_tensor( graph, + outputs[0], (uint32_t*)shape, new_rank ); + } + + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)rs_tensors[0]->attr.size, + rs_tensors[0]->attr.dim_num ) ) + { + return NULL; + } + + outputScale = vsi_abs(outputScale) < 1e-5 ? 
0.0f : 1.0f / outputScale; + + image_2d = (rs_tensors[0]->attr.dim_num == 2 || rs_tensors[0]->attr.size[2] == 1); + + status = _query_kernel( kernel, inputs, outputs, image_2d ); + if ( VSI_SUCCESS == status ) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _ERF_PARAM_NUM, + rs_tensors, 1, &rs_tensors[1], 1 ); + + node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create( + graph, F32, &inputScale ); + node_params[SCALAR_INPUT_TAIL] = vsi_nn_kernel_scalar_create( + graph, F32, &inputTail ); + node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( + graph, F32, &outputScale ); + node_params[SCALAR_OUTPUT_ZP] = vsi_nn_kernel_scalar_create( + graph, F32, &outputZP ); + + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _ERF_PARAM_NUM ); + CHECK_STATUS_FAIL_GOTO( status, OnError ); + } + } + +OnError: + if (rs_tensors[0]) + { + vsi_nn_ReleaseTensor( &rs_tensors[0] ); + } + + if (rs_tensors[1]) + { + vsi_nn_ReleaseTensor( &rs_tensors[1] ); + } + + if (node_params[SCALAR_INPUT_SCALE]) + { + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] ); + } + + if (node_params[SCALAR_INPUT_TAIL]) + { + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TAIL] ); + } + + if (node_params[SCALAR_OUTPUT_SCALE]) + { + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] ); + } + + if (node_params[SCALAR_OUTPUT_ZP]) + { + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_ZP] ); + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( erf, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cl/floordiv_cl.c b/src/tim/vx/internal/src/kernel/cl/floordiv_cl.c index 831e27c..a500383 100644 --- a/src/tim/vx/internal/src/kernel/cl/floordiv_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/floordiv_cl.c @@ -68,11 +68,15 @@ static const _kernel_map_type _floordiv_kernel_map[] = // Register kernel here FLOORDIV_KERNELS( F32, F32, F32 ) FLOORDIV_KERNELS( I32, I32, I32 ) + FLOORDIV_KERNELS( I32, I32, U8 ) FLOORDIV_KERNELS( U8, U8, U8 ) + FLOORDIV_KERNELS( U8, I32, U8 ) FLOORDIV_KERNELS_2D( F32, F32, F32 ) FLOORDIV_KERNELS_2D( I32, I32, I32 ) + FLOORDIV_KERNELS_2D( I32, I32, U8 ) FLOORDIV_KERNELS_2D( U8, U8, U8 ) + FLOORDIV_KERNELS_2D( U8, I32, U8 ) }; @@ -311,4 +315,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( floordiv, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/group_normalization_cl.c b/src/tim/vx/internal/src/kernel/cl/group_normalization_cl.c new file mode 100644 index 0000000..f4ecf0e --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/group_normalization_cl.c @@ -0,0 +1,760 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +typedef enum +{ + INTERNAL_KERNEL_SUM_SQR, + INTERNAL_KERNEL_MEAN_VARI, + INTERNAL_KERNEL_NORM, +} _internal_kernel_e; + +#define KERNEL_SOURCE_1 "group_normalization_u8" +#define KERNEL_SOURCE_2 "group_normalization_f32" +#define KERNEL_SOURCE_3 "group_normalization_i32" + +// Add kernel hashtable here +#define HASH_GROUPNORM_SUM_SQR_KERNEL_NAME(SRC0_TYPE) \ + CVIVANTE_NAMESPACE("cl.group_norm_sumsqr_"#SRC0_TYPE) + +#define HASH_GROUPNORM_SUM_SQR_KERNEL_2D_NAME(SRC0_TYPE) \ + CVIVANTE_NAMESPACE("cl.group_norm_sumsqr_"#SRC0_TYPE"_2D") + +#define HASH_GROUPNORM_MEAN_VARI_KERNEL_NAME \ + CVIVANTE_NAMESPACE("cl.group_norm_meanvari") + +#define HASH_GROUPNORM_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("cl.group_norm_"#SRC0_TYPE"to"#DST_TYPE) + +#define HASH_GROUPNORM_KERNEL_2D_NAME(SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("cl.group_norm_"#SRC0_TYPE"to"#DST_TYPE"_2D") + +// Add kernel hashtable here +// sum sqr +#define HASH_GROUPNORM_SUM_SQR_KEY(_input0_type, _output_type, _reshape_flag) \ + ((_input0_type << 24) | (_output_type << 16) | (_reshape_flag << 8)) + +#define TENSOR_GROUPNORM_SUM_SQR_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_GROUPNORM_SUM_SQR_KEY(IN0_TYPE, OUT_TYPE, 0), \ + HASH_GROUPNORM_SUM_SQR_KERNEL_NAME(IN0_TYPE), \ + SOURCE }, + +#define TENSOR_GROUPNORM_SUM_SQR_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_GROUPNORM_SUM_SQR_KEY(IN0_TYPE, OUT_TYPE, 1), \ + HASH_GROUPNORM_SUM_SQR_KERNEL_2D_NAME(IN0_TYPE), \ + SOURCE }, + +#define HASH_GROUPNORM_MEAN_VARI_KEY(_input0_type, _output_type) \ + ((_input0_type << 24) | (_output_type << 16)) + +#define TENSOR_GROUPNORM_MEAN_VARI_KERNELS(SOURCE) \ + { HASH_GROUPNORM_MEAN_VARI_KEY(F32, F32), \ + HASH_GROUPNORM_MEAN_VARI_KERNEL_NAME, \ + SOURCE }, + +// normalization +#define HASH_GROUPNORM_KEY(_input0_type, _output_type, _reshape_flag) \ + ((_input0_type << 24) | (_output_type << 16) | (_reshape_flag << 8)) + +#define TENSOR_GROUPNORM_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_GROUPNORM_KEY(IN0_TYPE, OUT_TYPE, 0), \ + HASH_GROUPNORM_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \ + SOURCE }, + +#define TENSOR_GROUPNORM_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_GROUPNORM_KEY(IN0_TYPE, OUT_TYPE, 1), \ + HASH_GROUPNORM_KERNEL_2D_NAME(IN0_TYPE, OUT_TYPE), \ + SOURCE }, + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _groupnorm_sum_sqr_kernel_map[] = +{ + // Register kernel here + TENSOR_GROUPNORM_SUM_SQR_KERNELS( U8, F32, 
KERNEL_SOURCE_1 ) + TENSOR_GROUPNORM_SUM_SQR_KERNELS_2D( U8, F32, KERNEL_SOURCE_1 ) + TENSOR_GROUPNORM_SUM_SQR_KERNELS( F32, F32, KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_SUM_SQR_KERNELS_2D( F32, F32, KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_SUM_SQR_KERNELS( I32, F32, KERNEL_SOURCE_3 ) + TENSOR_GROUPNORM_SUM_SQR_KERNELS_2D( I32, F32, KERNEL_SOURCE_3 ) +}; + +static const _kernel_map_type _groupnorm_mean_vari_kernel_map[] = +{ + // Register kernel here + TENSOR_GROUPNORM_MEAN_VARI_KERNELS( KERNEL_SOURCE_2 ) +}; + +static const _kernel_map_type _groupnorm_kernel_map[] = +{ + // Register kernel here + TENSOR_GROUPNORM_KERNELS( U8, U8, KERNEL_SOURCE_1 ) + TENSOR_GROUPNORM_KERNELS_2D( U8, U8, KERNEL_SOURCE_1 ) + TENSOR_GROUPNORM_KERNELS( U8, F32, KERNEL_SOURCE_1 ) + TENSOR_GROUPNORM_KERNELS_2D( U8, F32, KERNEL_SOURCE_1 ) + + TENSOR_GROUPNORM_KERNELS( F32, F32, KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_KERNELS_2D( F32, F32, KERNEL_SOURCE_2 ) + + TENSOR_GROUPNORM_KERNELS( I32, I32, KERNEL_SOURCE_3 ) + TENSOR_GROUPNORM_KERNELS_2D( I32, I32, KERNEL_SOURCE_3 ) + TENSOR_GROUPNORM_KERNELS( I32, F32, KERNEL_SOURCE_3 ) + TENSOR_GROUPNORM_KERNELS_2D( I32, F32, KERNEL_SOURCE_3 ) +}; + +/* + * Kernel params + */ +static vx_param_description_t _groupnorm_sum_sqr_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _GROUPNORM_SUM_SQR_PARAM_NUM _cnt_of_array( _groupnorm_sum_sqr_kernel_param_def ) + +static vx_param_description_t _groupnorm_mean_vari_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _GROUPNORM_MEAN_VARI_PARAM_NUM _cnt_of_array( _groupnorm_mean_vari_kernel_param_def ) + +static vx_param_description_t _groupnorm_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _GROUPNORM_PARAM_NUM _cnt_of_array( _groupnorm_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_groupnorm_sum_sqr_initializer) + ( + 
vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; + vsi_int_array_t * input_shape = NULL; + int32_t width = 0; + int32_t chn = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + input_shape = attr[0]->shape; + width = input_shape->data[0]; + chn = attr[1]->shape->data[1]; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.local_size[0] = 16; + gpu_param.local_size[1] = 1; + gpu_param.local_size[2] = 1; + gpu_param.global_size[0] = (width + 15) / 16 * 16; + gpu_param.global_size[1] = chn; + gpu_param.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + return status; +} /* _group_normalization_sum_sqr_initializer() */ + +DEF_KERNEL_INITIALIZER(_groupnorm_mean_vari_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; + int32_t chn = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + + chn = attr[0]->shape->data[1]; + + gpu_param.global_scale[0] = 4; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.local_size[0] = 16; + gpu_param.local_size[1] = 1; + gpu_param.local_size[2] = 1; + gpu_param.global_size[0] = 16; + gpu_param.global_size[1] = chn; + gpu_param.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + return status; +} /* _group_normalization_sum_sqr_initializer() */ + +DEF_KERNEL_INITIALIZER(_groupnorm_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; + vsi_int_array_t * input_shape = NULL; + int32_t width = 0; + int32_t height = 0; + int32_t chn = 0; + int32_t is2D = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &is2D); + CHECK_STATUS_FAIL_GOTO(status, final ); + + input_shape = attr[0]->shape; + width = input_shape->data[0]; + height = 
input_shape->data[1]; + chn = attr[1]->shape->data[1]; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.local_size[0] = 16; + gpu_param.local_size[1] = 1; + gpu_param.local_size[2] = 1; + gpu_param.global_size[0] = (width + 15) / 16 * 16; + gpu_param.global_size[1] = height; + gpu_param.global_size[2] = chn; + if (is2D) + { + gpu_param.global_size[0] = (width + 15) / 16 * 16; + gpu_param.global_size[1] = chn; + gpu_param.global_size[2] = 1; + } + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + return status; +} /* _groupnorm_initializer() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + const uint32_t hashkey, + _internal_kernel_e kernel_id + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + vx_kernel_initialize_f initializer = NULL; + vx_param_description_t * param_def = NULL; + const _kernel_map_type* kernel_map; + size_t kernel_map_size = 0; + size_t param_size = 0; + uint32_t i = 0; + + switch( kernel_id ) + { + case INTERNAL_KERNEL_SUM_SQR: + initializer = _groupnorm_sum_sqr_initializer; + kernel_map = _groupnorm_sum_sqr_kernel_map; + kernel_map_size = _cnt_of_array( _groupnorm_sum_sqr_kernel_map ); + param_def = _groupnorm_sum_sqr_kernel_param_def; + param_size = _GROUPNORM_SUM_SQR_PARAM_NUM; + break; + case INTERNAL_KERNEL_MEAN_VARI: + initializer = _groupnorm_mean_vari_initializer; + kernel_map = _groupnorm_mean_vari_kernel_map; + kernel_map_size = _cnt_of_array( _groupnorm_mean_vari_kernel_map ); + param_def = _groupnorm_mean_vari_kernel_param_def; + param_size = _GROUPNORM_MEAN_VARI_PARAM_NUM; + break; + case INTERNAL_KERNEL_NORM: + initializer = _groupnorm_initializer; + kernel_map = _groupnorm_kernel_map; + kernel_map_size = _cnt_of_array( _groupnorm_kernel_map ); + param_def = _groupnorm_kernel_param_def; + param_size = _GROUPNORM_PARAM_NUM; + break; + default: + VSI_ASSERT( FALSE ); + return VSI_FAILURE; + } + + for( i = 0; i < kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == hashkey ) + { + break; + } + } + if ( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + +static int32_t _optimize_gn_shape_cl + ( + vsi_nn_tensor_t ** inputs, + int32_t group_size, + int32_t group_num, + int32_t* opt_shape, + int32_t* is2D_flg + ) +{ + vsi_status status = VSI_SUCCESS; + int32_t group_shape[VSI_NN_MAX_DIM_NUM] = {0}; + int32_t new_rank = 0; + group_shape[0] = inputs[0]->attr.size[0]; + group_shape[1] = inputs[0]->attr.size[1]; + group_shape[2] = group_size; + + vsi_nn_kernel_optimize_element_shape(group_shape, 3, opt_shape, &new_rank ); + + if (opt_shape[1] == 1) + { + opt_shape[1] = group_num; + opt_shape[2] = 1; + opt_shape[3] = inputs[0]->attr.dim_num > 3 ? 
inputs[0]->attr.size[3] : 1; + is2D_flg[0] = 1; + } + else if (new_rank == 2) + { + opt_shape[2] = group_num; + opt_shape[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1; + } + else + { + status = VSI_FAILURE; + } + + return status; +} + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ +#define INTERNAL_KERNEL_SIZE (2) +#define SUM_SQR_INDEX (0) +#define MEAN_VARI_INDEX (1) + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t sum_sqr_node_params[_GROUPNORM_SUM_SQR_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t mean_vari_node_params[_GROUPNORM_MEAN_VARI_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t node_params[_GROUPNORM_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_t tmp_node = NULL, tmp_node1 = NULL; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_kernel_dtype_e in0_dtype = U8; + vsi_nn_kernel_dtype_e out_dtype = U8; + vsi_nn_tensor_attr_t attr; + vsi_nn_kernel_t * ikernels[INTERNAL_KERNEL_SIZE] = { NULL }; + vsi_nn_tensor_t * tensors[INTERNAL_KERNEL_SIZE] = { NULL }; + vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL; + int32_t new_shape[VSI_NN_MAX_DIM_NUM] = { 1, 1, 1, 1 }; + int32_t is2D_flg = 0; + uint32_t hashkeys[INTERNAL_KERNEL_SIZE] = { 0 }; + uint32_t hashkey = 0; + int32_t i = 0; + float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); + int32_t group_num = vsi_nn_kernel_param_get_int32( params, "group_num" ); + int32_t group_size = inputs[0]->attr.size[2] / group_num; + + int32_t width = inputs[0]->attr.size[0]; + int32_t height = inputs[0]->attr.size[1]; + int32_t group_stride = 1; + float input_zp = 0; + float input_scale = 1.0f; + int32_t input_fl = 0; + float output_zp = 0; + float output_scale = 1.0f; + int32_t output_fl = 0; + float rSpaceOrg = 1.0f / (width * height); + float group_ratio = 1.0f / (inputs[0]->attr.size[0] * inputs[0]->attr.size[1] * group_size); + + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + status = _optimize_gn_shape_cl(inputs, group_size, group_num, new_shape, &is2D_flg); + if ( VSI_SUCCESS != status ) + { + goto final; + } + rs_input = vsi_nn_kernel_tensor_reshape(inputs[0]->t, new_shape, 4); + rs_output = vsi_nn_kernel_tensor_reshape(outputs[0]->t, new_shape, 4); + + width = new_shape[0]; + height = is2D_flg > 0 ? 
1 : new_shape[1]; + group_stride = ((width + 15) / 16) * 4; + + if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) + { + input_zp = (float)inputs[0]->attr.dtype.zero_point; + input_scale = inputs[0]->attr.dtype.scale; + } + else if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) + { + input_fl = inputs[0]->attr.dtype.fl; + if (input_fl > 0) + { + input_scale = (1.0f / ((float) ((int64_t)1 << input_fl))); + } + else + { + input_scale = ((float) ((int64_t)1 << -input_fl)); + } + input_zp = 0.0f; + } + + if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) + { + output_zp = (float)outputs[0]->attr.dtype.zero_point; + output_scale = 1.0f / outputs[0]->attr.dtype.scale; + } + else if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) + { + output_fl = outputs[0]->attr.dtype.fl; + if (output_fl > 0) + { + output_scale = (float)((int64_t)1 << output_fl); + } + else + { + output_scale = (1.0f / (float)((int64_t)1 << -output_fl)); + } + output_zp = 0.0f; + } + + for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ ) + { + ikernels[i] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_CL ); + // Assign unique_id + ikernels[i]->unique_id = kernel->unique_id; + } + + memset( &attr, 0, sizeof(vsi_nn_tensor_attr_t) ); + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; + attr.is_const = FALSE; + attr.vtl = TRUE; + attr.size[0] = ((new_shape[0] + 15) / 16) * 4; + attr.size[1] = group_num; + attr.size[2] = 1; + attr.size[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1; + attr.dim_num = 4; + tensors[SUM_SQR_INDEX] = vsi_nn_CreateTensor( graph, &attr ); + + attr.size[0] = 4; + tensors[MEAN_VARI_INDEX] = vsi_nn_CreateTensor( graph, &attr ); + + in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + if (in0_dtype == F16) + { + in0_dtype = F32; + } + if (out_dtype == F16) + { + out_dtype = F32; + } + + hashkeys[SUM_SQR_INDEX]= HASH_GROUPNORM_SUM_SQR_KEY( in0_dtype, F32, is2D_flg ); + hashkeys[MEAN_VARI_INDEX]= HASH_GROUPNORM_MEAN_VARI_KEY( F32, F32 ); + hashkey = HASH_GROUPNORM_KEY( in0_dtype, out_dtype, is2D_flg ); + + status = _query_kernel( ikernels[SUM_SQR_INDEX], hashkeys[SUM_SQR_INDEX], INTERNAL_KERNEL_SUM_SQR ); + if ( VSI_SUCCESS != status ) + { + goto final; + } + status = _query_kernel( ikernels[MEAN_VARI_INDEX], hashkeys[MEAN_VARI_INDEX], INTERNAL_KERNEL_MEAN_VARI ); + if ( VSI_SUCCESS != status ) + { + goto final; + } + status = _query_kernel( kernel, hashkey, INTERNAL_KERNEL_NORM ); + if ( VSI_SUCCESS != status ) + { + goto final; + } + + // Sum Sqr + tmp_node = vsi_nn_kernel_create_node( graph, ikernels[SUM_SQR_INDEX] ); + if (tmp_node) + { + uint32_t index = 0; + sum_sqr_node_params[index++] = rs_input; + sum_sqr_node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[SUM_SQR_INDEX]->t; + sum_sqr_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); + sum_sqr_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &is2D_flg ); + sum_sqr_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_zp ); + sum_sqr_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale ); + sum_sqr_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width ); + sum_sqr_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height ); + + status = vsi_nn_kernel_node_pass_param( tmp_node, sum_sqr_node_params, + _GROUPNORM_SUM_SQR_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( 
&sum_sqr_node_params[2] ); + vsi_nn_kernel_scalar_release( &sum_sqr_node_params[3] ); + vsi_nn_kernel_scalar_release( &sum_sqr_node_params[4] ); + vsi_nn_kernel_scalar_release( &sum_sqr_node_params[5] ); + vsi_nn_kernel_scalar_release( &sum_sqr_node_params[6] ); + vsi_nn_kernel_scalar_release( &sum_sqr_node_params[7] ); + vsi_nn_kernel_node_release( &tmp_node ); + } + + // mean vari + tmp_node1 = vsi_nn_kernel_create_node( graph, ikernels[MEAN_VARI_INDEX] ); + if (tmp_node1) + { + uint32_t index = 0; + mean_vari_node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[SUM_SQR_INDEX]->t; + mean_vari_node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[MEAN_VARI_INDEX]->t; + mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); + mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &group_ratio ); + mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &group_stride ); + + status = vsi_nn_kernel_node_pass_param( tmp_node1, mean_vari_node_params, + _GROUPNORM_MEAN_VARI_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &mean_vari_node_params[2] ); + vsi_nn_kernel_scalar_release( &mean_vari_node_params[3] ); + vsi_nn_kernel_scalar_release( &mean_vari_node_params[4] ); + vsi_nn_kernel_node_release( &tmp_node1 ); + } + + // Normalization + node = vsi_nn_kernel_create_node( graph, kernel ); + if (node) + { + uint32_t index = 0; + int32_t pStride = 0; + if (!is2D_flg) + { + pStride = inputs[1]->attr.size[0] / new_shape[1]; + rSpaceOrg = 1.0f / (new_shape[0] / pStride); + } + node_params[index++] = rs_input; + node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[1]->t; + node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[2]->t; + node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[MEAN_VARI_INDEX]->t; + node_params[index++] = rs_output; + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &is2D_flg ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_zp ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &output_zp ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &rSpaceOrg ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pStride ); + + status = vsi_nn_kernel_node_pass_param( node, node_params, + _GROUPNORM_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &node_params[5] ); + vsi_nn_kernel_scalar_release( &node_params[6] ); + vsi_nn_kernel_scalar_release( &node_params[7] ); + vsi_nn_kernel_scalar_release( &node_params[8] ); + vsi_nn_kernel_scalar_release( &node_params[9] ); + vsi_nn_kernel_scalar_release( &node_params[10] ); + vsi_nn_kernel_scalar_release( &node_params[11] ); + vsi_nn_kernel_scalar_release( &node_params[12] ); + vsi_nn_kernel_scalar_release( &node_params[13] ); + vsi_nn_kernel_scalar_release( &node_params[14] ); + } + + /* Pass parameters to node.
*/ +final: + if (rs_input) + { + vsi_nn_kernel_tensor_release( &rs_input ); + } + if (rs_output) + { + vsi_nn_kernel_tensor_release( &rs_output ); + } + for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ ) + { + if ( ikernels[i] ) + { + vsi_nn_kernel_release( &ikernels[i] ); + } + if ( tensors[i] ) + { + vsi_nn_ReleaseTensor( &tensors[i] ); + } + } +#undef INTERNAL_KERNEL_SIZE +#undef SUM_SQR_INDEX +#undef MEAN_VARI_INDEX + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( group_norm, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/moments_cl.c b/src/tim/vx/internal/src/kernel/cl/moments_cl.c index d258e39..59e3efa 100644 --- a/src/tim/vx/internal/src/kernel/cl/moments_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/moments_cl.c @@ -176,19 +176,19 @@ static int32_t get_moments_output_reshape_size } sizes[3] = out_dims_num > 3 ? output_size[3] : 1; - if(axis_num == 1 && axis[0] == 0) + if (axis_num == 1 && axis[0] == 0) { sizes[0] = output_size[1]; sizes[1] = out_dims_num > 2 ? output_size[2] : 1; out_rs_flg = 1; } - else if(axis_num == 1 && axis[0] == 1) + else if (axis_num == 1 && axis[0] == 1) { sizes[0] = output_size[0]; sizes[1] = out_dims_num > 2 ? output_size[2] : 1; out_rs_flg = 1; } - else if(axis_num == 2 && axis[0] == 0 && axis[1] == 1) + else if (axis_num == 2 && axis[0] == 0 && axis[1] == 1) { sizes[0] = out_dims_num > 2 ? output_size[2] : 1; out_rs_flg = 1; @@ -240,25 +240,25 @@ DEF_KERNEL_INITIALIZER(_moments_initializer) gpu_param.global_scale[0] = 1; gpu_param.global_scale[1] = 1; gpu_param.global_scale[2] = 1; - if(axis_num == 1 && axis == 0) + if (axis_num == 1 && axis == 0) { gpu_param.global_size[0] = gpu_align_p2((height + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4); gpu_param.global_size[1] = chn; } - else if(axis_num == 1 && axis == 1) + else if (axis_num == 1 && axis == 1) { gpu_param.global_size[0] = gpu_align_p2((width + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4); gpu_param.global_size[1] = chn; } - else if(axis_num == 1 && axis == 2) + else if (axis_num == 1 && axis == 2) { gpu_param.global_size[0] = gpu_align_p2((width + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4); gpu_param.global_size[1] = height; } - else if(axis_num == 2) + else if (axis_num == 2) { gpu_param.local_size[0] = 16; gpu_param.local_size[1] = 1; @@ -266,7 +266,7 @@ DEF_KERNEL_INITIALIZER(_moments_initializer) gpu_param.global_size[0] = 16; gpu_param.global_size[1] = chn; } - else if(axis_num == 3) + else if (axis_num == 3) { gpu_param.local_size[0] = 16; gpu_param.local_size[1] = 1; @@ -315,13 +315,13 @@ static vsi_status _query_kernel for( i = 0; i < _cnt_of_array(moments_map); i ++ ) { - if( moments_map[i].key == key ) + if ( moments_map[i].key == key ) { break; } } - if( i < _cnt_of_array(moments_map) ) + if ( i < _cnt_of_array(moments_map) ) { snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", moments_map[i].function_name ); kernel->info.parameters = _moments_kernel_param_def; @@ -354,6 +354,7 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t node_params[_MOMENTS_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_t node = NULL; int32_t out_shape[VSI_NN_MAX_DIM_NUM] = {0}; + int32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; int32_t out_rs_flg = 0; int32_t axis_num = 0; size_t axis_num_temp = 0; @@ -362,6 +363,7 @@ static vsi_nn_kernel_node_t _setup int32_t first_axis = axis[0]; int32_t i = 0; vsi_nn_kernel_scalar_t scalar_list[INTERNAL_MOMENTS_SCALAR_NUM] = {NULL}; + vsi_nn_kernel_tensor_t reshape_tensors[3] = { NULL 
}; int32_t width = inputs[0]->attr.size[0]; int32_t height = inputs[0]->attr.size[1]; @@ -372,7 +374,7 @@ static vsi_nn_kernel_node_t _setup axis_num = (int32_t)axis_num_temp; - if(inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) + if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) { if (inputs[0]->attr.dtype.fl > 0) { @@ -385,38 +387,52 @@ static vsi_nn_kernel_node_t _setup input_zp = 0; } - if(axis_num == 1 && axis[0] == 0) + if (axis_num == 1 && axis[0] == 0) { dim_ratio = (float)1.0 / (float)(width); } - else if(axis_num == 1 && axis[0] == 1) + else if (axis_num == 1 && axis[0] == 1) { dim_ratio = (float)1.0 / (float)(height); } - else if(axis_num == 1 && axis[0] == 2) + else if (axis_num == 1 && axis[0] == 2) { dim_ratio = (float)1.0 / (float)(chn); } - else if(axis_num == 2 && axis[0] == 0 && axis[1] == 1) + else if (axis_num == 2 && axis[0] == 0 && axis[1] == 1) { dim_ratio = (float)1.0 / (float)(width * height); } - else if(axis_num == 3) + else if (axis_num == 3) { dim_ratio = (float)1.0 / (float)(width * height * chn); } - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; } - if(keep_dim) + if (keep_dim) { out_rs_flg = get_moments_output_reshape_size(&outputs[0], out_shape, axis, axis_num); } + if (inputs[0]->attr.dim_num < 2) + { + shape[0] = inputs[0]->attr.size[0]; + shape[1] = 1; + reshape_tensors[0] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shape, 2 ); + } + if (outputs[0]->attr.dim_num < 2) + { + shape[0] = outputs[0]->attr.size[0]; + shape[1] = 1; + reshape_tensors[1] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shape, 2 ); + reshape_tensors[2] = vsi_nn_kernel_tensor_reshape( outputs[1]->t, shape, 2 ); + } + scalar_list[AXIS] = vsi_nn_kernel_scalar_create( graph, I32, &first_axis ); scalar_list[AXIS_NUM] = vsi_nn_kernel_scalar_create( graph, I32, &axis_num ); scalar_list[ZP] = vsi_nn_kernel_scalar_create( graph, I32, &input_zp ); @@ -427,19 +443,31 @@ static vsi_nn_kernel_node_t _setup scalar_list[DIMRATIO] = vsi_nn_kernel_scalar_create( graph, F32, &dim_ratio ); status = _query_kernel( inputs, outputs, kernel, params, axis, axis_num, 0 ); - if( VSI_SUCCESS == status) + if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) + if ( node ) { uint32_t index = 0; /* Pass parameters to node. 
*/ - node_params[index++] = (vsi_nn_kernel_node_param_t)(inputs[0]->t); - if(out_rs_flg) + if (reshape_tensors[0]) + { + node_params[index++] = reshape_tensors[0]; + } + else + { + node_params[index++] = (vsi_nn_kernel_node_param_t)(inputs[0]->t); + } + if (out_rs_flg) { node_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, out_shape, 4 ); node_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[1]->t, out_shape, 4 ); } + else if (reshape_tensors[1]) + { + node_params[index++] = reshape_tensors[1]; + node_params[index++] = reshape_tensors[2]; + } else { node_params[index++] = (vsi_nn_kernel_node_param_t)(outputs[0]->t); @@ -455,7 +483,7 @@ static vsi_nn_kernel_node_t _setup node_params[index++] = scalar_list[DIMRATIO]; status = vsi_nn_kernel_node_pass_param( node, node_params, _MOMENTS_PARAM_NUM ); CHECK_STATUS(status); - if(out_rs_flg) + if (out_rs_flg) { vsi_nn_kernel_tensor_release( &node_params[1] ); vsi_nn_kernel_tensor_release( &node_params[2] ); @@ -465,10 +493,22 @@ static vsi_nn_kernel_node_t _setup } } + if (reshape_tensors[0]) + { + vsi_nn_kernel_tensor_release( &reshape_tensors[0] ); + } + if (reshape_tensors[1]) + { + vsi_nn_kernel_tensor_release( &reshape_tensors[1] ); + } + if (reshape_tensors[2]) + { + vsi_nn_kernel_tensor_release( &reshape_tensors[2] ); + } /* Pass parameters to node. */ for( i = 0; i < INTERNAL_MOMENTS_SCALAR_NUM; i ++ ) { - if(scalar_list[i]) + if (scalar_list[i]) { vsi_nn_kernel_scalar_release( &scalar_list[i] ); } diff --git a/src/tim/vx/internal/src/kernel/cl/one_hot_cl.c b/src/tim/vx/internal/src/kernel/cl/one_hot_cl.c new file mode 100644 index 0000000..bfbb653 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/one_hot_cl.c @@ -0,0 +1,332 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_dtype_util.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +typedef enum +{ + INTERNAL_KERNEL_ONE_HOT, +} _internal_kernel_e; + +#define _ONE_HOT_KERNEL_SOURCE "one_hot" + +// Add kernel hashtable here +#define HASH_ONE_HOT_SH_KERNEL_NAME(SRC_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("cl.one_hot_"#SRC_TYPE"to"#DST_TYPE) + +#define ONE_HOT_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \ + (( IN_DTYPE << 8 ) | ( OUT_DTYPE )) + +#define PACK_ONE_HOT_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \ +{ ONE_HOT_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \ + HASH_ONE_HOT_SH_KERNEL_NAME( IN_DTYPE, OUT_DTYPE ), \ + _ONE_HOT_KERNEL_SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _one_hot_kernel_map[] = +{ + // Register kernel here + PACK_ONE_HOT_KERNEL_MAP( F32, F32 ), + PACK_ONE_HOT_KERNEL_MAP( I32, I32 ), + PACK_ONE_HOT_KERNEL_MAP( I32, F32 ), + PACK_ONE_HOT_KERNEL_MAP( I32, U8 ), + PACK_ONE_HOT_KERNEL_MAP( U8, U8 ), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _one_hot_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define SCALAR_INPUT_DEPTH (2) +#define SCALAR_INPUT_ON_VALUE (3) +#define SCALAR_INPUT_OFF_VALUE (4) +#define SCALAR_INPUT_SCALE (5) +#define SCALAR_INPUT_TAIL (6) +#define _ONE_HOT_PARAM_NUM _cnt_of_array( _one_hot_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_one_hot_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 2, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0} // globalWorkSize: image size in thread + }; + + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; + vsi_int_array_t * in_shape = NULL; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + in_shape = attr[0]->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_size[0] = gpu_align_p2( + (in_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = in_shape->data[1]; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if ( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(attr[0]); + SAFE_FREE_TENSOR_ATTR(attr[1]); +#undef SAFE_FREE_TENSOR_ATTR + + return status; +} /* _one_hot_initializer() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + 
vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _one_hot_kernel_map; + size_t kernel_map_size = _cnt_of_array( _one_hot_kernel_map ); + vx_param_description_t * param_def = _one_hot_kernel_param_def; + vx_kernel_initialize_f initializer = _one_hot_initializer; + + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (in_dtype == F16) + { + in_dtype = F32; + } + + if (out_dtype == F16) + { + out_dtype = F32; + } + else if (out_dtype == I16 || out_dtype == I8) + { + out_dtype = I32; + } + + key = ONE_HOT_HASH_KEY( in_dtype, out_dtype ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _one_hot_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_ONE_HOT_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_tensor_t* rs_tensors[2] = { NULL }; + int32_t shape[2][VSI_NN_MAX_DIM_NUM] = {{ 0 }}; + int32_t i = 0; + int32_t num_elements = vsi_nn_vxGetTensorElementNum(&inputs[0]->attr); + int32_t prefix_dim_size = 1; + int32_t suffix_dim_size = 0; + int32_t depth = vsi_nn_kernel_param_get_int32( params, "depth" ); + vsi_nn_kernel_dtype_e out_dtype; + uint32_t data[2] = {0}; + float on_value = vsi_nn_kernel_param_get_float32( params, "on_value" ); + float off_value = vsi_nn_kernel_param_get_float32( params, "off_value" ); + int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); + float inputScale = inputs[0]->attr.dtype.scale; + float inputTail = (float)inputs[0]->attr.dtype.zero_point * inputScale; + + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (out_dtype != F32 && out_dtype != F16) + { + vsi_nn_Float32ToDtype(on_value, (uint8_t*)&data[0], &outputs[0]->attr.dtype); + vsi_nn_Float32ToDtype(off_value, (uint8_t*)&data[1], &outputs[0]->attr.dtype); + } + else + { + data[0] = *(uint32_t*)&on_value; + data[1] = *(uint32_t*)&off_value; + } + + axis = axis == -1 ? 
(int32_t)inputs[0]->attr.dim_num : (int32_t)inputs[0]->attr.dim_num - axis; + for (i = 0; i < axis; i++) + { + prefix_dim_size *= inputs[0]->attr.size[i]; + } + + suffix_dim_size = num_elements / prefix_dim_size; + + shape[0][0] = suffix_dim_size; + shape[0][1] = prefix_dim_size; + shape[1][0] = suffix_dim_size; + shape[1][1] = depth; + shape[1][2] = prefix_dim_size; + + rs_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], (uint32_t*)shape[0], 2 ); + rs_tensors[1] = vsi_nn_reshape_tensor( graph, + outputs[0], (uint32_t*)shape[1], 3 ); + + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)rs_tensors[1]->attr.size, + rs_tensors[1]->attr.dim_num ) ) + { + return NULL; + } + + status = _query_kernel( kernel, inputs, outputs ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _ONE_HOT_PARAM_NUM, + &rs_tensors[0], input_num, &rs_tensors[1], output_num ); + node_params[SCALAR_INPUT_DEPTH] = vsi_nn_kernel_scalar_create( + graph, I32, &depth ); + node_params[SCALAR_INPUT_ON_VALUE] = vsi_nn_kernel_scalar_create( + graph, U32, &data[0] ); + node_params[SCALAR_INPUT_OFF_VALUE] = vsi_nn_kernel_scalar_create( + graph, U32, &data[1] ); + node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create( + graph, F32, &inputScale ); + node_params[SCALAR_INPUT_TAIL] = vsi_nn_kernel_scalar_create( + graph, F32, &inputTail ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _ONE_HOT_PARAM_NUM ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + } + +final: + if (rs_tensors[0]) + { + vsi_nn_ReleaseTensor( &rs_tensors[0] ); + } + + if (rs_tensors[1]) + { + vsi_nn_ReleaseTensor( &rs_tensors[1] ); + } + + for (i = SCALAR_INPUT_DEPTH; i < _ONE_HOT_PARAM_NUM; i++) + { + if (node_params[i]) + { + vsi_nn_kernel_scalar_release( &node_params[i] ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( one_hot, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cl/reducemax_internal_cl.c b/src/tim/vx/internal/src/kernel/cl/reducemax_internal_cl.c index ee5b0a4..6f112bd 100644 --- a/src/tim/vx/internal/src/kernel/cl/reducemax_internal_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/reducemax_internal_cl.c @@ -178,11 +178,19 @@ static vsi_status _query_kernel { in_dtype = F32; } + else if (I16 == in_dtype && inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_NONE) + { + in_dtype = I32; + } if (F16 == out_dtype) { out_dtype = F32; } + else if (I16 == out_dtype && outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_NONE) + { + out_dtype = I32; + } key = HASH_REDUCEMAX_HASH_KEY( axis, in_dtype, out_dtype, image_2d ); diff --git a/src/tim/vx/internal/src/kernel/cl/repeat_cl.c b/src/tim/vx/internal/src/kernel/cl/repeat_cl.c new file mode 100644 index 0000000..d133782 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/repeat_cl.c @@ -0,0 +1,407 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above 
copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +#define KERNEL_SOURCE_1 "repeat" + +// Add kernel hashtable here + +#define HASH_REPEAT_KERNEL_NAME(SRC0_TYPE, AXIS) \ + CVIVANTE_NAMESPACE("cl.repeat_"#SRC0_TYPE"_axis"#AXIS) + +#define HASH_REPEAT_KERNEL_1D_NAME(SRC0_TYPE) \ + CVIVANTE_NAMESPACE("cl.repeat_"#SRC0_TYPE"_1D") + +// Add kernel hashtable here +#define HASH_REPEAT_KEY(_input0_type, _output_type, _is1d, _axis) \ + ((_input0_type << 24) | (_output_type << 16) | (_is1d << 8) | _axis) + +#define TENSOR_REPEAT_KERNELS(IN0_TYPE, OUT_TYPE, AXIS, SOURCE) \ + { HASH_REPEAT_KEY(IN0_TYPE, OUT_TYPE, 0, AXIS), \ + HASH_REPEAT_KERNEL_NAME(IN0_TYPE, AXIS), \ + SOURCE }, + +#define TENSOR_REPEAT_1D_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_REPEAT_KEY(IN0_TYPE, OUT_TYPE, 1, 0), \ + HASH_REPEAT_KERNEL_1D_NAME(IN0_TYPE), \ + SOURCE }, + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _repeat_kernel_map[] = +{ + // Register kernel here + TENSOR_REPEAT_KERNELS( I32, I32, 0, KERNEL_SOURCE_1 ) + TENSOR_REPEAT_KERNELS( I32, I32, 1, KERNEL_SOURCE_1 ) + TENSOR_REPEAT_KERNELS( I32, I32, 2, KERNEL_SOURCE_1 ) + TENSOR_REPEAT_KERNELS( F32, F32, 0, KERNEL_SOURCE_1 ) + TENSOR_REPEAT_KERNELS( F32, F32, 1, KERNEL_SOURCE_1 ) + TENSOR_REPEAT_KERNELS( F32, F32, 2, KERNEL_SOURCE_1 ) + + TENSOR_REPEAT_1D_KERNELS( I32, I32, KERNEL_SOURCE_1 ) + TENSOR_REPEAT_1D_KERNELS( F32, F32, KERNEL_SOURCE_1 ) +}; + +/* + * Kernel params + */ +static vx_param_description_t _repeat_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kernel parameters here +}; +#define _REPEAT_PARAM_NUM _cnt_of_array( _repeat_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_repeat_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; +
vsi_int_array_t * input_shape = NULL; + int32_t height = 0, width = 0, chn = 0; + int32_t is1d = 0; + int32_t axis = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &axis); + CHECK_STATUS_FAIL_GOTO(status, final ); + + input_shape = attr[0]->shape; + width = input_shape->data[0]; + height = input_shape->data[1]; + if (height == 1 && input_shape->size == 2) + { + is1d = 1; + } + chn = input_shape->size > 2 ? input_shape->data[2] : 1; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.global_size[0] = width; + gpu_param.global_size[1] = height; + gpu_param.global_size[2] = chn; + if (is1d || axis == 1) + { + gpu_param.global_size[0] = 1; + } + else if (axis == 0) + { + gpu_param.global_size[1] = 1; + } + else if (axis == 2) + { + gpu_param.global_size[2] = 1; + } + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + return status; +} /* _repeat_initializer() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t axis + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e output_dtype = U8; + int32_t is1d = inputs[0]->attr.dim_num == 1 ? 1 : 0; + uint32_t key = 0; + int i = 0; + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (input0_dtype == F16) + { + input0_dtype = F32; + } + if (output_dtype == F16) + { + output_dtype = F32; + } + + key = HASH_REPEAT_KEY( input0_dtype, output_dtype, is1d, axis ); + + for( i = 0; i < _cnt_of_array(_repeat_kernel_map); i ++ ) + { + if ( _repeat_kernel_map[i].key == key ) + { + break; + } + } + + if ( i < _cnt_of_array(_repeat_kernel_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _repeat_kernel_map[i].function_name ); + kernel->info.parameters = _repeat_kernel_param_def; + kernel->info.numParams = _REPEAT_PARAM_NUM; + kernel->info.initialize = _repeat_initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + _repeat_kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + _repeat_kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static int32_t _optimize_repeat_shape + ( + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + int32_t* axis, + int32_t* opt_shape_in, + int32_t* opt_shape_out, + int32_t* new_rank + ) +{ + vsi_status status = VSI_SUCCESS; + + if (inputs[0]->attr.dim_num == 1) + { + opt_shape_in[0] = inputs[0]->attr.size[0]; + opt_shape_in[1] = 1; + opt_shape_out[0] = outputs[0]->attr.size[0]; + opt_shape_out[1] = 1; + new_rank[0] = 2; + new_rank[1] = 2; + } + else if (axis[0] == 3) + { + vsi_nn_kernel_optimize_element_shape( (int32_t*)inputs[0]->attr.size, 3, opt_shape_in, new_rank ); + if (opt_shape_in[1] == 1) + { + opt_shape_in[1] = inputs[0]->attr.size[3]; + opt_shape_out[0] = opt_shape_in[0]; + opt_shape_out[1] = 
outputs[0]->attr.size[3]; + axis[0] = 0; + new_rank[0] = 2; + new_rank[1] = 2; + } + else if (new_rank[0] == 2) + { + opt_shape_in[2] = inputs[0]->attr.size[3]; + opt_shape_out[0] = opt_shape_in[0]; + opt_shape_out[1] = opt_shape_in[1]; + opt_shape_out[2] = outputs[0]->attr.size[3]; + axis[0] = 2; + new_rank[0] = 3; + new_rank[1] = 3; + } + else + { + status = VSI_FAILURE; + } + } + + return status; +} + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_REPEAT_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_kernel_tensor_t rs_input = NULL, rs_input1 = NULL, rs_output = NULL; + int32_t new_shape[2][VSI_NN_MAX_DIM_NUM] = {{ 1, 1, 1, 1 }, { 1, 1, 1, 1 }}; + int32_t new_rank[2] = {0, 0}; + int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); + + int32_t width = inputs[0]->attr.size[0]; + int32_t height = inputs[0]->attr.dim_num > 1 ? inputs[0]->attr.size[1] : 1; + int32_t channel = inputs[0]->attr.dim_num > 2 ? inputs[0]->attr.size[2] : 1; + + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + if (axis > 2 || outputs[0]->attr.dim_num == 1) + { + status = _optimize_repeat_shape(inputs, outputs, &axis, new_shape[0], new_shape[1], new_rank); + if ( VSI_SUCCESS != status ) + { + goto final; + } + rs_input = vsi_nn_kernel_tensor_reshape(inputs[0]->t, new_shape[0], new_rank[0]); + rs_output = vsi_nn_kernel_tensor_reshape(outputs[0]->t, new_shape[1], new_rank[1]); + + width = new_shape[0][0]; + height = new_shape[0][1]; + channel = new_rank[0] > 2 ? new_shape[0][2]: 1; + } + + if (inputs[1]->attr.dim_num == 1) + { + new_shape[0][0] = inputs[1]->attr.size[0]; + new_shape[0][1] = 1; + rs_input1 = vsi_nn_kernel_tensor_reshape(inputs[1]->t, new_shape[0], 2); + } + + status = _query_kernel( kernel, inputs, outputs, axis ); + if ( VSI_SUCCESS != status ) + { + goto final; + } + + node = vsi_nn_kernel_create_node( graph, kernel ); + if (node) + { + uint32_t index = 0; + if (rs_input) + { + node_params[index++] = rs_input; + } + else + { + node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[0]->t; + } + if (rs_input1) + { + node_params[index++] = rs_input1; + } + else + { + node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[1]->t; + } + if (rs_output) + { + node_params[index++] = rs_output; + } + else + { + node_params[index++] = (vsi_nn_kernel_node_param_t)outputs[0]->t; + } + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &channel ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axis ); + + status = vsi_nn_kernel_node_pass_param( node, node_params, + _REPEAT_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &node_params[3] ); + vsi_nn_kernel_scalar_release( &node_params[4] ); + vsi_nn_kernel_scalar_release( &node_params[5] ); + vsi_nn_kernel_scalar_release( &node_params[6] ); + } + + /* Pass parameters to node. 
*/ +final: + if (rs_input) + { + vsi_nn_kernel_tensor_release( &rs_input ); + } + if (rs_input1) + { + vsi_nn_kernel_tensor_release( &rs_input1 ); + } + if (rs_output) + { + vsi_nn_kernel_tensor_release( &rs_output ); + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( repeat, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/sequence_mask_cl.c b/src/tim/vx/internal/src/kernel/cl/sequence_mask_cl.c new file mode 100644 index 0000000..45e606e --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/sequence_mask_cl.c @@ -0,0 +1,354 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "math.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +#define KERNEL_SOURCE_1 "sequence_mask" + +#define HASH_SEQUENCE_MASK_KEY(_input0_type, _output_type, _image_2d) \ + ((_input0_type << 24) | (_output_type << 8) | (_image_2d)) + +#define HASH_SEQUENCE_MASK_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("cl.sequence_mask_"#SRC0_TYPE"to"#DST_TYPE) + + #define HASH_SEQUENCE_MASK_SH_2DKERNEL_NAME(SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("cl.sequence_mask_"#SRC0_TYPE"to"#DST_TYPE"_2D") + +#define TENSOR_SEQUENCE_MASK_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_SEQUENCE_MASK_KEY(IN0_TYPE, OUT_TYPE, 0), \ + HASH_SEQUENCE_MASK_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \ + SOURCE }, + + #define TENSOR_SEQUENCE_MASK_2D_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_SEQUENCE_MASK_KEY(IN0_TYPE, OUT_TYPE, 1), \ + HASH_SEQUENCE_MASK_SH_2DKERNEL_NAME(IN0_TYPE, OUT_TYPE), \ + SOURCE }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } kernel_map[] = +{ + TENSOR_SEQUENCE_MASK_KERNELS(I32, U8, KERNEL_SOURCE_1) + TENSOR_SEQUENCE_MASK_KERNELS(I32, I32, KERNEL_SOURCE_1) + TENSOR_SEQUENCE_MASK_KERNELS(I32, F32, KERNEL_SOURCE_1) + TENSOR_SEQUENCE_MASK_2D_KERNELS(I32, U8, KERNEL_SOURCE_1) + TENSOR_SEQUENCE_MASK_2D_KERNELS(I32, I32, KERNEL_SOURCE_1) + TENSOR_SEQUENCE_MASK_2D_KERNELS(I32, F32, KERNEL_SOURCE_1) +}; + +/* + * Kernel params + */ +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + +#define _CL_PARAM_NUM _cnt_of_array(kernel_param_def) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_sequence_mask_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; + vsi_int_array_t * out_shape = NULL; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + + out_shape = attr[0]->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = gpu_align_p2((out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + gpu_param.global_size[2] = out_shape->size > 2 ? 
out_shape->data[2] : 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + + return status; +} /* _sequence_mask_initializer() */ + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel, + int32_t is2Dflg + ) +{ + vsi_nn_kernel_dtype_e input0_dtype = I32; + vsi_nn_kernel_dtype_e output_dtype = U8; + vsi_status status = VSI_FAILURE; + uint32_t key = 0; + int i = 0; + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + if (output_dtype == BOOL8) + { + output_dtype= U8; + } + + key = HASH_SEQUENCE_MASK_KEY( input0_dtype, output_dtype, is2Dflg ); + + for( i = 0; i < _cnt_of_array(kernel_map); i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < _cnt_of_array(kernel_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = kernel_param_def; + kernel->info.numParams = _cnt_of_array( kernel_param_def ); + kernel->info.initialize = _sequence_mask_initializer; + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + kernel_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static int32_t _optimize_mask_shape + ( + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + int32_t max_len, + int32_t* opt_shape_in, + int32_t* opt_shape_out, + int32_t* is2Dflg + ) +{ + vsi_status status = VSI_SUCCESS; + int32_t in_shape[VSI_NN_MAX_DIM_NUM] = {0}; + int32_t new_rank = 0; + uint32_t i = 0; + + for(i = 0; i < inputs[0]->attr.dim_num; i++) + { + in_shape[i] = inputs[0]->attr.size[i]; + } + + vsi_nn_kernel_optimize_element_shape( in_shape, inputs[0]->attr.dim_num, opt_shape_in, &new_rank ); + if (new_rank > 2) + { + return VSI_FAILURE; + } + + opt_shape_out[0] = max_len; + for(i = 0; i < (uint32_t)new_rank; i++) + { + opt_shape_out[i + 1] = opt_shape_in[i]; + } + if (opt_shape_out[2] == 1) + { + is2Dflg[0] = 1; + } + + return status; +} + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_CL_PARAM_NUM] = {NULL}; + vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL; + int32_t new_shape[2][VSI_NN_MAX_DIM_NUM] = {{ 1, 1, 1, 1 }, { 1, 1, 1, 1 }}; + int32_t max_len = vsi_nn_kernel_param_get_int32( params, "max_len" ); + vsi_nn_kernel_node_t node = NULL; + int32_t is2Dflg = 0; + float input_zp = 0; + float input_scale = 1.0f; + int32_t output_zp = 0; + float output_scale = 1.0f; + float input_zpScale = 0; + float outputVal1 = 1.0f; + int32_t input_fl = 0; + int32_t output_fl = 0; + + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + status = _optimize_mask_shape(inputs, outputs, max_len, new_shape[0], new_shape[1], &is2Dflg); + if ( VSI_SUCCESS != status ) + { + goto final; + } + rs_input = vsi_nn_kernel_tensor_reshape(inputs[0]->t, new_shape[0], 2); + rs_output = 
vsi_nn_kernel_tensor_reshape(outputs[0]->t, new_shape[1], 4); + + if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) + { + input_zp = (float)inputs[0]->attr.dtype.zero_point; + input_scale = inputs[0]->attr.dtype.scale; + } + else if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) + { + input_fl = inputs[0]->attr.dtype.fl; + if (input_fl > 0) + { + input_scale = (1.0f / ((float) ((int64_t)1 << input_fl))); + } + else + { + input_scale = ((float) ((int64_t)1 << -input_fl)); + } + input_zp = 0.0f; + } + + if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) + { + output_zp = outputs[0]->attr.dtype.zero_point; + output_scale = 1.0f / outputs[0]->attr.dtype.scale; + } + else if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) + { + output_fl = outputs[0]->attr.dtype.fl; + if (output_fl > 0) + { + output_scale = (float)((int64_t)1 << output_fl); + } + else + { + output_scale = (1.0f / (float)((int64_t)1 << -output_fl)); + } + output_zp = 0; + } + input_zpScale = input_scale * input_zp; + outputVal1 = output_scale + (float)output_zp; + + status = _query_kernel( inputs, outputs, kernel, is2Dflg ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + + if ( node ) + { + uint32_t index = 0; + node_params[index++] = rs_input; + node_params[index++] = rs_output; + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &max_len ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_zpScale ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &outputVal1 ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &output_zp ); + + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[2] ); + vsi_nn_kernel_scalar_release( &node_params[3] ); + vsi_nn_kernel_scalar_release( &node_params[4] ); + vsi_nn_kernel_scalar_release( &node_params[5] ); + vsi_nn_kernel_scalar_release( &node_params[6] ); + } + } + +final: + if (rs_input) + { + vsi_nn_kernel_tensor_release( &rs_input ); + } + if (rs_output) + { + vsi_nn_kernel_tensor_release( &rs_output ); + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( sequence_mask, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/slice_cl.c b/src/tim/vx/internal/src/kernel/cl/slice_cl.c new file mode 100644 index 0000000..d05a32e --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/slice_cl.c @@ -0,0 +1,308 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + +__BEGIN_DECLS + + /* + * Define kernel meta. + */ + typedef enum +{ + INTERNAL_KERNEL_SLICE, +} _internal_kernel_e; + +#define _SLICE_KERNEL_SOURCE "slice" +#define SLICE_SH_KERNEL_NAME(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ + CVIVANTE_NAMESPACE("cl.slice_"#IN0_DTYPE"_"#IN1_DTYPE"to"#OUT_DTYPE) + +// Add kernel hashtable here +#define SLICE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE , _IMAGE_2D) \ + (( IN1_DTYPE << 18 ) | ( IN0_DTYPE << 10 ) | ( OUT_DTYPE << 2 ) | (_IMAGE_2D)) + +#define PACK_KERNEL_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, SOURCE ) \ +{ SLICE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0 ), \ + SLICE_SH_KERNEL_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), SOURCE } + +#define SLICE_SH_KERNEL_2D_NAME(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ + CVIVANTE_NAMESPACE("cl.slice_"#IN0_DTYPE"_"#IN1_DTYPE"to"#OUT_DTYPE"_2D") + +#define PACK_KERNEL_MAP_2D( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, SOURCE ) \ +{ SLICE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1 ), \ + SLICE_SH_KERNEL_2D_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), SOURCE } + + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _slice_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( F32, I32, F32, _SLICE_KERNEL_SOURCE ), + PACK_KERNEL_MAP( I32, I32, I32, _SLICE_KERNEL_SOURCE ), + PACK_KERNEL_MAP( U8, I32, U8, _SLICE_KERNEL_SOURCE ), + + PACK_KERNEL_MAP_2D( F32, I32, F32, _SLICE_KERNEL_SOURCE ), + PACK_KERNEL_MAP_2D( I32, I32, I32, _SLICE_KERNEL_SOURCE ), + PACK_KERNEL_MAP_2D( U8, I32, U8, _SLICE_KERNEL_SOURCE ), +}; + +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (1) +#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) + +/* +* Kernel params +*/ +static vx_param_description_t _slice_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kernel parameters here +}; +#define _SLICE_PARAM_NUM _cnt_of_array( _slice_kernel_param_def ) +#define SCALAR_INPUT_SCALE (3) +#define SCALAR_INPUT_TAIL (4) +#define SCALAR_OUTPUT_SCALE (5) +#define SCALAR_OUTPUT_ZP (6) +/* +* Kernel initializer +*/ +DEF_KERNEL_INITIALIZER(_slice_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t
gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * output_attr = NULL; + vsi_int_array_t * out_shape = NULL; + + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + out_shape = output_attr->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.dim = (out_shape->size < 3 || 1 == out_shape->data[2]) ? 2 : 3; + gpu_param.global_size[0] = gpu_align_p2( + (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1; + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + + final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(output_attr); + return status; +} /* _slice_initializer() */ + +/* +* Query kernel +*/ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_bool image_2d + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in0_dtype; + vsi_nn_kernel_dtype_e in1_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _slice_kernel_map; + size_t kernel_map_size = _cnt_of_array( _slice_kernel_map ); + vx_param_description_t * param_def = _slice_kernel_param_def; + size_t param_def_size = _cnt_of_array( _slice_kernel_param_def ); + vx_kernel_initialize_f initializer = _slice_initializer; + + uint32_t key; + uint32_t i; + + in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (F16 == in0_dtype) + { + in0_dtype = F32; + } + + if (F16 == out_dtype) + { + out_dtype = F32; + } + + key = SLICE_HASH_KEY( in0_dtype, in1_dtype, out_dtype, image_2d ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_SLICE_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + vsi_bool image_2d = FALSE; + uint32_t rank[_IO_NUM] = {0}; + int32_t shapes[_IO_NUM][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; + vsi_nn_tensor_t* reshape_tensors[_IO_NUM] = { NULL }; + int32_t i = 0; + int32_t input_batch = inputs[0]->attr.dim_num > 3 ? 
inputs[0]->attr.size[3] : 1; + int32_t output_batch = outputs[0]->attr.dim_num > 3 ? outputs[0]->attr.size[3] : 1; + float inputScale = inputs[0]->attr.dtype.scale; + float inputTail = (float)inputs[0]->attr.dtype.zero_point * inputScale; + float outputScale = outputs[0]->attr.dtype.scale; + float outputZP = (float)outputs[0]->attr.dtype.zero_point + 0.5f; + + outputScale = vsi_abs(outputScale) < 1e-5 ? 0.0f : 1.0f / outputScale; + + vsi_nn_kernel_optimize_1d_tensor_shape( (const int32_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num, + shapes[0], &rank[0]); + vsi_nn_kernel_optimize_1d_tensor_shape( (const int32_t*)inputs[1]->attr.size, inputs[1]->attr.dim_num, + shapes[1], &rank[1]); + vsi_nn_kernel_optimize_1d_tensor_shape( (const int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num, + shapes[2], &rank[2]); + + for (i = 0; i < _INPUT_NUM; i++) + { + reshape_tensors[i] = vsi_nn_reshape_tensor( graph, + inputs[i], (uint32_t*)shapes[i], rank[i] ); + } + reshape_tensors[_INPUT_NUM] = vsi_nn_reshape_tensor( graph, + outputs[0], (uint32_t*)shapes[_INPUT_NUM], rank[_INPUT_NUM] ); + + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)reshape_tensors[0]->attr.size, + inputs[0]->attr.dim_num ) || input_batch != output_batch ) + { + return NULL; + } + + image_2d = (rank[0] < 3 || shapes[0][2] == 1); + + status = _query_kernel( kernel, inputs, outputs, image_2d ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _SLICE_PARAM_NUM, + reshape_tensors, input_num, &reshape_tensors[_INPUT_NUM], output_num ); + node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create( + graph, F32, &inputScale ); + node_params[SCALAR_INPUT_TAIL] = vsi_nn_kernel_scalar_create( + graph, F32, &inputTail ); + node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( + graph, F32, &outputScale ); + node_params[SCALAR_OUTPUT_ZP] = vsi_nn_kernel_scalar_create( + graph, F32, &outputZP ); + + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _SLICE_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TAIL] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_ZP] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + + REGISTER_BACKEND_CL( slice, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/argmax_cpu.c b/src/tim/vx/internal/src/kernel/cpu/argmax_cpu.c index be3424b..e9cb96f 100644 --- a/src/tim/vx/internal/src/kernel/cpu/argmax_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/argmax_cpu.c @@ -36,7 +36,7 @@ #include "utils/vsi_nn_dtype_util.h" #include "kernel/vsi_nn_kernel.h" #include "kernel/vsi_nn_kernel_eltwise.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" __BEGIN_DECLS #define _CPU_ARG_NUM (1) diff --git a/src/tim/vx/internal/src/kernel/cpu/argmin_cpu.c b/src/tim/vx/internal/src/kernel/cpu/argmin_cpu.c index 7b0f1dd..f6b092b 100644 --- a/src/tim/vx/internal/src/kernel/cpu/argmin_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/argmin_cpu.c @@ -36,7 +36,7 @@ #include "utils/vsi_nn_dtype_util.h" #include "kernel/vsi_nn_kernel.h" #include "kernel/vsi_nn_kernel_eltwise.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" __BEGIN_DECLS diff --git a/src/tim/vx/internal/src/kernel/cpu/axis_aligned_bbox_transform_cpu.c b/src/tim/vx/internal/src/kernel/cpu/axis_aligned_bbox_transform_cpu.c new file mode 100644 index 0000000..448eb33 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/axis_aligned_bbox_transform_cpu.c @@ -0,0 +1,279 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +#define _INPUT_NUM (4) +#define _OUTPUT_NUM (1) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.axis_aligned_bbox_transform") + +typedef struct vsi_nn_box_encoding_corner_t +{ + float x1, y1, x2, y2; +}vsi_nn_box_encoding_corner; + +typedef struct vsi_nn_box_encoding_center_t +{ + float w, h, x, y; +}vsi_nn_box_encoding_center; + +/* + * Kernel params + */ +static vx_param_description_t _axis_aligned_bbox_transform_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _AXIS_ALIGNED_BBOX_TRANSFORM_PARAM_NUM _cnt_of_array( _axis_aligned_bbox_transform_kernel_param_def ) + + +static void _to_box_encoding_corner + ( + vsi_nn_box_encoding_center* ctr, + vsi_nn_box_encoding_corner* cnr + ) +{ + cnr->x1 = ctr->x - ctr->w / 2; + cnr->y1 = ctr->y - ctr->h / 2; + cnr->x2 = ctr->x + ctr->w / 2; + cnr->y2 = ctr->y + ctr->h / 2; +} + +static void _to_box_encoding_center + ( + vsi_nn_box_encoding_corner* cnr, + vsi_nn_box_encoding_center* ctr + ) +{ + ctr->w = cnr->x2 - cnr->x1; + ctr->h = cnr->y2 - cnr->y1; + ctr->x = (cnr->x1 + cnr->x2) / 2; + ctr->y = (cnr->y1 + cnr->y2) / 2; +} + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; + size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + size_t out_elements[_OUTPUT_NUM] = {0}; + size_t out_bytes[_OUTPUT_NUM] = {0}; + uint32_t i; + const uint32_t roiLength = 4; + const uint32_t imageLength = 2; + uint32_t numClasses = 0; + uint32_t numRois = 0; + uint32_t j; + uint32_t roiIndex; + + /* prepare data */ + for (i = 0; i < _INPUT_NUM; i ++) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); + + } + for (i = 0; i < _OUTPUT_NUM; i ++) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + out_bytes[i] = out_elements[i] * sizeof(float); + f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_bytes[i] ); + } + + numClasses = in_attr[1]->shape->data[0] / roiLength; + numRois = in_attr[0]->shape->data[1]; + + for (roiIndex = 0; roiIndex < numRois; roiIndex++) + { + uint32_t batchIndex = (uint32_t)f32_in_buffer[2][roiIndex]; + float imageHeight = f32_in_buffer[3][batchIndex * imageLength]; + float imageWidth = f32_in_buffer[3][batchIndex * imageLength + 1]; + vsi_nn_box_encoding_corner roi_cnr; + vsi_nn_box_encoding_center roiBefore; + roi_cnr.x1 = 
f32_in_buffer[0][roiIndex * roiLength]; + roi_cnr.y1 = f32_in_buffer[0][roiIndex * roiLength + 1]; + roi_cnr.x2 = f32_in_buffer[0][roiIndex * roiLength + 2]; + roi_cnr.y2 = f32_in_buffer[0][roiIndex * roiLength + 3]; + _to_box_encoding_center(&roi_cnr, &roiBefore); + + for (j = 0; j < numClasses; j++) + { + vsi_nn_box_encoding_center roi_ctr; + vsi_nn_box_encoding_corner roiAfter; + vsi_nn_box_encoding_corner cliped; + uint32_t index = (roiIndex * numClasses + j) * roiLength; + + roi_ctr.w = (float)(exp(f32_in_buffer[1][index + 2]) * roiBefore.w); + roi_ctr.h = (float)(exp(f32_in_buffer[1][index + 3]) * roiBefore.h); + roi_ctr.x = roiBefore.x + f32_in_buffer[1][index] * roiBefore.w; + roi_ctr.y = roiBefore.y + f32_in_buffer[1][index + 1] * roiBefore.h; + _to_box_encoding_corner(&roi_ctr, &roiAfter); + + cliped.x1 = vsi_nn_min(vsi_nn_max(roiAfter.x1, 0.0f), imageWidth); + cliped.y1 = vsi_nn_min(vsi_nn_max(roiAfter.y1, 0.0f), imageHeight); + cliped.x2 = vsi_nn_min(vsi_nn_max(roiAfter.x2, 0.0f), imageWidth); + cliped.y2 = vsi_nn_min(vsi_nn_max(roiAfter.y2, 0.0f), imageHeight); + f32_out_buffer[0][index] = cliped.x1; + f32_out_buffer[0][index + 1] = cliped.y1; + f32_out_buffer[0][index + 2] = cliped.x2; + f32_out_buffer[0][index + 3] = cliped.y2; + } + } + + /* save data */ + for(i = 0; i < _OUTPUT_NUM; i++) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + for (i = 0; i < _INPUT_NUM; i++) + { + if (f32_in_buffer[i]) + { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + if (in_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + for (i = 0; i < _OUTPUT_NUM; i++) + { + if (f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _axis_aligned_bbox_transform_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _axis_aligned_bbox_transform_kernel_param_def ); + status = VSI_SUCCESS; + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_AXIS_ALIGNED_BBOX_TRANSFORM_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + + status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _AXIS_ALIGNED_BBOX_TRANSFORM_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. 
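The compute loop above decodes each class delta against the ROI in center form and clips the result to the image bounds. A self-contained sketch of the same arithmetic, using hypothetical names and plain math.h calls instead of the ovxlib helpers:

/* Illustrative only, not part of the patched sources. */
#include <math.h>

typedef struct { float x1, y1, x2, y2; } corner_box;

static corner_box decode_and_clip(corner_box prior, const float delta[4],
                                  float image_w, float image_h)
{
    /* corners -> center/size */
    float w  = prior.x2 - prior.x1;
    float h  = prior.y2 - prior.y1;
    float cx = 0.5f * (prior.x1 + prior.x2);
    float cy = 0.5f * (prior.y1 + prior.y2);

    /* apply deltas: translation scaled by the prior size, size scaled by exp() */
    float nx = cx + delta[0] * w;
    float ny = cy + delta[1] * h;
    float nw = (float)exp(delta[2]) * w;
    float nh = (float)exp(delta[3]) * h;

    /* center/size -> corners, clipped to the image */
    corner_box out;
    out.x1 = fminf(fmaxf(nx - nw * 0.5f, 0.0f), image_w);
    out.y1 = fminf(fmaxf(ny - nh * 0.5f, 0.0f), image_h);
    out.x2 = fminf(fmaxf(nx + nw * 0.5f, 0.0f), image_w);
    out.y2 = fminf(fmaxf(ny + nh * 0.5f, 0.0f), image_h);
    return out;
}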
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _AXIS_ALIGNED_BBOX_TRANSFORM_PARAM_NUM ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( axis_aligned_bbox_transform, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/batchnorm_single_cpu.c b/src/tim/vx/internal/src/kernel/cpu/batchnorm_single_cpu.c index cd4f594..ca6164b 100644 --- a/src/tim/vx/internal/src/kernel/cpu/batchnorm_single_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/batchnorm_single_cpu.c @@ -34,7 +34,7 @@ #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" #include "kernel/vsi_nn_kernel.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel_eltwise.h" __BEGIN_DECLS diff --git a/src/tim/vx/internal/src/kernel/cpu/comparisons_cpu.c b/src/tim/vx/internal/src/kernel/cpu/comparisons_cpu.c index f64f102..8397b30 100644 --- a/src/tim/vx/internal/src/kernel/cpu/comparisons_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/comparisons_cpu.c @@ -32,7 +32,7 @@ #include "vsi_nn_prv.h" #include "vsi_nn_error.h" #include "kernel/vsi_nn_kernel.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" __BEGIN_DECLS @@ -164,8 +164,8 @@ DEF_KERNEL_EXECUTOR(_comparisons_exec) buffer[2][i] = (float)data; } - status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], - buffer[1], out_elements ); + status = vsi_nn_kernel_tensor_write_from_float( tensors[2], attr[2], + buffer[2], out_elements ); CHECK_STATUS_FAIL_GOTO( status, final ); final: diff --git a/src/tim/vx/internal/src/kernel/cpu/conv1d_ovxlib_cpu.c b/src/tim/vx/internal/src/kernel/cpu/conv1d_ovxlib_cpu.c new file mode 100644 index 0000000..f1b1b9e --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/conv1d_ovxlib_cpu.c @@ -0,0 +1,264 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.conv1d_ovxlib") + +/* + * Kernel params + */ +static vx_param_description_t _conv1d_ovxlib_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _CONV1D_OVXLIB_PARAM_NUM _cnt_of_array( _conv1d_ovxlib_kernel_param_def ) +#define _IO_COUNT (4) + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + int i = 0; + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t tensors[_IO_COUNT] = { NULL }; + vsi_nn_kernel_tensor_attr_t* attr[_IO_COUNT] = { NULL }; + float* buffer[_IO_COUNT] = { NULL }; + int32_t stride = 0; + int32_t pad_front = 0; + int32_t pad_end = 0; + int32_t dilation = 0; + int32_t overflow_policy = 0; + int32_t rounding_policy = 0; + int32_t down_scale_size_rounding = 0; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; + tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; + tensors[3] = (vsi_nn_kernel_tensor_t)param[3]; + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); + attr[3] = vsi_nn_kernel_tensor_attr_create( tensors[3] ); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final ); + buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create input buffer fail.", final ); + buffer[2] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[2], attr[2], TRUE ); + buffer[3] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[3], attr[3], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[3], "Create input buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &stride); + CHECK_STATUS_FAIL_GOTO(status, final); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &pad_front); + CHECK_STATUS_FAIL_GOTO(status, final); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &pad_end); + CHECK_STATUS_FAIL_GOTO(status, final); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &dilation); + CHECK_STATUS_FAIL_GOTO(status, final); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &overflow_policy); + CHECK_STATUS_FAIL_GOTO(status, final); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[9], &rounding_policy); + CHECK_STATUS_FAIL_GOTO(status, final); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &down_scale_size_rounding); + CHECK_STATUS_FAIL_GOTO(status, final); + + { + int32_t batch = attr[0]->shape->data[2]; + int32_t input_channel = attr[0]->shape->data[1]; + int32_t 
input_height = attr[0]->shape->data[0]; + int32_t kernel_size = attr[1]->shape->data[0]; + int32_t output_channel = attr[1]->shape->data[2]; + int32_t output_height = attr[3]->shape->data[0]; + int32_t batch_index = 0; + int32_t input_channel_index = 0; + int32_t output_channel_index = 0; + int32_t output_h_index = 0; + + for(batch_index = 0; batch_index < batch; batch_index++) + { + float* per_batch_input = buffer[0] + batch_index * input_channel * input_height; + float* per_batch_output = buffer[3] + batch_index * output_channel * output_height; + for(output_channel_index = 0; output_channel_index < output_channel; output_channel_index++) + { + float* filter = buffer[1] + output_channel_index * input_channel * kernel_size; + for(output_h_index = 0; output_h_index < output_height; output_h_index++) + { + float output_value = 0.; + float* current_value_ptr = per_batch_input + output_h_index * stride; + + for(input_channel_index = 0; input_channel_index < input_channel; input_channel_index++) + { + int k = 0; + int32_t index = 0; + for(k = 0; k < kernel_size; k++) + { + float w = *(filter + input_channel_index * kernel_size + k); + float v = *(current_value_ptr + input_channel_index * input_height + index); + + output_value += w * v; + index += dilation; + } + } + + if(buffer[2]) + { + output_value += buffer[2][output_channel_index]; + } + + *(per_batch_output + output_channel_index * output_height + output_h_index) = output_value; + } + } + } + status = vsi_nn_kernel_tensor_write_from_float( tensors[3], attr[3], + buffer[3], batch * output_channel * output_height ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + for( i = 0; i < _IO_COUNT; i ++ ) + { + if( buffer[i] ) + { + free( buffer[i] ); + } + vsi_nn_kernel_tensor_attr_release( &attr[i] ); + } + + return status; + +} /* _compute() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _conv1d_ovxlib_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _conv1d_ovxlib_kernel_param_def ); + + return VSI_SUCCESS; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_CONV1D_OVXLIB_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + int j = 0; + + int32_t stride = vsi_nn_kernel_param_get_int32( params, "stride" ); + int32_t pad_front = vsi_nn_kernel_param_get_int32( params, "pad_front" ); + int32_t pad_end = vsi_nn_kernel_param_get_int32( params, "pad_end" ); + int32_t dilation = vsi_nn_kernel_param_get_int32( params, "dilation" ); + int32_t overflow_policy = vsi_nn_kernel_param_get_int32( params, "overflow_policy" ); + int32_t rounding_policy = vsi_nn_kernel_param_get_int32( params, "rounding_policy" ); + int32_t down_scale_size_rounding = vsi_nn_kernel_param_get_int32( params, "down_scale_size_rounding" ); + + status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( 
node_params, _CONV1D_OVXLIB_PARAM_NUM, + inputs, input_num, outputs, output_num ); + j = (int)(input_num + output_num); + node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &stride ); + node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &pad_front ); + node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &pad_end ); + node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &dilation ); + node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &overflow_policy ); + node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &rounding_policy ); + node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &down_scale_size_rounding ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _CONV1D_OVXLIB_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[--j] ); + vsi_nn_kernel_scalar_release( &node_params[--j] ); + vsi_nn_kernel_scalar_release( &node_params[--j] ); + vsi_nn_kernel_scalar_release( &node_params[--j] ); + vsi_nn_kernel_scalar_release( &node_params[--j] ); + vsi_nn_kernel_scalar_release( &node_params[--j] ); + vsi_nn_kernel_scalar_release( &node_params[--j] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( conv1d_ovxlib, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/depth2space_internal_cpu.c b/src/tim/vx/internal/src/kernel/cpu/depth2space_internal_cpu.c index 138c6e4..aa96ba3 100644 --- a/src/tim/vx/internal/src/kernel/cpu/depth2space_internal_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/depth2space_internal_cpu.c @@ -35,7 +35,7 @@ #include "vsi_nn_error.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" __BEGIN_DECLS diff --git a/src/tim/vx/internal/src/kernel/cpu/eltwise_unary_cpu.c b/src/tim/vx/internal/src/kernel/cpu/eltwise_unary_cpu.c index 03c1711..64f9490 100644 --- a/src/tim/vx/internal/src/kernel/cpu/eltwise_unary_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/eltwise_unary_cpu.c @@ -32,7 +32,7 @@ #include "vsi_nn_prv.h" #include "vsi_nn_error.h" #include "kernel/vsi_nn_kernel.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" __BEGIN_DECLS @@ -46,6 +46,7 @@ typedef enum UNARY_NEG, UNARY_HSIGMOID, UNARY_MISH, + UNARY_ROUND, } unary_type_e; @@ -101,6 +102,13 @@ static float mish_eval(float data) return data; } +static float round_eval(float data) +{ + data = (float)(vsi_rtne(data)); + + return data; +} + DEF_KERNEL_EXECUTOR(_eltwise_unary_exec) ( vsi_nn_kernel_node_t node, @@ -165,6 +173,9 @@ DEF_KERNEL_EXECUTOR(_eltwise_unary_exec) case UNARY_MISH: data = mish_eval(data); break; + case UNARY_ROUND: + data = round_eval(data); + break; default: break; } @@ -298,3 +309,4 @@ REGISTER_ELTWISE_UNARY_BACKEND_CPU( elu, UNARY_ELU ) REGISTER_ELTWISE_UNARY_BACKEND_CPU( neg, UNARY_NEG ) REGISTER_ELTWISE_UNARY_BACKEND_CPU( hard_sigmoid, UNARY_HSIGMOID ) REGISTER_ELTWISE_UNARY_BACKEND_CPU( mish, UNARY_MISH ) +REGISTER_ELTWISE_UNARY_BACKEND_CPU( round, UNARY_ROUND ) \ No newline at end of file diff --git a/src/tim/vx/internal/src/kernel/cpu/erf_cpu.c b/src/tim/vx/internal/src/kernel/cpu/erf_cpu.c new file mode 100644 index 0000000..07f8e82 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/erf_cpu.c @@ -0,0 +1,229 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and 
associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.erf") + + +/* + * Kernel params + */ +static vx_param_description_t _erf_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _ERF_PARAM_NUM _cnt_of_array( _erf_kernel_param_def ) + + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; + size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + size_t out_elements[_OUTPUT_NUM] = {0}; + size_t out_bytes[_OUTPUT_NUM] = {0}; + size_t i = 0; + + /* prepare data */ + for (i = 0; i < _INPUT_NUM; i ++) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); + } + + for (i = 0; i < _OUTPUT_NUM; i ++) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + out_bytes[i] = out_elements[i] * sizeof(float); + f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_bytes[i] ); + } +#define ERF_PI 3.141592653589793 + for (i = 0; i < out_elements[0]; i ++) + { + /* 2 / sqrt(pi) * (sum[(-1)^n! 
* x ^ (2n + 1)] + x) */ + float x = f32_in_buffer[0][i]; + float res = 0; + float tmp = x; + float factorial = 1; /*n!*/ + float x_pow = x; + int32_t one = 1; + int32_t n = 1; + + while (vsi_abs(tmp) > 1e-5) + { + res += tmp; + + factorial *= n; + one *= -1; + x_pow *= x * x; + tmp = one / factorial * x_pow / ( 2 * n + 1); + + n ++; + } + + + res *= 2.0f / (float)sqrt(ERF_PI); + + f32_out_buffer[0][i] = res; + } + + /* save data */ + for(i = 0; i < _OUTPUT_NUM; i++) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + for (i = 0; i < _INPUT_NUM; i++) + { + if (f32_in_buffer[i]) + { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + if (in_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + for (i = 0; i < _OUTPUT_NUM; i++) + { + if (f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _erf_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _erf_kernel_param_def ); + status = VSI_SUCCESS; + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_ERF_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + + + status = _query_kernel( kernel, inputs, outputs); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _ERF_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. 
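The comment in the erf kernel above writes the Maclaurin series in a compressed form; the loop actually evaluates erf(x) = 2/sqrt(pi) * sum over n >= 0 of (-1)^n * x^(2n+1) / (n! * (2n+1)), stopping once a term falls below 1e-5. A standalone restatement of that series, for reference only:

/* Reference sketch of the same series the CPU kernel evaluates; not part of the patch. */
#include <math.h>

static float erf_series(float x)
{
    const float two_over_sqrt_pi = 1.1283791671f;  /* 2 / sqrt(pi) */
    float sum = 0.0f;
    float term = x;          /* n = 0 term: x / (0! * 1) */
    float factorial = 1.0f;
    float x_pow = x;
    float sign = 1.0f;
    int n = 1;

    while (fabsf(term) > 1e-5f)
    {
        sum += term;
        factorial *= (float)n;      /* n! */
        sign = -sign;               /* (-1)^n */
        x_pow *= x * x;             /* x^(2n+1) */
        term = sign * x_pow / (factorial * (float)(2 * n + 1));
        n++;
    }
    return sum * two_over_sqrt_pi;
}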
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _ERF_PARAM_NUM ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( erf, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/gather_cpu.c b/src/tim/vx/internal/src/kernel/cpu/gather_cpu.c index c234a51..076b6b8 100644 --- a/src/tim/vx/internal/src/kernel/cpu/gather_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/gather_cpu.c @@ -35,7 +35,7 @@ #include "vsi_nn_error.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" __BEGIN_DECLS diff --git a/src/tim/vx/internal/src/kernel/cpu/gather_nd_cpu.c b/src/tim/vx/internal/src/kernel/cpu/gather_nd_cpu.c index cb22732..3d912b8 100644 --- a/src/tim/vx/internal/src/kernel/cpu/gather_nd_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/gather_nd_cpu.c @@ -35,7 +35,7 @@ #include "vsi_nn_error.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" __BEGIN_DECLS diff --git a/src/tim/vx/internal/src/kernel/cpu/group_normalization_cpu.c b/src/tim/vx/internal/src/kernel/cpu/group_normalization_cpu.c new file mode 100644 index 0000000..17f45d7 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/group_normalization_cpu.c @@ -0,0 +1,315 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vsi_nn_vxkernel.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +#define _CPU_ARG_NUM (2) +#define _CPU_INPUT_NUM (3) +#define _CPU_OUTPUT_NUM (1) +#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) +#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.group_norm") + +DEF_KERNEL_EXECUTOR(_group_norm_exec) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; + float * buffer[_CPU_IO_NUM] = { NULL }; + size_t out_elements = 0; + vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; + uint32_t i = 0; + int32_t spaceOrg = 0; + float eps = .0f; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; + tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; + tensors[3] = (vsi_nn_kernel_tensor_t)param[3]; + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); + attr[3] = vsi_nn_kernel_tensor_attr_create( tensors[3] ); + CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", final ); + + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[3] ); + + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[4], &eps); + CHECK_STATUS_FAIL_GOTO(status, final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &spaceOrg); + CHECK_STATUS_FAIL_GOTO(status, final ); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); + + buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create input1 buffer fail.", final ); + + buffer[2] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[2], attr[2], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[2], "Create input1 buffer fail.", final ); + + buffer[3] = (float *)malloc( out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( buffer[3], "Create output buffer fail.", final ); + memset( buffer[3], 0, out_elements * sizeof(float) ); + + { + uint32_t b = 0, c = 0; + uint32_t height = attr[0]->shape->data[1]; + uint32_t width = attr[0]->shape->data[0]; + uint32_t ch = attr[0]->shape->size > 2 ? attr[0]->shape->data[2] : 1; + uint32_t bh = attr[0]->shape->size > 3 ? 
attr[0]->shape->data[3] : 1; + uint32_t spatial = height * width; + + for (b = 0; b < bh; b++) + { + for (c = 0; c < ch; c++) + { + uint32_t page = c * spatial + b * (spatial * ch); + uint32_t paraIdx = c * attr[1]->shape->data[0]; + float sum = .0f; + float sumsq = .0f; + float mean = .0f; + float vari = .0f; + float data = 0; + + for (i = 0; i < spatial; i++) + { + uint32_t index = page + i; + sum += buffer[0][index]; + } + + mean = sum / spatial; + for (i = 0; i < spatial; i++) + { + uint32_t index = page + i; + data = buffer[0][index] - mean; + sumsq += data * data; + } + + vari = sumsq / spatial; + vari = (float)(1.0 / sqrtf(vari + eps)); + + for (i = 0; i < spatial; i++) + { + float normVal = 0; + uint32_t index = page + i; + uint32_t tmpIdx = paraIdx + i / spaceOrg; + float scaleVal = buffer[2][tmpIdx]; + float biasVal = buffer[1][tmpIdx]; + + data = buffer[0][index] - mean; + normVal = data * vari * scaleVal + biasVal; + buffer[3][index] = normVal; + } + } + } + } + + status = vsi_nn_kernel_tensor_write_from_float( tensors[3], attr[3], + buffer[3], out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + for( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if ( buffer[i] ) + { + free( buffer[i] ); + } + } + for( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if (attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } + } + return status; +} /* _group_norm_exec() */ +/* + * Kernel params + */ +static vx_param_description_t _group_normalization_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _GROUP_NORMALIZATION_PARAM_NUM _cnt_of_array( _group_normalization_kernel_param_def ) + +static const vx_kernel_description_t _kernel_info = +{ + KERNEL_ID_PLACEHOLDER, + _KERNEL_NAME, + _group_norm_exec, + _group_normalization_kernel_param_def, + _cnt_of_array( _group_normalization_kernel_param_def ), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel + ) +{ + memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) ); + return VSI_SUCCESS; +} /* _query_kernel() */ + +static int32_t _optimize_gn_shape_cpu + ( + vsi_nn_tensor_t ** inputs, + int32_t group_size, + int32_t group_num, + int32_t* opt_shape + ) +{ + vsi_status status = VSI_SUCCESS; + int32_t group_shape[VSI_NN_MAX_DIM_NUM] = {0}; + int32_t new_rank = 0; + group_shape[0] = inputs[0]->attr.size[0]; + group_shape[1] = inputs[0]->attr.size[1]; + group_shape[2] = group_size; + + vsi_nn_kernel_optimize_element_shape(group_shape, 3, opt_shape, &new_rank ); + + if (new_rank == 2) + { + opt_shape[2] = group_num; + opt_shape[3] = inputs[0]->attr.dim_num > 3 ? 
inputs[0]->attr.size[3] : 1; + } + else + { + status = VSI_FAILURE; + } + + return status; +} + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL; + int32_t new_shape[VSI_NN_MAX_DIM_NUM] = { 1, 1, 1, 1 }; + int32_t group_num = vsi_nn_kernel_param_get_int32( params, "group_num" ); + int32_t group_size = inputs[0]->attr.size[2] / group_num; + int32_t spaceOrg = inputs[0]->attr.size[0] * inputs[0]->attr.size[1]; + + status = _optimize_gn_shape_cpu(inputs, group_size, group_num, new_shape); + if ( VSI_SUCCESS != status ) + { + goto final; + } + rs_input = vsi_nn_kernel_tensor_reshape(inputs[0]->t, new_shape, 4); + rs_output = vsi_nn_kernel_tensor_reshape(outputs[0]->t, new_shape, 4); + + status = _query_kernel( inputs, outputs, kernel ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); + uint32_t index = 0; + /* Set inputs and outputs */ + backend_params[index++] = rs_input; + backend_params[index++] = (vsi_nn_kernel_node_param_t)inputs[1]->t; + backend_params[index++] = (vsi_nn_kernel_node_param_t)inputs[2]->t; + backend_params[index++] = rs_output; + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &spaceOrg ); + + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); + CHECK_STATUS( status ); + vsi_nn_kernel_scalar_release( &backend_params[4] ); + vsi_nn_kernel_scalar_release( &backend_params[5] ); + } + else + { + status = VSI_FAILURE; + } + } +final: + if (rs_input) + { + vsi_nn_kernel_tensor_release( &rs_input ); + } + if (rs_output) + { + vsi_nn_kernel_tensor_release( &rs_output ); + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( group_norm, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/instance_normalization_cpu.c b/src/tim/vx/internal/src/kernel/cpu/instance_normalization_cpu.c index 6720a14..c9b665c 100644 --- a/src/tim/vx/internal/src/kernel/cpu/instance_normalization_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/instance_normalization_cpu.c @@ -35,7 +35,7 @@ #include "vsi_nn_error.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" __BEGIN_DECLS diff --git a/src/tim/vx/internal/src/kernel/cpu/layer_normalization_cpu.c b/src/tim/vx/internal/src/kernel/cpu/layer_normalization_cpu.c index d6d9802..c8c82bf 100644 --- a/src/tim/vx/internal/src/kernel/cpu/layer_normalization_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/layer_normalization_cpu.c @@ -35,7 +35,7 @@ #include "vsi_nn_error.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" __BEGIN_DECLS @@ -143,8 +143,8 @@ DEF_KERNEL_EXECUTOR(_layer_norm_exec) { int idx = (outer * axisSize + i) * innerSize + inner; float data = buffer[0][idx] - mean; - float scaleVal = buffer[2][idx]; - float biasVal = buffer[1][idx]; + float scaleVal = buffer[2][i]; + float biasVal 
= buffer[1][i]; float normVal = data * vari * scaleVal + biasVal; buffer[3][idx] = normVal; } diff --git a/src/tim/vx/internal/src/kernel/cpu/log_softmax_cpu.c b/src/tim/vx/internal/src/kernel/cpu/log_softmax_cpu.c index 4f56938..2ef240f 100644 --- a/src/tim/vx/internal/src/kernel/cpu/log_softmax_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/log_softmax_cpu.c @@ -36,7 +36,7 @@ #include "utils/vsi_nn_dtype_util.h" #include "kernel/vsi_nn_kernel.h" #include "kernel/vsi_nn_kernel_eltwise.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" __BEGIN_DECLS #define _CPU_ARG_NUM (2) diff --git a/src/tim/vx/internal/src/kernel/cpu/matrixmul_cpu.c b/src/tim/vx/internal/src/kernel/cpu/matrixmul_cpu.c index 4e8097d..c263ff7 100644 --- a/src/tim/vx/internal/src/kernel/cpu/matrixmul_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/matrixmul_cpu.c @@ -35,7 +35,7 @@ #include "vsi_nn_error.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" __BEGIN_DECLS diff --git a/src/tim/vx/internal/src/kernel/cpu/maximum_cpu.c b/src/tim/vx/internal/src/kernel/cpu/maximum_cpu.c index 61bba5c..4795735 100644 --- a/src/tim/vx/internal/src/kernel/cpu/maximum_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/maximum_cpu.c @@ -36,7 +36,7 @@ #include "utils/vsi_nn_dtype_util.h" #include "kernel/vsi_nn_kernel.h" #include "kernel/vsi_nn_kernel_eltwise.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" __BEGIN_DECLS diff --git a/src/tim/vx/internal/src/kernel/cpu/minimum_cpu.c b/src/tim/vx/internal/src/kernel/cpu/minimum_cpu.c index 1a63797..6908a1e 100644 --- a/src/tim/vx/internal/src/kernel/cpu/minimum_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/minimum_cpu.c @@ -32,7 +32,7 @@ #include "vsi_nn_prv.h" #include "vsi_nn_error.h" #include "kernel/vsi_nn_kernel.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" __BEGIN_DECLS diff --git a/src/tim/vx/internal/src/kernel/cpu/moments_cpu.c b/src/tim/vx/internal/src/kernel/cpu/moments_cpu.c index f1124bf..ad46c58 100644 --- a/src/tim/vx/internal/src/kernel/cpu/moments_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/moments_cpu.c @@ -35,7 +35,7 @@ #include "vsi_nn_error.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" __BEGIN_DECLS diff --git a/src/tim/vx/internal/src/kernel/cpu/nms_cpu.c b/src/tim/vx/internal/src/kernel/cpu/nms_cpu.c new file mode 100644 index 0000000..8924f7b --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/nms_cpu.c @@ -0,0 +1,441 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
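The group and layer normalization kernels above share one step: statistics are taken over the normalized block, then each element is scaled and shifted by gamma and beta indexed along the normalized axis, which is exactly what the layer_normalization_cpu fix restores. A minimal sketch of that step for the per-element case; group norm applies the same formula but indexes gamma and beta per channel (tmpIdx = paraIdx + i / spaceOrg in the code above):

/* Illustrative only, not part of the patched sources. */
#include <math.h>
#include <stddef.h>

static void normalize_block(const float *x, float *y, size_t n,
                            const float *gamma, const float *beta, float eps)
{
    float sum = 0.0f, sq = 0.0f;
    size_t i;

    for (i = 0; i < n; i++) sum += x[i];
    const float mean = sum / (float)n;

    for (i = 0; i < n; i++) { float d = x[i] - mean; sq += d * d; }
    const float inv_std = 1.0f / sqrtf(sq / (float)n + eps);

    for (i = 0; i < n; i++)
        y[i] = (x[i] - mean) * inv_std * gamma[i] + beta[i];  /* per-axis-element scale/bias */
}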
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (3) + #define _CPU_IO_NUM (_INPUT_NUM + _OUTPUT_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.nms") + + +/* + * Kernel params + */ +static vx_param_description_t _nms_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define SCALAR_INPUT_MAX_SIZE (5) +#define SCALAR_INPUT_IOU_THRES (6) +#define SCALAR_INPUT_SCORE_THRES (7) +#define SCALAR_INPUT_SOFT_NMS_SIGMA (8) +#define _NMS_PARAM_NUM _cnt_of_array( _nms_kernel_param_def ) + +typedef struct Candidate_s +{ + int index; + float score; + int suppress_begin_index; +}Candidate; +static void _swap_element + ( + Candidate* list, + uint32_t first, + uint32_t second + ) +{ + Candidate temp; + memcpy(&temp, &list[first], sizeof(Candidate)); + memcpy(&list[first], &list[second], sizeof(Candidate)); + memcpy(&list[second], &temp, sizeof(Candidate)); +} + +static uint32_t _max_element + ( + Candidate* list, + uint32_t len + ) +{ + uint32_t i; + uint32_t max_index = 0; + float max_val = list[0].score; + for ( i = 1; i < len; i++ ) + { + float val = list[i].score; + if ( max_val < val ) + { + max_val = val; + max_index = i; + } + } + + return max_index; +} + +typedef struct box_corner_encoding_s +{ + float y1; + float x1; + float y2; + float x2; +}box_corner_encoding; + +static float _computeIntersectionOverUnion + ( + const float* boxes, + const int32_t i, + const int32_t j + ) +{ + box_corner_encoding box_i = ((box_corner_encoding *)boxes)[i]; + box_corner_encoding box_j = ((box_corner_encoding *)boxes)[j]; + const float box_i_y_min = vsi_nn_min(box_i.y1, box_i.y2); + const float box_i_y_max = vsi_nn_max(box_i.y1, box_i.y2); + const float box_i_x_min = vsi_nn_min(box_i.x1, box_i.x2); + const float box_i_x_max = vsi_nn_max(box_i.x1, box_i.x2); + const float box_j_y_min = vsi_nn_min(box_j.y1, box_j.y2); + const float box_j_y_max = vsi_nn_max(box_j.y1, box_j.y2); + const float box_j_x_min = vsi_nn_min(box_j.x1, box_j.x2); + const float box_j_x_max = vsi_nn_max(box_j.x1, box_j.x2); + + const float area_i = + (box_i_y_max - box_i_y_min) * 
(box_i_x_max - box_i_x_min); + const float area_j = + (box_j_y_max - box_j_y_min) * (box_j_x_max - box_j_x_min); + const float intersection_ymax = vsi_nn_min(box_i_y_max, box_j_y_max); + const float intersection_xmax = vsi_nn_min(box_i_x_max, box_j_x_max); + const float intersection_ymin = vsi_nn_max(box_i_y_min, box_j_y_min); + const float intersection_xmin = vsi_nn_max(box_i_x_min, box_j_x_min); + const float intersection_area = + vsi_nn_max(intersection_ymax - intersection_ymin, 0.0f) * + vsi_nn_max(intersection_xmax - intersection_xmin, 0.0f); + + if (area_i <= 0 || area_j <= 0) + { + return 0.0f; + } + + return intersection_area / (area_i + area_j - intersection_area); +} + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VX_SUCCESS; + vsi_nn_kernel_tensor_t tensors[_INPUT_NUM] = { NULL }; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float * buffer[_INPUT_NUM] = { NULL }; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + size_t stride_size[_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{0}}; + size_t out_elements[_OUTPUT_NUM] = {0}; + vsi_nn_kernel_tensor_attr_t * attr[_INPUT_NUM] = { NULL }; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; + int32_t i = 0; + int32_t num_boxes = 0; + float* boxes = NULL; + float* scores = NULL; + float* selected_indices = NULL; + float* selected_scores = NULL; + float* num_selected_indices = NULL; + Candidate * candidate = NULL; + int32_t select_size = 0; + int32_t max_output_size = 0; + int32_t select_start = 0; + int32_t select_len = 0; + float iou_threshold = 0.f; + float score_threshold = 0.f; + float soft_nms_sigma = 0.f; + float scale = 0; + int32_t num_outputs = 0; + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_MAX_SIZE], + &max_output_size); + status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_IOU_THRES], + &iou_threshold); + status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_SCORE_THRES], + &score_threshold); + status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_SOFT_NMS_SIGMA], + &soft_nms_sigma); + CHECK_STATUS_FAIL_GOTO(status, final ); + + for ( i = 0; i < _INPUT_NUM; i++) + { + tensors[i] = (vsi_nn_kernel_tensor_t)param[i]; + attr[i] = vsi_nn_kernel_tensor_attr_create( tensors[i] ); + + vsi_nn_kernel_tensor_attr_get_stride( attr[i], stride_size[i] ); + buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[i], attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[i], "Create input buffer fail.", final ); + } + + for ( i = 0; i < _OUTPUT_NUM; i++) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + f32_out_buffer[i] = (float *)malloc( out_elements[i] * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_elements[i] * sizeof(float) ); + } + + num_boxes = attr[0]->shape->data[1]; + boxes = buffer[0]; + scores = buffer[1]; + selected_indices = f32_out_buffer[0]; + selected_scores = f32_out_buffer[1]; + num_selected_indices = f32_out_buffer[2]; + + candidate = (Candidate*)malloc(num_boxes * sizeof(Candidate)); + CHECK_PTR_FAIL_GOTO( candidate, "Create select buffer fail.", final ); + memset(candidate, 0, num_boxes * 
sizeof(Candidate)); + + for (i = 0; i < num_boxes; ++i) + { + if (scores[i] > score_threshold) + { + candidate[select_size].index = i; + candidate[select_size].score = scores[i]; + candidate[select_size].suppress_begin_index = 0; + select_size++; + } + } + + num_outputs = vsi_nn_min(select_size, max_output_size); + + if (num_outputs == 0) + { + num_selected_indices[0] = 0; + } + + if (soft_nms_sigma > 0.0f) + { + scale = -0.5f / soft_nms_sigma; + } + + select_len = 0; + while (select_len < num_outputs && select_start < select_size) + { + int32_t j = 0; + float original_score = 0; + vsi_bool should_hard_suppress = FALSE; + + // find max score and swap to the front. + int32_t max_index = _max_element( &candidate[select_start], select_size - select_start); + + if (max_index != select_size - select_start - 1) + { + _swap_element(&(candidate[select_start]), max_index, 0); + } + + original_score = candidate[select_start].score; + // Calculate IoU of the rest, swap to the end (disgard) if needed. + for ( j = select_len - 1; j >= candidate[select_start].suppress_begin_index; j-- ) + { + int32_t idx = (int32_t)selected_indices[j]; + float iou = _computeIntersectionOverUnion(boxes, candidate[select_start].index, idx); + + // First decide whether to perform hard suppression. + if (iou >= iou_threshold) + { + should_hard_suppress = TRUE; + break; + } + + // Suppress score if NMS sigma > 0. + if (soft_nms_sigma > 0.0) + { + candidate[select_start].score = + candidate[select_start].score * (float)exp(scale * iou * iou); + } + + if (candidate[select_start].score <= score_threshold) + break; + } + + candidate[select_start].suppress_begin_index = select_len; + if (!should_hard_suppress) + { + if (candidate[select_start].score == original_score) + { + // Suppression has not occurred, so select next_candidate. + selected_indices[select_len] = (float)candidate[select_start].index; + selected_scores[select_len] = candidate[select_start].score; + ++ select_len; + } + if ( candidate[select_start].score > score_threshold) + { + // Soft suppression might have occurred and current score is still + // greater than score_threshold; add next_candidate back onto priority + // queue. 
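// With scale set to -0.5f / soft_nms_sigma, the score update above is the
// Gaussian soft-NMS weight exp(-iou^2 / (2*sigma)), applied only when the box
// is not hard-suppressed by iou_threshold. A compact restatement of that
// rescoring rule follows; here hard suppression is modeled as returning a zero
// score, whereas the kernel above tracks it with a flag. Illustrative only.
#include <math.h>

static float soft_nms_rescore(float score, float iou,
                              float iou_threshold, float soft_nms_sigma)
{
    if (iou >= iou_threshold)
    {
        return 0.0f;                               /* hard suppression */
    }
    if (soft_nms_sigma > 0.0f)
    {
        float scale = -0.5f / soft_nms_sigma;      /* same scale as the kernel */
        score *= (float)exp(scale * iou * iou);    /* exp(-iou^2 / (2*sigma)) */
    }
    return score;
}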
+ candidate[select_start].suppress_begin_index = select_len; + } + } + + select_start ++; + } + + num_selected_indices[0] = (float)select_len; + + for ( i = select_len; i < max_output_size; i++) + { + selected_indices[i] = 0; + selected_scores[i] = 0; + } + + /* save data */ + for ( i = 0; i < _OUTPUT_NUM; i++ ) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + vsi_nn_safe_free(candidate); + for( i = 0; i < _INPUT_NUM; i ++ ) + { + if ( buffer[i] ) + { + free( buffer[i] ); + } + vsi_nn_kernel_tensor_attr_release( &attr[i] ); + } + + for ( i = 0; i < _OUTPUT_NUM; i++ ) + { + if (f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _nms_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _nms_kernel_param_def ); + status = VSI_SUCCESS; + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_NMS_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + int32_t max_output_size = vsi_nn_kernel_param_get_int32(params, "max_output_size"); + float iou_threshold = vsi_nn_kernel_param_get_float32(params, "iou_threshold"); + float score_threshold = vsi_nn_kernel_param_get_float32(params, "score_threshold"); + float soft_nms_sigma = vsi_nn_kernel_param_get_float32(params, "soft_nms_sigma"); + + status = _query_kernel( kernel, inputs, outputs ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _NMS_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. 
*/ + node_params[SCALAR_INPUT_MAX_SIZE] = vsi_nn_kernel_scalar_create( + graph, I32, &max_output_size ); + node_params[SCALAR_INPUT_IOU_THRES] = vsi_nn_kernel_scalar_create( + graph, F32, &iou_threshold ); + node_params[SCALAR_INPUT_SCORE_THRES] = vsi_nn_kernel_scalar_create( + graph, F32, &score_threshold ); + node_params[SCALAR_INPUT_SOFT_NMS_SIGMA] = vsi_nn_kernel_scalar_create( + graph, F32, &soft_nms_sigma ); + status = vsi_nn_kernel_node_pass_param( node, node_params, _NMS_PARAM_NUM ); + + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_MAX_SIZE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_IOU_THRES] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCORE_THRES] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SOFT_NMS_SIGMA] ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( nms, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/one_hot_cpu.c b/src/tim/vx/internal/src/kernel/cpu/one_hot_cpu.c new file mode 100644 index 0000000..6a46178 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/one_hot_cpu.c @@ -0,0 +1,252 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + #define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.one_hot") + + +/* + * Kernel params + */ +static vx_param_description_t _one_hot_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + +#define INPUT_SCALAR_DEPTH (2) +#define INPUT_SCALAR_ON_VALUE (3) +#define INPUT_SCALAR_OFF_VALUE (4) +#define INPUT_SCALAR_AXIS (5) +#define _ONE_HOT_PARAM_NUM _cnt_of_array( _one_hot_kernel_param_def ) + + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t tensors[_IO_NUM] = { NULL }; + float * buffer[_IO_NUM] = { NULL }; + size_t out_elements = 0; + vsi_nn_kernel_tensor_attr_t * attr[_IO_NUM] = { NULL }; + int32_t i = 0; + int32_t j = 0; + int32_t k = 0; + int32_t index = 0; + int32_t depth = 0; + float on_value = 0; + float off_value = 0; + int32_t axis = 0; + int32_t prefix_dim_size = 1; + int32_t suffix_dim_size = 0; + int32_t num_elements = 0; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &depth); + status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[3], &on_value); + status |= vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[4], &off_value); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &axis); + CHECK_STATUS_FAIL_GOTO(status, final ); + + num_elements = (int32_t)vsi_nn_kernel_tensor_attr_get_size( attr[0] ); + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final ); + + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] ); + buffer[1] = (float *)malloc( out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final ); + memset( buffer[1], 0, out_elements * sizeof(float) ); + + axis = axis == -1 ? (int32_t)attr[0]->shape->size : (int32_t)attr[0]->shape->size - axis; + + for (i = 0; i < axis; i++) + { + prefix_dim_size *= attr[0]->shape->data[i]; + } + + suffix_dim_size = num_elements / prefix_dim_size; + + for (i = 0; i < prefix_dim_size; i++) + { + for (j = 0; j < depth; j++) + { + for (k = 0; k < suffix_dim_size; k++) + { + int32_t value = (int32_t)buffer[0][i * suffix_dim_size + k]; + buffer[1][index ++] = value == j ? 
on_value : off_value; + } + } + } + + status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], + buffer[1], out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if ( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(attr[0]); + SAFE_FREE_TENSOR_ATTR(attr[1]); +#undef SAFE_FREE_TENSOR_ATTR + for ( i = 0; i < _IO_NUM; i ++ ) + { + if ( buffer[i] ) + { + free( buffer[i] ); + buffer[i] = NULL; + } + } + + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _one_hot_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _one_hot_kernel_param_def ); + status = VSI_SUCCESS; + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_ONE_HOT_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t depth = vsi_nn_kernel_param_get_int32( params, "depth" ); + float on_value = vsi_nn_kernel_param_get_float32( params, "on_value" ); + float off_value = vsi_nn_kernel_param_get_float32( params, "off_value" ); + int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); + + status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _ONE_HOT_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[INPUT_SCALAR_DEPTH] = vsi_nn_kernel_scalar_create( + graph, I32, &depth ); + node_params[INPUT_SCALAR_ON_VALUE] = vsi_nn_kernel_scalar_create( + graph, F32, &on_value ); + node_params[INPUT_SCALAR_OFF_VALUE] = vsi_nn_kernel_scalar_create( + graph, F32, &off_value ); + node_params[INPUT_SCALAR_AXIS] = vsi_nn_kernel_scalar_create( + graph, I32, &axis ); + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _ONE_HOT_PARAM_NUM ); + CHECK_STATUS_FAIL_GOTO( status, OnError ); + } + } +OnError: + if (node_params[INPUT_SCALAR_DEPTH]) + { + vsi_nn_kernel_scalar_release( &node_params[INPUT_SCALAR_DEPTH] ); + } + + if (node_params[INPUT_SCALAR_ON_VALUE]) + { + vsi_nn_kernel_scalar_release( &node_params[INPUT_SCALAR_ON_VALUE] ); + } + + if (node_params[INPUT_SCALAR_OFF_VALUE]) + { + vsi_nn_kernel_scalar_release( &node_params[INPUT_SCALAR_OFF_VALUE] ); + } + + if (node_params[INPUT_SCALAR_AXIS]) + { + vsi_nn_kernel_scalar_release( &node_params[INPUT_SCALAR_AXIS] ); + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( one_hot, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/pow_cpu.c b/src/tim/vx/internal/src/kernel/cpu/pow_cpu.c index 1f7c2eb..902d40e 100644 --- a/src/tim/vx/internal/src/kernel/cpu/pow_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/pow_cpu.c @@ -32,7 +32,7 @@ #include "vsi_nn_prv.h" #include "vsi_nn_error.h" #include "kernel/vsi_nn_kernel.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" __BEGIN_DECLS diff --git a/src/tim/vx/internal/src/kernel/cpu/pre_process_bgra_cpu.c b/src/tim/vx/internal/src/kernel/cpu/pre_process_bgra_cpu.c index 3be8fc9..d31f2fc 100644 --- a/src/tim/vx/internal/src/kernel/cpu/pre_process_bgra_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/pre_process_bgra_cpu.c @@ -32,7 +32,7 @@ #include "vsi_nn_prv.h" #include "vsi_nn_error.h" #include "kernel/vsi_nn_kernel.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" __BEGIN_DECLS diff --git a/src/tim/vx/internal/src/kernel/cpu/pre_process_gray_cpu.c b/src/tim/vx/internal/src/kernel/cpu/pre_process_gray_cpu.c index 644add0..c615f68 100644 --- a/src/tim/vx/internal/src/kernel/cpu/pre_process_gray_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/pre_process_gray_cpu.c @@ -32,7 +32,7 @@ #include "vsi_nn_prv.h" #include "vsi_nn_error.h" #include "kernel/vsi_nn_kernel.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" __BEGIN_DECLS diff --git a/src/tim/vx/internal/src/kernel/cpu/pre_process_nv12_cpu.c b/src/tim/vx/internal/src/kernel/cpu/pre_process_nv12_cpu.c index 2417d0e..1e4d48d 100644 --- a/src/tim/vx/internal/src/kernel/cpu/pre_process_nv12_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/pre_process_nv12_cpu.c @@ -32,7 +32,7 @@ #include "vsi_nn_prv.h" #include "vsi_nn_error.h" #include "kernel/vsi_nn_kernel.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" __BEGIN_DECLS diff --git a/src/tim/vx/internal/src/kernel/cpu/pre_process_rgb_cpu.c b/src/tim/vx/internal/src/kernel/cpu/pre_process_rgb_cpu.c index 2be7273..972172f 100644 --- a/src/tim/vx/internal/src/kernel/cpu/pre_process_rgb_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/pre_process_rgb_cpu.c @@ -32,7 +32,7 @@ #include "vsi_nn_prv.h" #include "vsi_nn_error.h" #include "kernel/vsi_nn_kernel.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" __BEGIN_DECLS diff --git a/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv420_cpu.c b/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv420_cpu.c index 6749f29..8132778 100644 --- a/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv420_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv420_cpu.c @@ -32,7 +32,7 @@ #include "vsi_nn_prv.h" #include "vsi_nn_error.h" #include "kernel/vsi_nn_kernel.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" 
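For reference, the expansion performed by the cpu.one_hot executor above can be read as a small standalone routine. The sketch below is illustrative only and is not part of this patch: the helper name one_hot_expand_f32 and its flat-array interface are assumptions made for the example. It mirrors the kernel's traversal, viewing the input as [prefix_dim_size x suffix_dim_size] and emitting [prefix_dim_size x depth x suffix_dim_size].

#include <stdlib.h>

/* Illustrative sketch, not part of this patch: expand integer class indices
 * into a one-hot tensor the same way the cpu.one_hot executor does.
 * Returns a malloc'd buffer of prefix_dim_size * depth * suffix_dim_size
 * floats, or NULL on allocation failure. */
static float * one_hot_expand_f32
    (
    const float * indices,   /* flattened input, prefix_dim_size * suffix_dim_size values */
    int prefix_dim_size,
    int suffix_dim_size,
    int depth,
    float on_value,
    float off_value
    )
{
    int i, j, k, out_idx = 0;
    size_t count = (size_t)prefix_dim_size * depth * suffix_dim_size;
    float * out = (float *)malloc( count * sizeof(float) );
    if ( NULL == out )
    {
        return NULL;
    }
    for ( i = 0; i < prefix_dim_size; i++ )
    {
        for ( j = 0; j < depth; j++ )
        {
            for ( k = 0; k < suffix_dim_size; k++ )
            {
                int value = (int)indices[i * suffix_dim_size + k];
                out[out_idx++] = ( value == j ) ? on_value : off_value;
            }
        }
    }
    return out;
}

For example, indices {0, 2} with prefix_dim_size 2, suffix_dim_size 1, depth 3, on_value 1 and off_value 0 produce {1, 0, 0, 0, 0, 1}.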
__BEGIN_DECLS diff --git a/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv444_cpu.c b/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv444_cpu.c index 6894957..a19e5ae 100644 --- a/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv444_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv444_cpu.c @@ -32,7 +32,7 @@ #include "vsi_nn_prv.h" #include "vsi_nn_error.h" #include "kernel/vsi_nn_kernel.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" __BEGIN_DECLS diff --git a/src/tim/vx/internal/src/kernel/cpu/prelu_cpu.c b/src/tim/vx/internal/src/kernel/cpu/prelu_cpu.c index fa433dc..b7e97c2 100644 --- a/src/tim/vx/internal/src/kernel/cpu/prelu_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/prelu_cpu.c @@ -32,7 +32,7 @@ #include "vsi_nn_prv.h" #include "vsi_nn_error.h" #include "kernel/vsi_nn_kernel.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" __BEGIN_DECLS diff --git a/src/tim/vx/internal/src/kernel/cpu/random_multinomial_cpu.c b/src/tim/vx/internal/src/kernel/cpu/random_multinomial_cpu.c index 3b21033..15d1b51 100644 --- a/src/tim/vx/internal/src/kernel/cpu/random_multinomial_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/random_multinomial_cpu.c @@ -38,7 +38,7 @@ #include "utils/vsi_nn_dtype_util.h" #include "kernel/vsi_nn_kernel.h" #include "kernel/vsi_nn_kernel_eltwise.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" __BEGIN_DECLS diff --git a/src/tim/vx/internal/src/kernel/cpu/repeat_cpu.c b/src/tim/vx/internal/src/kernel/cpu/repeat_cpu.c new file mode 100644 index 0000000..ceb1684 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/repeat_cpu.c @@ -0,0 +1,286 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vsi_nn_vxkernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +#define _CPU_ARG_NUM (1) +#define _CPU_INPUT_NUM (2) +#define _CPU_OUTPUT_NUM (1) +#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) +#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.repeat") + +DEF_KERNEL_EXECUTOR(_repeat_exec) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; + float * buffer[_CPU_IO_NUM] = { NULL }; + size_t out_elements = 0; + vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; + int32_t i = 0, j = 0, b = 0, c = 0; + int32_t axis = 0; + int32_t outerSize = 1; + int32_t outIdx = 0; + int32_t width = 0, height = 0, channel = 0, batch = 0; + int32_t spatial = 0, vol = 0; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; + tensors[2] = (vsi_nn_kernel_tensor_t)param[2]; + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); + + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[2] ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &axis); + CHECK_STATUS_FAIL_GOTO(status, final ); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); + + buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create input0 buffer fail.", final ); + + buffer[2] = (float *)malloc( out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( buffer[2], "Create output buffer fail.", final ); + memset( buffer[2], 0, out_elements * sizeof(float) ); + + width = attr[0]->shape->data[0]; + height = attr[0]->shape->data[1]; + channel = attr[0]->shape->size > 2 ? attr[0]->shape->data[2] : 1; + batch = attr[0]->shape->size > 3 ? 
attr[0]->shape->data[3] : 1; + spatial = width * height; + vol = spatial * channel; + + for(i = 1; i < (int32_t)attr[0]->shape->size; i++) + { + outerSize *= attr[0]->shape->data[i]; + } + + if (axis == 0 && outerSize == 1) + { + for(i = 0; i < width; i++) + { + float data = buffer[0][i]; + int32_t len = (int32_t)buffer[1][i]; + for(j = 0; j < len; j++) + { + buffer[2][outIdx] = data; + } + } + } + else if (axis == 0) + { + for(b = 0; b < batch; b++) + { + for(c = 0; c < channel; c++) + { + for(i = 0; i < height; i++) + { + int32_t len = (int32_t)buffer[1][i]; + int32_t offset = i * width + c * spatial + b * vol; + for(j = 0; j < len; j++) + { + memcpy(buffer[2] + outIdx, buffer[0] + offset, sizeof(float) * width); + outIdx += width; + } + } + } + } + } + else if (axis == 1) + { + for(b = 0; b < batch; b++) + { + for(c = 0; c < channel; c++) + { + for(i = 0; i < height; i++) + { + int32_t offset = i * width + c * spatial + b * vol; + for(j = 0; j < width; j++) + { + int32_t len = (int32_t)buffer[1][j]; + float data = buffer[0][offset + j]; + int32_t k = 0; + for(k = 0; k < len; k++) + { + buffer[2][outIdx++] = data; + } + } + } + } + } + } + else if (axis == 2) + { + for(b = 0; b < batch; b++) + { + for(c = 0; c < channel; c++) + { + int32_t len = (int32_t)buffer[1][c]; + int32_t offset = c * spatial + b * vol; + + for(j = 0; j < len; j++) + { + memcpy(buffer[2] + outIdx, buffer[0] + offset, sizeof(float) * spatial); + outIdx += spatial; + } + } + } + } + else + { + VSILOGE("axis is not support"); + status = VSI_FAILURE; + goto final; + } + + status = vsi_nn_kernel_tensor_write_from_float( tensors[2], attr[2], + buffer[2], out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + for( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if( buffer[i] ) + { + free( buffer[i] ); + } + } + for( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if(attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } + } + return status; +} /* _repeat_exec() */ +/* + * Kernel params + */ +static vx_param_description_t _repeat_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _REPEAT_PARAM_NUM _cnt_of_array( _repeat_kernel_param_def ) + +static const vx_kernel_description_t _kernel_info = +{ + KERNEL_ID_PLACEHOLDER, + _KERNEL_NAME, + _repeat_exec, + _repeat_kernel_param_def, + _cnt_of_array( _repeat_kernel_param_def ), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel + ) +{ + memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) ); + return VSI_SUCCESS; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); + + status = _query_kernel( inputs, outputs, kernel ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); 
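As an aside, the axis == 1 branch of the repeat executor above reduces to a per-row expansion along the innermost dimension. The sketch below is illustrative only and not part of this patch; repeat_row_f32 is a hypothetical helper whose flat-array interface is assumed for the example, and the caller is expected to size the output to the sum of the repeat counts.

/* Illustrative sketch, not part of this patch: repeat each element of one row
 * according to its per-element count, as the axis == 1 branch does.
 * Returns the number of values written to 'out'. */
static int repeat_row_f32
    (
    const float * row,      /* one row of the value tensor, 'width' elements */
    const float * repeats,  /* per-element repeat counts, stored as float */
    int width,
    float * out             /* sized to the sum of the repeat counts */
    )
{
    int j, k, out_idx = 0;
    for ( j = 0; j < width; j++ )
    {
        int len = (int)repeats[j];
        for ( k = 0; k < len; k++ )
        {
            out[out_idx++] = row[j];
        }
    }
    return out_idx;
}

For example, row {1, 2, 3} with repeats {2, 0, 3} writes {1, 1, 3, 3, 3} and returns 5.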
+ if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( backend_params, _CPU_PARAM_NUM, + inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); + backend_params[3] = vsi_nn_kernel_scalar_create( graph, I32, &axis ); + + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); + CHECK_STATUS( status ); + vsi_nn_kernel_scalar_release( &backend_params[3] ); + } + else + { + status = VSI_FAILURE; + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( repeat, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/scatter_nd_cpu.c b/src/tim/vx/internal/src/kernel/cpu/scatter_nd_cpu.c index 1369867..62c7ff0 100644 --- a/src/tim/vx/internal/src/kernel/cpu/scatter_nd_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/scatter_nd_cpu.c @@ -35,7 +35,7 @@ #include "vsi_nn_error.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" __BEGIN_DECLS diff --git a/src/tim/vx/internal/src/kernel/cpu/sequence_mask_cpu.c b/src/tim/vx/internal/src/kernel/cpu/sequence_mask_cpu.c new file mode 100644 index 0000000..9790537 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/sequence_mask_cpu.c @@ -0,0 +1,248 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" +#include "libnnext/vsi_nn_vxkernel.h" + +__BEGIN_DECLS + +#define _CPU_ARG_NUM (1) +#define _CPU_INPUT_NUM (1) +#define _CPU_OUTPUT_NUM (1) +#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) +#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("sequence_mask_sw") + +DEF_KERNEL_EXECUTOR(_sequence_mask_exec) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VX_SUCCESS; + vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; + float * buffer_in = NULL; + float * buffer = NULL; + size_t out_elements = 0; + vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; + uint32_t i = 0; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] ); + + buffer_in = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer_in, "Create input0 buffer fail.", final ); + + buffer = (float *)malloc( out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( buffer, "Create output buffer fail.", final ); + memset( buffer, 0, out_elements * sizeof(float) ); + + { + uint32_t j = 0; + uint32_t height = attr[1]->shape->data[1]; + uint32_t width = attr[1]->shape->data[0]; + + for(j = 0; j < height; j++) + { + uint32_t idx_in = (uint32_t)buffer_in[j]; + uint32_t out_offset = j * width; + idx_in = idx_in > width ? 
width : idx_in; + for(i = 0; i < idx_in; i++) + { + buffer[out_offset + i] = 1; + } + } + } + + status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], + buffer, out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + if (buffer_in) + { + free( buffer_in ); + } + if (buffer) + { + free( buffer ); + } + for( i = 0; i < _CPU_IO_NUM; i ++ ) + { + vsi_nn_kernel_tensor_attr_release( &attr[i] ); + } + return status; +} /* _sequence_mask_exec() */ + +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + + +static const vx_kernel_description_t _kernel_info = +{ + KERNEL_ID_PLACEHOLDER, + _KERNEL_NAME, + _sequence_mask_exec, + kernel_param_def, + _cnt_of_array( kernel_param_def ), + vsi_nn_KernelValidator, + NULL, + NULL, + vsi_nn_KernelInitializer, + vsi_nn_KernelDeinitializer +}; + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel + ) +{ + memmove( &kernel->info, &_kernel_info, sizeof(vx_kernel_description_t) ); + return VSI_SUCCESS; +} /* _query_kernel() */ + +static int32_t _optimize_mask_shape + ( + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + int32_t max_len, + int32_t* opt_shape_in, + int32_t* opt_shape_out + ) +{ + vsi_status status = VSI_SUCCESS; + int32_t out_size = 1; + uint32_t i = 0; + opt_shape_in[0] = 1; + opt_shape_in[1] = 1; + for(i = 0; i < inputs[0]->attr.dim_num; i++) + { + opt_shape_in[0] *= inputs[0]->attr.size[i]; + } + + for(i = 0; i < outputs[0]->attr.dim_num; i++) + { + out_size *= outputs[0]->attr.size[i]; + } + + opt_shape_out[0] = max_len; + opt_shape_out[1] = out_size / max_len; + + if (out_size % max_len != 0) + { + return VSI_FAILURE; + } + + return status; +} + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_SUCCESS; + vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL; + int32_t new_shape[2][VSI_NN_MAX_DIM_NUM] = {{ 1, 1, 1, 1 }}; + int32_t max_len = vsi_nn_kernel_param_get_int32( params, "max_len" ); + + status = _optimize_mask_shape(inputs, outputs, max_len, new_shape[0], new_shape[1]); + if ( VSI_SUCCESS != status ) + { + goto final; + } + rs_input = vsi_nn_kernel_tensor_reshape(inputs[0]->t, new_shape[0], 2); + rs_output = vsi_nn_kernel_tensor_reshape(outputs[0]->t, new_shape[1], 2); + + status = _query_kernel( inputs, outputs, kernel ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + uint32_t index = 0; + /* Pass parameters to node. */ + backend_params[index++] = rs_input; + backend_params[index++] = rs_output; + backend_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &max_len ); + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &backend_params[2] ); + } + else + { + status = VSI_FAILURE; + } + } +final: + if (rs_input) + { + vsi_nn_kernel_tensor_release( &rs_input ); + } + if (rs_output) + { + vsi_nn_kernel_tensor_release( &rs_output ); + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( sequence_mask, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/slice_cpu.c b/src/tim/vx/internal/src/kernel/cpu/slice_cpu.c new file mode 100644 index 0000000..8307152 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/slice_cpu.c @@ -0,0 +1,246 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + + /* + * Define kernel meta. 
+ */ +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (1) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.slice") + + + /* + * Kernel params + */ + static vx_param_description_t _slice_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _SLICE_PARAM_NUM _cnt_of_array( _slice_kernel_param_def ) + + +/* +* Kernel function +*/ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; + size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + size_t out_elements[_OUTPUT_NUM] = {0}; + size_t out_bytes[_OUTPUT_NUM] = {0}; + int32_t rank = 0; + int32_t i = 0; + int32_t in_w = 0; + int32_t in_h = 0; + int32_t in_c = 0; + int32_t in_b = 0; + int32_t start[4] = {0}; + int32_t stop[4] = {0}; + int32_t in_size[4] = {1, 1, 1, 1}; + int32_t out_size[4] = {1, 1, 1, 1}; + float *input_ptr = NULL; + float *output_ptr = NULL; + int32_t dstIdx = 0; + + /* prepare data */ + for (i = 0; i < _INPUT_NUM; i ++) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); + } + + for (i = 0; i < _OUTPUT_NUM; i ++) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + out_bytes[i] = out_elements[i] * sizeof(float); + f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_bytes[i] ); + } + + rank = (int32_t)out_attr[0]->shape->size; + + for (i = 0; i < rank; i++) + { + in_size[i] = in_attr[0]->shape->data[i]; + out_size[i] = out_attr[0]->shape->data[i]; + } + + start[0] = (int32_t)f32_in_buffer[1][0]; + stop[0] = start[0] + out_attr[0]->shape->data[0]; + start[1] = rank < 2 ? 0 : (int32_t)f32_in_buffer[1][1]; + stop[1] = rank < 2 ? 1 : start[1] + out_size[1]; + start[2] = rank < 3 ? 0 : (int32_t)f32_in_buffer[1][2]; + stop[2] = rank < 3 ? 1 : start[2] + out_size[2]; + start[3] = rank < 4 ? 0 : (int32_t)f32_in_buffer[1][3]; + stop[3] = rank < 4 ? 
1 : start[3] + out_size[3]; + input_ptr = f32_in_buffer[0]; + output_ptr = f32_out_buffer[0]; + + for (in_b = start[3]; in_b < stop[3]; ++in_b) + { + for (in_c = start[2]; in_c < stop[2]; ++in_c) + { + for (in_h = start[1]; in_h < stop[1]; ++in_h) + { + for (in_w = start[0]; in_w < stop[0]; ++in_w) + { + int32_t srcIdx = ((in_b * in_size[2] + in_c) * in_size[1] + in_h) * in_size[0] + in_w; + output_ptr[dstIdx ++] = input_ptr[srcIdx]; + } + } + } + } + + /* save data */ + for(i = 0; i < _OUTPUT_NUM; i++) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + for (i = 0; i < _INPUT_NUM; i++) + { + if (f32_in_buffer[i]) + { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + if (in_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + for (i = 0; i < _OUTPUT_NUM; i++) + { + if (f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + + return status; +} /* _compute() */ + + +/* +* Query kernel +*/ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _slice_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _slice_kernel_param_def ); + status = VSI_SUCCESS; + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_SLICE_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + + status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _SLICE_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _SLICE_PARAM_NUM ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( slice, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/space2depth_internal_cpu.c b/src/tim/vx/internal/src/kernel/cpu/space2depth_internal_cpu.c index 4df8a52..a9170c7 100644 --- a/src/tim/vx/internal/src/kernel/cpu/space2depth_internal_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/space2depth_internal_cpu.c @@ -35,7 +35,7 @@ #include "vsi_nn_error.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" __BEGIN_DECLS diff --git a/src/tim/vx/internal/src/kernel/cpu/tile_cpu.c b/src/tim/vx/internal/src/kernel/cpu/tile_cpu.c index 63c2f4c..90729c7 100644 --- a/src/tim/vx/internal/src/kernel/cpu/tile_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/tile_cpu.c @@ -32,7 +32,7 @@ #include "vsi_nn_prv.h" #include "vsi_nn_error.h" #include "kernel/vsi_nn_kernel.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" __BEGIN_DECLS diff --git a/src/tim/vx/internal/src/kernel/cpu/topk_cpu.c b/src/tim/vx/internal/src/kernel/cpu/topk_cpu.c new file mode 100644 index 0000000..a2062c8 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/topk_cpu.c @@ -0,0 +1,297 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (2) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.topk") + + +/* + * Kernel params + */ +static vx_param_description_t _topk_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + // Add kererl parameters here +}; +#define _TOPK_PARAM_NUM _cnt_of_array( _topk_kernel_param_def ) + +static uint32_t _max_comp_func(void* data, int32_t left, int32_t right) +{ + float* fdata = (float*)data; + if (fdata[left] >= fdata[right]) + { + return TRUE; + } + else + { + return FALSE; + } +} + +static void _find_top_k_1d +( + float* input, + uint32_t input_len, + uint32_t k, + float* value, + uint32_t* indices +) +{ + int32_t low = 0; + int32_t high = input_len - 1; + int32_t j; + + for (j = 0; j < (int32_t)input_len; j++) + { + indices[j] = j; + } + + j = vsi_nn_partition(input, low, high, _max_comp_func, FALSE, indices); + + //part_sort + while (j != (int32_t)k) + { + if ((int32_t)k > j) + { + low = j + 1; + } + else + { + high = j; + } + j = vsi_nn_partition(input, low, high, _max_comp_func, FALSE, indices); + } + //all_sort + vsi_nn_partition(input, 0, k - 1, _max_comp_func, TRUE, indices); + + for (j = 0; j < (int32_t)k; j++) + { + value[j] = input[indices[j]]; + } +} + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; + size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + size_t out_elements[_OUTPUT_NUM] = {0}; + size_t out_bytes[_OUTPUT_NUM] = {0}; + uint32_t i = 0; + int32_t j = 0; + int32_t top_k = 0; + uint32_t block_num = 0; + uint32_t block_size = 0; + uint32_t * indices_ptr = NULL; + + /* prepare data */ + for (i = 0; i < _INPUT_NUM; i ++) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); + } + + for (i = 0; i < _OUTPUT_NUM; i ++) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + out_bytes[i] = out_elements[i] * sizeof(float); + f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_bytes[i] ); + } + + status = vsi_nn_kernel_scalar_read_int32( param[3], &top_k ); + CHECK_STATUS_FAIL_GOTO(status, final ); + + block_num = in_attr[0]->shape->data[1]; + block_size = in_attr[0]->shape->data[0]; + indices_ptr = (uint32_t*)malloc(block_size * sizeof(uint32_t)); + CHECK_PTR_FAIL_GOTO( indices_ptr, "Create indices buffer fail.", final ); + + for(i = 0; i < block_num; i++) + { + uint32_t in_index = i * block_size; + 
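The per-block selection above relies on vsi_nn_partition to quickselect over an index array. The sketch below is illustrative only and not part of this patch: it uses a plain Lomuto partition and the conventional high = j - 1 narrowing rather than the kernel's vsi_nn_partition interface, and it assumes 0 < k < len. The idea is the same: permute indices until the pivot lands at position k, so indices[0..k-1] reference the k largest values, then order that prefix descending.

#include <stdint.h>

/* Illustrative sketch, not part of this patch: descending Lomuto partition
 * over an index array; 'data' itself is never moved. */
static int32_t partition_desc
    ( const float * data, uint32_t * idx, int32_t low, int32_t high )
{
    float pivot = data[idx[high]];
    int32_t i = low - 1;
    int32_t j;
    for ( j = low; j < high; j++ )
    {
        if ( data[idx[j]] >= pivot )
        {
            uint32_t t = idx[++i]; idx[i] = idx[j]; idx[j] = t;
        }
    }
    {
        uint32_t t = idx[i + 1]; idx[i + 1] = idx[high]; idx[high] = t;
    }
    return i + 1;
}

/* Illustrative sketch, not part of this patch: select the k largest values of
 * data[0..len-1] into value[0..k-1] (descending) and their positions into
 * indices[0..k-1]. Assumes 0 < k < len. */
static void top_k_1d_sketch
    ( const float * data, uint32_t len, uint32_t k,
      float * value, uint32_t * indices )
{
    int32_t low = 0;
    int32_t high = (int32_t)len - 1;
    int32_t j, m;
    if ( k == 0 || k >= len )
    {
        return; /* degenerate sizes are out of scope for this sketch */
    }
    for ( j = 0; j < (int32_t)len; j++ )
    {
        indices[j] = (uint32_t)j;
    }
    /* quickselect: stop once the pivot settles at position k */
    j = partition_desc( data, indices, low, high );
    while ( j != (int32_t)k )
    {
        if ( j < (int32_t)k )
        {
            low = j + 1;
        }
        else
        {
            high = j - 1;
        }
        j = partition_desc( data, indices, low, high );
    }
    /* order the selected prefix descending (insertion sort) */
    for ( j = 1; j < (int32_t)k; j++ )
    {
        uint32_t cur = indices[j];
        for ( m = j - 1; m >= 0 && data[indices[m]] < data[cur]; m-- )
        {
            indices[m + 1] = indices[m];
        }
        indices[m + 1] = cur;
    }
    for ( j = 0; j < (int32_t)k; j++ )
    {
        value[j] = data[indices[j]];
    }
}

For instance, data {0.1, 0.9, 0.5, 0.7} with k = 2 yields value {0.9, 0.7} and indices {1, 3}.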
uint32_t out_index = i * top_k; + _find_top_k_1d(&(f32_in_buffer[0][in_index]), + block_size, top_k, &(f32_out_buffer[0][out_index]), indices_ptr); + + for (j = 0; j < top_k; j++) + { + f32_out_buffer[1][out_index + j] = (float)indices_ptr[j]; + } + } + // Handle the 1D input + if (!block_num) + { + _find_top_k_1d(&(f32_in_buffer[0][0]), + block_size, top_k, &(f32_out_buffer[0][0]), indices_ptr); + for (j = 0; j < top_k; j++) + { + f32_out_buffer[1][j] = (float)indices_ptr[j]; + } + } + + /* save data */ + for(i = 0; i < _OUTPUT_NUM; i++) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + vsi_nn_safe_free(indices_ptr); + for (i = 0; i < _INPUT_NUM; i++) + { + if (f32_in_buffer[i]) + { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + if (in_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + for (i = 0; i < _OUTPUT_NUM; i++) + { + if (f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _topk_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _topk_kernel_param_def ); + status = VSI_SUCCESS; + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_TOPK_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + int32_t top_k = vsi_nn_kernel_param_get_int32(params, "top_k"); + + status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _TOPK_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[3] = vsi_nn_kernel_scalar_create( graph, I32, &top_k ); + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _TOPK_PARAM_NUM ); + + vsi_nn_kernel_scalar_release( &node_params[3] ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( topk, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/batchnorm_single_evis.c b/src/tim/vx/internal/src/kernel/evis/batchnorm_single_evis.c index ec0213c..13b2a6a 100644 --- a/src/tim/vx/internal/src/kernel/evis/batchnorm_single_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/batchnorm_single_evis.c @@ -44,23 +44,26 @@ typedef enum _internal_img_dim_e IMAGE_2D, } internal_img_dim_e; -#define _BATCH_NORM_KERNEL_SOURCE "batchnorm_single" +#define SOURCE0 "batchnorm_single" +#define SOURCE1 "batchnorm_single_f32" #define STR(a) #a // Add kernel hashtable here -#define BATCH_NORM_HASH_KEY(IN_DTYPE, OUT_DTYPE, BRDCST, _image_2d) \ - ( ( IN_DTYPE << 16 ) | ( OUT_DTYPE << 3) | ( BRDCST << 1) | (_image_2d) ) +#define BATCH_NORM_HASH_KEY(IN_DTYPE, GAMMA_TYPE, OUT_DTYPE, BRDCST, _image_2d) \ + ( ( IN_DTYPE << 24 ) | ( GAMMA_TYPE << 16 ) | ( OUT_DTYPE << 3) | ( BRDCST << 1) | (_image_2d) ) -#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, BRDCST) \ - { BATCH_NORM_HASH_KEY( IN_DTYPE, OUT_DTYPE, BRDCST, IMAGE), \ - CVIVANTE_NAMESPACE("evis.batch_norm_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_brdcst"#BRDCST), \ - _BATCH_NORM_KERNEL_SOURCE} +#define PACK_KERNEL_MAP( IN_DTYPE, GAMMA_TYPE, OUT_DTYPE, BRDCST, source) \ + { BATCH_NORM_HASH_KEY( IN_DTYPE, GAMMA_TYPE, OUT_DTYPE, BRDCST, IMAGE), \ + CVIVANTE_NAMESPACE("evis.batch_norm_"STR(IN_DTYPE)"_F16_F16_" \ + STR(GAMMA_TYPE)"_F32to"STR(OUT_DTYPE)"_brdcst"#BRDCST), \ + source} -#define PACK_KERNEL_MAP_2D( IN_DTYPE, OUT_DTYPE, BRDCST) \ - { BATCH_NORM_HASH_KEY( IN_DTYPE, OUT_DTYPE, BRDCST, IMAGE_2D), \ - CVIVANTE_NAMESPACE("evis.batch_norm_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_brdcst"#BRDCST"_2D"), \ - _BATCH_NORM_KERNEL_SOURCE} +#define PACK_KERNEL_MAP_2D( IN_DTYPE, GAMMA_TYPE, OUT_DTYPE, BRDCST, source) \ + { BATCH_NORM_HASH_KEY( IN_DTYPE, GAMMA_TYPE, OUT_DTYPE, BRDCST, IMAGE_2D), \ + CVIVANTE_NAMESPACE("evis.batch_norm_"STR(IN_DTYPE)"_F16_F16_" \ + STR(GAMMA_TYPE)"_F32to"STR(OUT_DTYPE)"_brdcst"#BRDCST"_2D"), \ + source} typedef struct { @@ -71,47 +74,89 @@ typedef struct static const _kernel_map_type _batch_norm_kernel_map[] = { - PACK_KERNEL_MAP(F16, F16, 0), - PACK_KERNEL_MAP(F16, I16, 0), - PACK_KERNEL_MAP(F16, U8, 0), - PACK_KERNEL_MAP(F16, I8, 0), - PACK_KERNEL_MAP(U8, U8, 0), - PACK_KERNEL_MAP(U8, F16, 0), - PACK_KERNEL_MAP(I8, I8, 0), - PACK_KERNEL_MAP(I8, F16, 0), - PACK_KERNEL_MAP(I16, I16, 0), - PACK_KERNEL_MAP(I16, F16, 0), - PACK_KERNEL_MAP(F16, F16, 1), - PACK_KERNEL_MAP(F16, I16, 1), - PACK_KERNEL_MAP(F16, U8, 1), - PACK_KERNEL_MAP(F16, I8, 1), - PACK_KERNEL_MAP(U8, U8, 1), - PACK_KERNEL_MAP(U8, F16, 1), - PACK_KERNEL_MAP(I8, I8, 1), - PACK_KERNEL_MAP(I8, F16, 1), - PACK_KERNEL_MAP(I16, I16, 1), - PACK_KERNEL_MAP(I16, F16, 1), + PACK_KERNEL_MAP(F16, F16, F16, 0, SOURCE0), + PACK_KERNEL_MAP(F16, F16, I16, 0, SOURCE0), + PACK_KERNEL_MAP(F16, F16, U8, 0, SOURCE0), + PACK_KERNEL_MAP(F16, F16, I8, 0, SOURCE0), + PACK_KERNEL_MAP(U8, F16, U8, 0, SOURCE0), + PACK_KERNEL_MAP(U8, F16, F16, 0, SOURCE0), + PACK_KERNEL_MAP(I8, F16, I8, 0, SOURCE0), + PACK_KERNEL_MAP(I8, F16, F16, 0, SOURCE0), + PACK_KERNEL_MAP(I16, F16, I16, 0, SOURCE0), + PACK_KERNEL_MAP(I16, F16, F16, 0, SOURCE0), + PACK_KERNEL_MAP(F16, F16, F16, 1, SOURCE0), + PACK_KERNEL_MAP(F16, F16, I16, 1, SOURCE0), + PACK_KERNEL_MAP(F16, F16, U8, 1, SOURCE0), + PACK_KERNEL_MAP(F16, 
F16, I8, 1, SOURCE0), + PACK_KERNEL_MAP(U8, F16, U8, 1, SOURCE0), + PACK_KERNEL_MAP(U8, F16, F16, 1, SOURCE0), + PACK_KERNEL_MAP(I8, F16, I8, 1, SOURCE0), + PACK_KERNEL_MAP(I8, F16, F16, 1, SOURCE0), + PACK_KERNEL_MAP(I16, F16, I16, 1, SOURCE0), + PACK_KERNEL_MAP(I16, F16, F16, 1, SOURCE0), - PACK_KERNEL_MAP_2D(F16, F16, 0), - PACK_KERNEL_MAP_2D(F16, I16, 0), - PACK_KERNEL_MAP_2D(F16, U8 , 0), - PACK_KERNEL_MAP_2D(F16, I8 , 0), - PACK_KERNEL_MAP_2D(U8, U8 , 0), - PACK_KERNEL_MAP_2D(U8, F16, 0), - PACK_KERNEL_MAP_2D(I8, I8, 0), - PACK_KERNEL_MAP_2D(I8, F16, 0), - PACK_KERNEL_MAP_2D(I16, I16, 0), - PACK_KERNEL_MAP_2D(I16, F16, 0), - PACK_KERNEL_MAP_2D(F16, F16, 1), - PACK_KERNEL_MAP_2D(F16, I16, 1), - PACK_KERNEL_MAP_2D(F16, U8 , 1), - PACK_KERNEL_MAP_2D(F16, I8 , 1), - PACK_KERNEL_MAP_2D(U8, U8 , 1), - PACK_KERNEL_MAP_2D(U8, F16, 1), - PACK_KERNEL_MAP_2D(I8, I8, 1), - PACK_KERNEL_MAP_2D(I8, F16, 1), - PACK_KERNEL_MAP_2D(I16, I16, 1), - PACK_KERNEL_MAP_2D(I16, F16, 1), + PACK_KERNEL_MAP(F16, F32, F16, 0, SOURCE1), + PACK_KERNEL_MAP(F16, F32, I16, 0, SOURCE1), + PACK_KERNEL_MAP(F16, F32, U8, 0, SOURCE1), + PACK_KERNEL_MAP(F16, F32, I8, 0, SOURCE1), + PACK_KERNEL_MAP(U8, F32, U8, 0, SOURCE1), + PACK_KERNEL_MAP(U8, F32, F16, 0, SOURCE1), + PACK_KERNEL_MAP(I8, F32, I8, 0, SOURCE1), + PACK_KERNEL_MAP(I8, F32, F16, 0, SOURCE1), + PACK_KERNEL_MAP(I16, F32, I16, 0, SOURCE1), + PACK_KERNEL_MAP(I16, F32, F16, 0, SOURCE1), + PACK_KERNEL_MAP(F16, F32, F16, 1, SOURCE1), + PACK_KERNEL_MAP(F16, F32, I16, 1, SOURCE1), + PACK_KERNEL_MAP(F16, F32, U8, 1, SOURCE1), + PACK_KERNEL_MAP(F16, F32, I8, 1, SOURCE1), + PACK_KERNEL_MAP(U8, F32, U8, 1, SOURCE1), + PACK_KERNEL_MAP(U8, F32, F16, 1, SOURCE1), + PACK_KERNEL_MAP(I8, F32, I8, 1, SOURCE1), + PACK_KERNEL_MAP(I8, F32, F16, 1, SOURCE1), + PACK_KERNEL_MAP(I16, F32, I16, 1, SOURCE1), + PACK_KERNEL_MAP(I16, F32, F16, 1, SOURCE1), + + PACK_KERNEL_MAP_2D(F16, F16, F16, 0, SOURCE0), + PACK_KERNEL_MAP_2D(F16, F16, I16, 0, SOURCE0), + PACK_KERNEL_MAP_2D(F16, F16, U8, 0, SOURCE0), + PACK_KERNEL_MAP_2D(F16, F16, I8, 0, SOURCE0), + PACK_KERNEL_MAP_2D(U8, F16, U8, 0, SOURCE0), + PACK_KERNEL_MAP_2D(U8, F16, F16, 0, SOURCE0), + PACK_KERNEL_MAP_2D(I8, F16, I8, 0, SOURCE0), + PACK_KERNEL_MAP_2D(I8, F16, F16, 0, SOURCE0), + PACK_KERNEL_MAP_2D(I16, F16, I16, 0, SOURCE0), + PACK_KERNEL_MAP_2D(I16, F16, F16, 0, SOURCE0), + PACK_KERNEL_MAP_2D(F16, F16, F16, 1, SOURCE0), + PACK_KERNEL_MAP_2D(F16, F16, I16, 1, SOURCE0), + PACK_KERNEL_MAP_2D(F16, F16, U8, 1, SOURCE0), + PACK_KERNEL_MAP_2D(F16, F16, I8, 1, SOURCE0), + PACK_KERNEL_MAP_2D(U8, F16, U8, 1, SOURCE0), + PACK_KERNEL_MAP_2D(U8, F16, F16, 1, SOURCE0), + PACK_KERNEL_MAP_2D(I8, F16, I8, 1, SOURCE0), + PACK_KERNEL_MAP_2D(I8, F16, F16, 1, SOURCE0), + PACK_KERNEL_MAP_2D(I16, F16, I16, 1, SOURCE0), + PACK_KERNEL_MAP_2D(I16, F16, F16, 1, SOURCE0), + + PACK_KERNEL_MAP_2D(F16, F32, F16, 0, SOURCE1), + PACK_KERNEL_MAP_2D(F16, F32, I16, 0, SOURCE1), + PACK_KERNEL_MAP_2D(F16, F32, U8, 0, SOURCE1), + PACK_KERNEL_MAP_2D(F16, F32, I8, 0, SOURCE1), + PACK_KERNEL_MAP_2D(U8, F32, U8, 0, SOURCE1), + PACK_KERNEL_MAP_2D(U8, F32, F16, 0, SOURCE1), + PACK_KERNEL_MAP_2D(I8, F32, I8, 0, SOURCE1), + PACK_KERNEL_MAP_2D(I8, F32, F16, 0, SOURCE1), + PACK_KERNEL_MAP_2D(I16, F32, I16, 0, SOURCE1), + PACK_KERNEL_MAP_2D(I16, F32, F16, 0, SOURCE1), + PACK_KERNEL_MAP_2D(F16, F32, F16, 1, SOURCE1), + PACK_KERNEL_MAP_2D(F16, F32, I16, 1, SOURCE1), + PACK_KERNEL_MAP_2D(F16, F32, U8, 1, SOURCE1), + PACK_KERNEL_MAP_2D(F16, F32, I8, 1, SOURCE1), + PACK_KERNEL_MAP_2D(U8, 
F32, U8, 1, SOURCE1), + PACK_KERNEL_MAP_2D(U8, F32, F16, 1, SOURCE1), + PACK_KERNEL_MAP_2D(I8, F32, I8, 1, SOURCE1), + PACK_KERNEL_MAP_2D(I8, F32, F16, 1, SOURCE1), + PACK_KERNEL_MAP_2D(I16, F32, I16, 1, SOURCE1), + PACK_KERNEL_MAP_2D(I16, F32, F16, 1, SOURCE1), }; /* @@ -329,6 +374,7 @@ static vsi_status _query_kernel { vsi_status status = VSI_FAILURE; vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e gamma_dtype; vsi_nn_kernel_dtype_e out_dtype; const _kernel_map_type * kernel_map = _batch_norm_kernel_map; size_t kernel_map_size = _cnt_of_array( _batch_norm_kernel_map ); @@ -340,6 +386,7 @@ static vsi_status _query_kernel uint32_t brdcst = 0; in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + gamma_dtype = vsi_nn_kernel_map_dtype( inputs[3]->attr.dtype.vx_type ); out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); if (inputs[BATCHNORM_INPUT]->attr.size[0] != 1 && inputs[BATCHNORM_INPUT_BETA]->attr.size[0] == 1) @@ -347,7 +394,7 @@ static vsi_status _query_kernel brdcst = 1; } - key = BATCH_NORM_HASH_KEY(in_dtype, out_dtype, brdcst, image_2d); + key = BATCH_NORM_HASH_KEY(in_dtype, gamma_dtype, out_dtype, brdcst, image_2d); for( i = 0; i < kernel_map_size; i ++ ) { @@ -397,7 +444,6 @@ static vsi_nn_kernel_node_t _setup if ( (inputs[1]->attr.is_const && inputs[2]->attr.is_const) || (inputs[1]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT16) || (inputs[2]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT16) - || (inputs[3]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT16) || (inputs[4]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT32) ) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/evis/cast_evis.c b/src/tim/vx/internal/src/kernel/evis/cast_evis.c index 2d25883..4f201e9 100644 --- a/src/tim/vx/internal/src/kernel/evis/cast_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/cast_evis.c @@ -241,6 +241,7 @@ static vsi_status _query_kernel uint32_t i; in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in_dtype = in_dtype == BOOL8 ? I8 : in_dtype; out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); key = CAST_HASH_KEY( in_dtype, out_dtype, image_2d ); diff --git a/src/tim/vx/internal/src/kernel/evis/comparisons_evis.c b/src/tim/vx/internal/src/kernel/evis/comparisons_evis.c index fd32271..3c1ac2f 100644 --- a/src/tim/vx/internal/src/kernel/evis/comparisons_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/comparisons_evis.c @@ -455,6 +455,7 @@ static vsi_status _query_kernel input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); input1_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + output_dtype = output_dtype == I8 ? 
BOOL8 : output_dtype; key = HASH_COMPARISONS_KEY( operation, input0_dtype, input1_dtype, output_dtype, image_2d ); for( i = 0; i < _cnt_of_array(_comparisons_evis_kernel_map); i ++ ) diff --git a/src/tim/vx/internal/src/kernel/evis/conv1d_ovxlib_evis.c b/src/tim/vx/internal/src/kernel/evis/conv1d_ovxlib_evis.c new file mode 100644 index 0000000..923328e --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/conv1d_ovxlib_evis.c @@ -0,0 +1,702 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +typedef enum +{ + NORMAL = 0, + K3_S1, + K3_S1_D2_D4, + K1024_SMALL, + K1024_LARGE, +} _internal_kernel_e; + +#define _CONV1D_OVXLIB_KERNEL_SOURCE "conv1d_ovxlib" +#define _CONV1D_OVXLIB_KERNEL_SOURCE_K1024 "conv1d_ovxlib_k1024" + +#define STR(a) #a +// Add kernel hashtable here +#define CONV1D_OVXLIB_HASH_KEY( IN_DTYPE, W_DTYPE, B_DTYPE, OUT_DTYPE, KERNEL_TYPE ) \ + (( KERNEL_TYPE << 24 ) | ( IN_DTYPE << 18 ) | ( W_DTYPE << 12 ) | ( B_DTYPE << 6 ) | ( OUT_DTYPE )) +#define PACK_KERNEL_MAP( IN_DTYPE, W_DTYPE, B_DTYPE, OUT_DTYPE, KERNEL_TYPE, SOURCE ) \ + { CONV1D_OVXLIB_HASH_KEY( IN_DTYPE, W_DTYPE, B_DTYPE, OUT_DTYPE, KERNEL_TYPE ), \ + CVIVANTE_NAMESPACE(\ + "evis.conv1d_"STR(IN_DTYPE)STR(W_DTYPE)STR(B_DTYPE)"to"STR(OUT_DTYPE)"_"STR(KERNEL_TYPE)), \ + SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _conv1d_ovxlib_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( U8, U8, I32, U8, K3_S1, _CONV1D_OVXLIB_KERNEL_SOURCE ), + PACK_KERNEL_MAP( U8, U8, I32, U8, K3_S1_D2_D4, _CONV1D_OVXLIB_KERNEL_SOURCE ), + PACK_KERNEL_MAP( U8, U8, I32, U8, K1024_SMALL, _CONV1D_OVXLIB_KERNEL_SOURCE_K1024 ), + PACK_KERNEL_MAP( U8, U8, I32, U8, K1024_LARGE, _CONV1D_OVXLIB_KERNEL_SOURCE_K1024 ), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _conv1d_ovxlib_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _CONV1D_OVXLIB_PARAM_NUM _cnt_of_array( _conv1d_ovxlib_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_conv1d_ovxlib_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * input_attr = NULL; + vsi_nn_kernel_tensor_attr_t * weights_attr = NULL; + vsi_nn_kernel_tensor_attr_t * output_attr = NULL; + vsi_int_array_t * in_shape = NULL; + vsi_int_array_t * out_shape = NULL; + vsi_int_array_t * weight_shape = NULL; + float scaleIn = 1.0f; + float scaleOut = 1.0f; + float scaleWights = 1.0f; + int32_t input_ZP = 0; + int32_t weight_ZP = 0; + float output_ZP = 0; + int32_t stride = 1; + int32_t dilation = 0; + int32_t input_height = 0; + int32_t input_width = 0; + int32_t output_width = 0; + + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); + + weights_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( weights_attr, "Create tensor attr buffer fail.", final ); + + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &(stride)); + vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], 
&(dilation)); + + in_shape = input_attr->shape; + out_shape = output_attr->shape; + weight_shape = weights_attr->shape; + + if ( VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant ) + { + input_ZP = input_attr->asymm.zero_point; + scaleIn = input_attr->asymm.scale; + } + + if ( VSI_NN_KERNEL_QUANT_ASYMM == weights_attr->quant ) + { + weight_ZP = weights_attr->asymm.zero_point; + scaleWights = weights_attr->asymm.scale; + } + + if ( VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant ) + { + output_ZP = (float)output_attr->asymm.zero_point; + scaleOut = output_attr->asymm.scale; + } + + scaleOut = (scaleIn * scaleWights) / scaleOut; + input_height = in_shape->data[1]; + input_width = in_shape->data[0]; + output_width = out_shape->data[0]; + + if ((U8 == input_attr->dtype) && (U8 == weights_attr->dtype) && (U8 == output_attr->dtype)) + { + gpu_dp_inst_t uniSumOrderUchar_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x0c080400, 0x0c080400, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + if ( (3 == weight_shape->data[0]) && (1 == stride) ) + { + gpu_dp_inst_t uniConv1DK3_Lo0_4x4 = {{ + 0x69696969, // TCfg + 0x44444444, // ASelt + 0x41014000, 0x43034202, // ABin + 0x55555555, // BSelt + 0x55405540, 0x55405540, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConv1DK3_Lo1_4x4 = {{ + 0x69696969, // TCfg + 0x44444444, // ASelt + 0x41114010, 0x43134212, // ABin + 0x55555555, // BSelt + 0x55415541, 0x55415541, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConv1DK3_Lo2_4x4 = {{ + 0x69696969, // TCfg + 0x44444444, // ASelt + 0x41214020, 0x43234222, // ABin + 0x55555555, // BSelt + 0x55425542, 0x55425542, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConv1DK3_Hi0_4x4 = {{ + 0x69696969, // TCfg + 0x44444444, // ASelt + 0x45054404, 0x47074606, // ABin + 0x55555555, // BSelt + 0x55405540, 0x55405540, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConv1DK3_Hi1_4x4 = {{ + 0x69696969, // TCfg + 0x44444444, // ASelt + 0x45154414, 0x47174616, // ABin + 0x55555555, // BSelt + 0x55415541, 0x55415541, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConv1DK3_Hi2_4x4 = {{ + 0x69696969, // TCfg + 0x44444444, // ASelt + 0x45254424, 0x47274626, // ABin + 0x55555555, // BSelt + 0x55425542, 0x55425542, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniDataConvK3_2x8 = {{ + 0x00111111, // TCfg + 0x00110000, // ASelt 
+ 0x03020100, 0x00000504, // ABin + 0x00222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + uint32_t conv1dK3D2_Lo1[4] = {0x43134212, 0x45154414, 0x55415541, 0x55415541}; + uint32_t conv1dK3D2_Lo2[4] = {0x45254424, 0x47274626, 0x55425542, 0x55425542}; + uint32_t conv1dK3D2_Hi1[4] = {0x47174616, 0x49194818, 0x55415541, 0x55415541}; + uint32_t conv1dK3D2_Hi2[4] = {0x49294828, 0x4b2b4a2a, 0x55425542, 0x55425542}; + uint32_t conv1dK3D4_Lo1[4] = {0x45154414, 0x47174616, 0x55415541, 0x55415541}; + uint32_t conv1dK3D4_Lo2[4] = {0x49294828, 0x4b2b4a2a, 0x55425542, 0x55425542}; + uint32_t conv1dK3D4_Hi1[4] = {0x49194818, 0x4b1b4a1a, 0x55415541, 0x55415541}; + uint32_t conv1dK3D4_Hi2[4] = {0x4d2d4c2c, 0x4f2f4e2e, 0x55425542, 0x55425542}; + + if (2 == dilation) + { + uniConv1DK3_Lo1_4x4.data[2] = conv1dK3D2_Lo1[0]; + uniConv1DK3_Lo1_4x4.data[3] = conv1dK3D2_Lo1[1]; + uniConv1DK3_Lo1_4x4.data[5] = conv1dK3D2_Lo1[2]; + uniConv1DK3_Lo1_4x4.data[6] = conv1dK3D2_Lo1[3]; + uniConv1DK3_Lo2_4x4.data[2] = conv1dK3D2_Lo2[0]; + uniConv1DK3_Lo2_4x4.data[3] = conv1dK3D2_Lo2[1]; + uniConv1DK3_Lo2_4x4.data[5] = conv1dK3D2_Lo2[2]; + uniConv1DK3_Lo2_4x4.data[6] = conv1dK3D2_Lo2[3]; + uniConv1DK3_Hi1_4x4.data[2] = conv1dK3D2_Hi1[0]; + uniConv1DK3_Hi1_4x4.data[3] = conv1dK3D2_Hi1[1]; + uniConv1DK3_Hi1_4x4.data[5] = conv1dK3D2_Hi1[2]; + uniConv1DK3_Hi1_4x4.data[6] = conv1dK3D2_Hi1[3]; + uniConv1DK3_Hi2_4x4.data[2] = conv1dK3D2_Hi2[0]; + uniConv1DK3_Hi2_4x4.data[3] = conv1dK3D2_Hi2[1]; + uniConv1DK3_Hi2_4x4.data[5] = conv1dK3D2_Hi2[2]; + uniConv1DK3_Hi2_4x4.data[6] = conv1dK3D2_Hi2[3]; + } + else if (4 == dilation) + { + uniConv1DK3_Lo1_4x4.data[2] = conv1dK3D4_Lo1[0]; + uniConv1DK3_Lo1_4x4.data[3] = conv1dK3D4_Lo1[1]; + uniConv1DK3_Lo1_4x4.data[5] = conv1dK3D4_Lo1[2]; + uniConv1DK3_Lo1_4x4.data[6] = conv1dK3D4_Lo1[3]; + uniConv1DK3_Lo2_4x4.data[2] = conv1dK3D4_Lo2[0]; + uniConv1DK3_Lo2_4x4.data[3] = conv1dK3D4_Lo2[1]; + uniConv1DK3_Lo2_4x4.data[5] = conv1dK3D4_Lo2[2]; + uniConv1DK3_Lo2_4x4.data[6] = conv1dK3D4_Lo2[3]; + uniConv1DK3_Hi1_4x4.data[2] = conv1dK3D4_Hi1[0]; + uniConv1DK3_Hi1_4x4.data[3] = conv1dK3D4_Hi1[1]; + uniConv1DK3_Hi1_4x4.data[5] = conv1dK3D4_Hi1[2]; + uniConv1DK3_Hi1_4x4.data[6] = conv1dK3D4_Hi1[3]; + uniConv1DK3_Hi2_4x4.data[2] = conv1dK3D4_Hi2[0]; + uniConv1DK3_Hi2_4x4.data[3] = conv1dK3D4_Hi2[1]; + uniConv1DK3_Hi2_4x4.data[5] = conv1dK3D4_Hi2[2]; + uniConv1DK3_Hi2_4x4.data[6] = conv1dK3D4_Hi2[3]; + } + + + status = vsi_nn_kernel_gpu_add_param( node, + "uniConv1DK3_Lo0_4x4", &uniConv1DK3_Lo0_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConv1DK3_Hi0_4x4", &uniConv1DK3_Hi0_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConv1DK3_Lo1_4x4", &uniConv1DK3_Lo1_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConv1DK3_Lo2_4x4", &uniConv1DK3_Lo2_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConv1DK3_Hi1_4x4", &uniConv1DK3_Hi1_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConv1DK3_Hi2_4x4", &uniConv1DK3_Hi2_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniDataConvK3_2x8", &uniDataConvK3_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniSumOrderUchar_2x8", &uniSumOrderUchar_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, "input_ZP", &input_ZP); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if ( (1024 == 
weight_shape->data[0]) && (1 == stride) ) + { + gpu_dp_inst_t uniU8SubZp_lo_2x8= {{ + 0x99999999, // TCfg + 0x44444444, // ASelt + 0x03020100, 0x07060504, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, + 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniU8SubZp_hi_2x8= {{ + 0x99999999, // TCfg + 0x44444444, // ASelt + 0x0b0a0908, 0x0f0e0d0c, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, + 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniU8Conv1d_part0_8x2= {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0x87654321, // ABin + 0x55555555, // BSelt + 0x76543210, 0x76543210, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniU8Conv1d_part1_8x2= {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x98765432, 0xa9876543, // ABin + 0x55555555, // BSelt + 0x76543210, 0x76543210, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniU8Conv1d_part2_8x2= {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0xba987654, 0xcba98765, // ABin + 0x55555555, // BSelt + 0x76543210, 0x76543210, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniU8Conv1d_part3_8x2= {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0xdcba9876, 0xedcba987, // ABin + 0x55555555, // BSelt + 0x76543210, 0x76543210, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + int32_t kernel_cnt_x16 = (weight_shape->data[0] + 15) / 16; + status = vsi_nn_kernel_gpu_add_param( node, + "kernel_cnt_x16", &kernel_cnt_x16 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniU8SubZp_lo_2x8", &uniU8SubZp_lo_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniU8SubZp_hi_2x8", &uniU8SubZp_hi_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniU8Conv1d_part0_8x2", &uniU8Conv1d_part0_8x2 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniU8Conv1d_part1_8x2", &uniU8Conv1d_part1_8x2 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniU8Conv1d_part2_8x2", &uniU8Conv1d_part2_8x2 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniU8Conv1d_part3_8x2", &uniU8Conv1d_part3_8x2 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniSumOrderUchar_2x8", &uniSumOrderUchar_2x8 ); + if (input_width >= GPU_TENSOR_MAX_WIDTH) + { + status |= vsi_nn_kernel_gpu_add_param( node, "input_width", &input_width); + status |= vsi_nn_kernel_gpu_add_param( node, "output_width", &output_width); + } + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + status = vsi_nn_kernel_gpu_add_param( node, "weight_ZP", &weight_ZP); + status |= vsi_nn_kernel_gpu_add_param( node, "output_ZP", &output_ZP); + status |= vsi_nn_kernel_gpu_add_param( node, "scaleOut", &scaleOut); + status |= 
vsi_nn_kernel_gpu_add_param( node, "input_height", &input_height); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + gpu_param.dim = 2; + gpu_param.global_size[0] = ( + (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0]); + gpu_param.global_size[1] = ( + (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(input_attr); + + return status; +} /* _conv1d_ovxlib_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + _internal_kernel_e kernel_type + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e w_dtype; + vsi_nn_kernel_dtype_e b_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _conv1d_ovxlib_kernel_map; + size_t kernel_map_size = _cnt_of_array( _conv1d_ovxlib_kernel_map ); + vx_param_description_t * param_def = _conv1d_ovxlib_kernel_param_def; + size_t param_def_size = _cnt_of_array( _conv1d_ovxlib_kernel_param_def ); + vx_kernel_initialize_f initializer = _conv1d_ovxlib_initializer; + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + w_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + b_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = CONV1D_OVXLIB_HASH_KEY( in_dtype, w_dtype, b_dtype, out_dtype, kernel_type ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static vsi_nn_tensor_t* _create_new_bias_tensor + ( + vsi_nn_graph_t *graph, + vsi_nn_tensor_t *input, + vsi_nn_tensor_t *weight, + vsi_nn_tensor_t *bias + ) +{ + vsi_nn_tensor_t * new_bias = NULL; + vsi_nn_tensor_attr_t attr; + int32_t *new_bias_data_ptr = NULL; + uint8_t *weight_data = NULL; + int32_t *bias_data = NULL; + uint32_t i, j; + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + weight_data = vsi_nn_ConvertTensorToData(graph, weight); + + if (bias == NULL) + { + memcpy(&attr, &weight->attr, sizeof(vsi_nn_tensor_attr_t)); + attr.dim_num = 2; + attr.size[0] = weight->attr.size[2]; + attr.size[1] = 1; + if (weight->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) + { + attr.dtype.scale = input->attr.dtype.scale * weight->attr.dtype.scale; + attr.dtype.zero_point = 0; + attr.dtype.vx_type = VSI_NN_TYPE_INT32; + } + } + else + { + memcpy(&attr, &bias->attr, sizeof(vsi_nn_tensor_attr_t)); + if (attr.dim_num == 1) + { + attr.size[1] = 1; 
+ attr.dim_num = 2; + } + bias_data = (int32_t *)vsi_nn_ConvertTensorToData(graph, bias); + } + + new_bias_data_ptr = (int32_t *)malloc(attr.size[0] * sizeof(int32_t)); + memset((void *)new_bias_data_ptr, 0, sizeof(int32_t) * attr.size[0]); + + if (input->attr.dtype.zero_point != 0) + { + for (i = 0; i < weight->attr.size[2]; i++) + { + uint8_t *weight_ptr = weight_data + i * weight->attr.size[0] * weight->attr.size[1]; + for (j = 0; j < weight->attr.size[0] * weight->attr.size[1]; j++) + { + new_bias_data_ptr[i] += -((int32_t)weight_ptr[j] - weight->attr.dtype.zero_point) \ + * input->attr.dtype.zero_point; + } + } + } + + if (bias_data != NULL) + { + for (i = 0; i < attr.size[0]; i++) + { + new_bias_data_ptr[i] += bias_data[i]; + } + } + + new_bias = vsi_nn_CreateTensorFromData(graph, (uint8_t *)new_bias_data_ptr, &attr); + + vsi_nn_safe_free( new_bias_data_ptr ); + vsi_nn_safe_free( bias_data ); + vsi_nn_safe_free( weight_data ); + + return new_bias; +} + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_CONV1D_OVXLIB_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + int32_t j = 0; + _internal_kernel_e kernel_type = NORMAL; + + int32_t stride = vsi_nn_kernel_param_get_int32( params, "stride" ); + int32_t pad_front = vsi_nn_kernel_param_get_int32( params, "pad_front" ); + int32_t pad_end = vsi_nn_kernel_param_get_int32( params, "pad_end" ); + int32_t dilation = vsi_nn_kernel_param_get_int32( params, "dilation" ); + int32_t overflow_policy = vsi_nn_kernel_param_get_int32( params, "overflow_policy" ); + vsi_nn_tensor_t *in_tensors[3] = {NULL}; + vsi_nn_tensor_t *new_bias = NULL; + + if (VX_CONVERT_POLICY_SATURATE == overflow_policy) + { + overflow_policy = 1; + } + else + { + overflow_policy = 0; + } + + if ( 1 == stride ) + { + if ( 3 == inputs[1]->attr.size[0] ) + { + if (2 == dilation || 4 == dilation) + { + kernel_type = K3_S1_D2_D4; + } + else + { + kernel_type = K3_S1; + } + } + else if ( 1024 == inputs[1]->attr.size[0] ) + { + if (inputs[0]->attr.size[0] < 65535) + { + kernel_type = K1024_SMALL; + } + else if (0 == pad_front && 0 == pad_end) + { + kernel_type = K1024_LARGE; + } + else + { + return NULL; + } + } + else + { + return NULL; + } + } + + if (1024 == inputs[1]->attr.size[0]) + { + new_bias = _create_new_bias_tensor(graph, inputs[0], inputs[1], inputs[2]); + in_tensors[0] = inputs[0]; + in_tensors[1] = inputs[1]; + in_tensors[2] = new_bias; + } + else + { + in_tensors[0] = inputs[0]; + in_tensors[1] = inputs[1]; + in_tensors[2] = inputs[2]; + } + + status = _query_kernel( kernel, inputs, outputs, kernel_type ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + if( pad_front != 0 || pad_end != 0) + { + // Set default border mode. 
+ vx_border_t border; + border.mode = VX_BORDER_CONSTANT; + border.constant_value.U8 = (uint8_t)(inputs[0]->attr.dtype.zero_point); + status |= vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); + } + + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _CONV1D_OVXLIB_PARAM_NUM, + in_tensors, input_num, outputs, output_num ); + j = (int32_t)(input_num + output_num); + node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &stride ); + node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &pad_front ); + node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &pad_end ); + node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &dilation ); + node_params[j++] = vsi_nn_kernel_scalar_create(graph, I32, &overflow_policy ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _CONV1D_OVXLIB_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[--j] ); + vsi_nn_kernel_scalar_release( &node_params[--j] ); + vsi_nn_kernel_scalar_release( &node_params[--j] ); + vsi_nn_kernel_scalar_release( &node_params[--j] ); + vsi_nn_kernel_scalar_release( &node_params[--j] ); + } + } + + if (new_bias) + { + vsi_nn_ReleaseTensor(&new_bias); + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( conv1d_ovxlib, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/depth2space_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/depth2space_internal_evis.c index 9a57aee..7d3dc68 100644 --- a/src/tim/vx/internal/src/kernel/evis/depth2space_internal_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/depth2space_internal_evis.c @@ -42,28 +42,44 @@ __BEGIN_DECLS /* * Define kernel meta. */ -#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_U8TOU8 CVIVANTE_NAMESPACE("evis.depth2space_crd_U8toU8") -#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_I8TOI8 CVIVANTE_NAMESPACE("evis.depth2space_crd_I8toI8") -#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_I16TOI16 CVIVANTE_NAMESPACE("evis.depth2space_crd_I16toI16") -#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F16TOF16 CVIVANTE_NAMESPACE("evis.depth2space_crd_F16toF16") -#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_I8TOF16 CVIVANTE_NAMESPACE("evis.depth2space_crd_I8toF16") -#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_I16TOF16 CVIVANTE_NAMESPACE("evis.depth2space_crd_I16toF16") -#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F16TOI8 CVIVANTE_NAMESPACE("evis.depth2space_crd_F16toI8") -#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F16TOI16 CVIVANTE_NAMESPACE("evis.depth2space_crd_F16toI16") -#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_U8TOF16 CVIVANTE_NAMESPACE("evis.depth2space_crd_U8toF16") -#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F16TOU8 CVIVANTE_NAMESPACE("evis.depth2space_crd_F16toU8") +#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_U8TOU8 CVIVANTE_NAMESPACE("evis.depth2space_crd_U8toU8") +#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_I8TOI8 CVIVANTE_NAMESPACE("evis.depth2space_crd_I8toI8") +#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_I16TOI16 CVIVANTE_NAMESPACE("evis.depth2space_crd_I16toI16") +#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F16TOF16 CVIVANTE_NAMESPACE("evis.depth2space_crd_F16toF16") +#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_I8TOF16 CVIVANTE_NAMESPACE("evis.depth2space_crd_I8toF16") +#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_I16TOF16 CVIVANTE_NAMESPACE("evis.depth2space_crd_I16toF16") +#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F16TOI8 CVIVANTE_NAMESPACE("evis.depth2space_crd_F16toI8") +#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F16TOI16 CVIVANTE_NAMESPACE("evis.depth2space_crd_F16toI16") +#define 
VX_KERNEL_NAME_DEPTH2SPACE_CRD_U8TOF16 CVIVANTE_NAMESPACE("evis.depth2space_crd_U8toF16") +#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F16TOU8 CVIVANTE_NAMESPACE("evis.depth2space_crd_F16toU8") + +#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_U8TOU8_BLK2 CVIVANTE_NAMESPACE("evis.depth2space_crd_U8toU8_blk2") +#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_I8TOI8_BLK2 CVIVANTE_NAMESPACE("evis.depth2space_crd_I8toI8_blk2") +#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_I16TOI16_BLK2 CVIVANTE_NAMESPACE("evis.depth2space_crd_I16toI16_blk2") +#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F16TOF16_BLK2 CVIVANTE_NAMESPACE("evis.depth2space_crd_F16toF16_blk2") +#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_I8TOF16_BLK2 CVIVANTE_NAMESPACE("evis.depth2space_crd_I8toF16_blk2") +#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_I16TOF16_BLK2 CVIVANTE_NAMESPACE("evis.depth2space_crd_I16toF16_blk2") +#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F16TOI8_BLK2 CVIVANTE_NAMESPACE("evis.depth2space_crd_F16toI8_blk2") +#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F16TOI16_BLK2 CVIVANTE_NAMESPACE("evis.depth2space_crd_F16toI16_blk2") +#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_U8TOF16_BLK2 CVIVANTE_NAMESPACE("evis.depth2space_crd_U8toF16_blk2") +#define VX_KERNEL_NAME_DEPTH2SPACE_CRD_F16TOU8_BLK2 CVIVANTE_NAMESPACE("evis.depth2space_crd_F16toU8_blk2") #define KERNEL_SOURCE_1 "depth2space_crd" // Add kernel hashtable here -#define HASH_DEPTH2SPACE_CRD_KEY(_input0_type, _output_type, _quant_type) \ - ((_input0_type << 24) | (_output_type << 16) | (_quant_type << 8)) +#define HASH_DEPTH2SPACE_CRD_KEY(_input0_type, _output_type, _blk_size) \ + ((_input0_type << 24) | (_output_type << 16) | (_blk_size << 8)) #define TENSOR_DEPTH2SPACE_CRD_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ { HASH_DEPTH2SPACE_CRD_KEY(IN0_TYPE, OUT_TYPE, 0), \ VX_KERNEL_NAME_DEPTH2SPACE_CRD_##IN0_TYPE##TO##OUT_TYPE, \ SOURCE }, +#define TENSOR_DEPTH2SPACE_CRD_BLK2_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_DEPTH2SPACE_CRD_KEY(IN0_TYPE, OUT_TYPE, 1), \ + VX_KERNEL_NAME_DEPTH2SPACE_CRD_##IN0_TYPE##TO##OUT_TYPE##_BLK2, \ + SOURCE }, + static const struct { uint32_t key; char* function_name; @@ -80,6 +96,17 @@ static const struct { TENSOR_DEPTH2SPACE_CRD_KERNELS(F16, I16, KERNEL_SOURCE_1) TENSOR_DEPTH2SPACE_CRD_KERNELS(U8, F16, KERNEL_SOURCE_1) TENSOR_DEPTH2SPACE_CRD_KERNELS(F16, U8, KERNEL_SOURCE_1) + + TENSOR_DEPTH2SPACE_CRD_BLK2_KERNELS(U8, U8, KERNEL_SOURCE_1) + TENSOR_DEPTH2SPACE_CRD_BLK2_KERNELS(I8, I8, KERNEL_SOURCE_1) + TENSOR_DEPTH2SPACE_CRD_BLK2_KERNELS(I16, I16, KERNEL_SOURCE_1) + TENSOR_DEPTH2SPACE_CRD_BLK2_KERNELS(F16, F16, KERNEL_SOURCE_1) + TENSOR_DEPTH2SPACE_CRD_BLK2_KERNELS(I8, F16, KERNEL_SOURCE_1) + TENSOR_DEPTH2SPACE_CRD_BLK2_KERNELS(I16, F16, KERNEL_SOURCE_1) + TENSOR_DEPTH2SPACE_CRD_BLK2_KERNELS(F16, I8, KERNEL_SOURCE_1) + TENSOR_DEPTH2SPACE_CRD_BLK2_KERNELS(F16, I16, KERNEL_SOURCE_1) + TENSOR_DEPTH2SPACE_CRD_BLK2_KERNELS(U8, F16, KERNEL_SOURCE_1) + TENSOR_DEPTH2SPACE_CRD_BLK2_KERNELS(F16, U8, KERNEL_SOURCE_1) }; /* @@ -118,9 +145,10 @@ DEF_KERNEL_INITIALIZER(_depth2space_crd_initializer) int32_t output_height = 0; int32_t output_chn = 0; int32_t src0ZP = 0; - float src0Scale = 0; + float src0Scale = 1.0f; int32_t dstZP = 0; - float dstScale = 0; + float dstScale = 1.0f; + int32_t block_size = 0; uint32_t pack_key = 0; @@ -128,12 +156,15 @@ DEF_KERNEL_INITIALIZER(_depth2space_crd_initializer) CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor 
attr buffer fail.", OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &block_size); + CHECK_STATUS_FAIL_GOTO(status, OnError ); - src0ZP = attr[0]->asymm.zero_point; - src0Scale = attr[0]->asymm.scale; - dstZP = attr[1]->asymm.zero_point; - dstScale = attr[1]->asymm.scale; - if( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) + if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + { + src0ZP = attr[0]->asymm.zero_point; + src0Scale = attr[0]->asymm.scale; + } + else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) { if (attr[0]->dfp.fl > 0) { @@ -143,27 +174,35 @@ DEF_KERNEL_INITIALIZER(_depth2space_crd_initializer) { src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); } + src0ZP = 0; } - else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) + else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) { src0Scale = 1; + src0ZP = 0; } - if( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) + if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + { + dstZP = attr[1]->asymm.zero_point; + dstScale = attr[1]->asymm.scale; + } + else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) { if (attr[1]->dfp.fl > 0) { - dstScale = (float)((int64_t)1 << attr[1]->dfp.fl); + dstScale = (1.0f / (float)((int64_t)1 << attr[1]->dfp.fl)); } else { - dstScale = (1.0f / (float)((int64_t)1 << -attr[1]->dfp.fl)); + dstScale = (float)((int64_t)1 << -attr[1]->dfp.fl); } - dstScale = 1.0f/dstScale; + dstZP = 0; } - else if( attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE ) + else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE ) { dstScale = 1; + dstZP = 0; } output_dims = (uint32_t)attr[1]->shape->size; @@ -179,6 +218,17 @@ DEF_KERNEL_INITIALIZER(_depth2space_crd_initializer) shaderParam.global_size[1] = output_height; shaderParam.global_size[2] = output_chn; + if (block_size == 2) + { + shaderParam.global_scale[0] = 16; + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + shaderParam.global_size[0] = gpu_align_p2((output_width + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0], 4); + shaderParam.global_size[1] = output_height; + shaderParam.global_size[2] = output_chn; + } + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); CHECK_STATUS_FAIL_GOTO(status, OnError); @@ -202,6 +252,43 @@ DEF_KERNEL_INITIALIZER(_depth2space_crd_initializer) 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniU8MulAndPostShift_ExLo_2x8 = {{ + 0xdddddddd, // TCfg + 0x44444444, // ASelt + 0x19111810, 0x1b131a12, // ABin + 0x11111111, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00005600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniU8MulAndPostShift_ExHi_2x8 = {{ + 0xdddddddd, // TCfg + 0x44444444, // ASelt + 0x1d151c14, 0x1f171e16, // ABin + 0x11111111, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniDepth2SpaceF16Blk2_lo_2x8 = {{ + 0x11111111, // TCfg + 0x10101010, // ASelt + 0x01010000, 0x03030202, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t 
uniDepth2SpaceF16Blk2_hi_2x8 = {{ + 0x11111111, // TCfg + 0x10101010, // ASelt + 0x05050404, 0x07070606, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + switch( pack_key ) { case _PACK_SELECT_KEY( U8, F16): @@ -213,14 +300,25 @@ DEF_KERNEL_INITIALIZER(_depth2space_crd_initializer) case _PACK_SELECT_KEY( U8, U8): case _PACK_SELECT_KEY( I8, I8): case _PACK_SELECT_KEY( I16, I16): + case _PACK_SELECT_KEY( F16, F16): { gpu_quantize_multiplier_16bit( (double)src0Scale / dstScale, &M0, &postShift); multAndoutZP0[0] = (uint32_t)(M0); multAndoutZP0[1] = (uint32_t)((dstZP << postShift) - src0ZP * M0); gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_0_Lo_2x8, postShift ); + gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_ExLo_2x8, postShift ); + gpu_dp_inst_update_postshfit( &uniU8MulAndPostShift_ExHi_2x8, postShift ); status = vsi_nn_kernel_gpu_add_param( node, "uniU8MulAndPostShift_0_Lo_2x8", &uniU8MulAndPostShift_0_Lo_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniU8MulAndPostShift_ExLo_2x8", &uniU8MulAndPostShift_ExLo_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniU8MulAndPostShift_ExHi_2x8", &uniU8MulAndPostShift_ExHi_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniDepth2SpaceF16Blk2_lo_2x8", &uniDepth2SpaceF16Blk2_lo_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniDepth2SpaceF16Blk2_hi_2x8", &uniDepth2SpaceF16Blk2_hi_2x8 ); status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP0", &multAndoutZP0 ); CHECK_STATUS_FAIL_GOTO(status, OnError ); } @@ -256,7 +354,8 @@ static vsi_status _query_kernel vsi_nn_tensor_t* const* const inputs, vsi_nn_tensor_t* const* const outputs, vsi_nn_kernel_t* kernel, - const vsi_nn_kernel_param_t * params + const vsi_nn_kernel_param_t * params, + int32_t blk_flg ) { vsi_status status = VSI_FAILURE; @@ -268,16 +367,16 @@ static vsi_status _query_kernel input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - key = HASH_DEPTH2SPACE_CRD_KEY( input0_dtype, output_dtype, 0 ); + key = HASH_DEPTH2SPACE_CRD_KEY( input0_dtype, output_dtype, blk_flg ); for( i = 0; i < _cnt_of_array(depth2space_crd_map); i ++ ) { - if( depth2space_crd_map[i].key == key ) + if ( depth2space_crd_map[i].key == key ) { break; } } - if( i < _cnt_of_array(depth2space_crd_map) ) + if ( i < _cnt_of_array(depth2space_crd_map) ) { snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", depth2space_crd_map[i].function_name ); kernel->info.parameters = _depth2space_crd_kernel_param_def; @@ -310,18 +409,19 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t tmp_params[_DEPTH2SPACE_CRD_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_t node = NULL; int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); + int32_t blk_flg = block_size == 2 ? 
1 : 0; - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; } - status = _query_kernel( inputs, outputs, kernel, params ); - if( VSI_SUCCESS == status) + status = _query_kernel( inputs, outputs, kernel, params, blk_flg); + if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) + if ( node ) { vsi_nn_kernel_node_pack_io( tmp_params, _DEPTH2SPACE_CRD_PARAM_NUM, inputs, 1, outputs, 1 ); tmp_params[2] = vsi_nn_kernel_scalar_create( graph, I32, &block_size ); diff --git a/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c b/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c index 2c7c4f6..32b57b4 100644 --- a/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c @@ -717,12 +717,13 @@ static vsi_nn_kernel_node_t _setup int32_t pad_front = vsi_nn_kernel_param_get_int32( params, "pad_front" ); int32_t pad_end = vsi_nn_kernel_param_get_int32( params, "pad_end" ); int32_t dilation = vsi_nn_kernel_param_get_int32( params, "dilation" ); + int32_t batch = inputs[0]->attr.size[2]; _internal_kernel_size_e ks = KN; - if (!((VSI_NN_TYPE_UINT8 == inputs[0]->attr.dtype.vx_type) + if ( (!((VSI_NN_TYPE_UINT8 == inputs[0]->attr.dtype.vx_type) && (VSI_NN_TYPE_UINT8 == inputs[1]->attr.dtype.vx_type) && (NULL == inputs[2] || VSI_NN_TYPE_INT32 == inputs[2]->attr.dtype.vx_type) - && (VSI_NN_TYPE_UINT8 == outputs[0]->attr.dtype.vx_type))) + && (VSI_NN_TYPE_UINT8 == outputs[0]->attr.dtype.vx_type))) || batch > 1) { return NULL; } @@ -769,18 +770,27 @@ static vsi_nn_kernel_node_t _setup status = _query_kernel( kernel, temp_tensor, outputs, dilation, ks); - if( VSI_SUCCESS == status) + if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) + if ( node ) { - if( pad_front != 0 && pad_end != 0) + if ( pad_front != 0 && pad_end != 0) { // Set default border mode. 
vx_border_t border; border.mode = VX_BORDER_CONSTANT; - border.constant_value.U8 = 0; - border.constant_value.U16 = 0; + if (VSI_NN_TYPE_UINT8 == inputs[0]->attr.dtype.vx_type && + VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC == inputs[0]->attr.dtype.qnt_type) + { + border.constant_value.U8 = (uint8_t)inputs[0]->attr.dtype.zero_point; + } + else + { + border.constant_value.U8 = 0; + border.constant_value.U16 = 0; + } + status |= vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); } diff --git a/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c b/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c index e6831f7..fd07a58 100644 --- a/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c @@ -48,6 +48,7 @@ typedef enum UNARY_NEG, UNARY_HSIGMOID, UNARY_MISH, + UNARY_ROUND, } unary_type_e; /* @@ -82,6 +83,7 @@ typedef enum #define NEG_OPERATION neg #define HSIGMOID_OPERATION hard_sigmoid #define MISH_OPERATION mish +#define ROUND_OPERATION round static const struct { uint32_t key; @@ -248,6 +250,30 @@ static const struct { TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, I8, I8 , KERNEL_SOURCE_2D) TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, I8, F16 , KERNEL_SOURCE_2D) TENSOR_UNARY_KERNELS_2D(MISH_OPERATION, UNARY_MISH, BF16, BF16 , KERNEL_SOURCE_2D) + + TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, F16, F16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, F16, I16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, F16, U8 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, F16, I8 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, I16, I16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, I16, F16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, U8, U8 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, U8, F16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, I8, I8 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, I8, F16 , KERNEL_SOURCE_3D) + TENSOR_UNARY_KERNELS(ROUND_OPERATION, UNARY_ROUND, BF16, BF16 , KERNEL_SOURCE_3D) + + TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, F16, F16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, F16, I16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, F16, U8 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, F16, I8 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, I16, I16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, I16, F16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, U8, U8 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, U8, F16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, I8, I8 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, I8, F16 , KERNEL_SOURCE_2D) + TENSOR_UNARY_KERNELS_2D(ROUND_OPERATION, UNARY_ROUND, BF16, BF16 , KERNEL_SOURCE_2D) }; #undef SIN_OPERATION @@ -257,6 +283,7 @@ static const struct { #undef NEG_OPERATION #undef HSIGMOID_OPERATION #undef MISH_OPERATION +#undef ROUND_OPERATION /* * Kernel params @@ -375,6 +402,7 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer) case _PACK_SELECT_KEY( UNARY_NEG, BF16, BF16 ): case _PACK_SELECT_KEY( UNARY_HSIGMOID, BF16, BF16 ): case 
_PACK_SELECT_KEY( UNARY_MISH, BF16, BF16 ): + case _PACK_SELECT_KEY( UNARY_ROUND, BF16, BF16 ): { gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ 0x11111111, // TCfg @@ -653,6 +681,7 @@ REGISTER_ELTWISE_UNARY_BACKEND_EVIS( elu, UNARY_ELU ) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( neg, UNARY_NEG ) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( hard_sigmoid, UNARY_HSIGMOID ) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( mish, UNARY_MISH ) +REGISTER_ELTWISE_UNARY_BACKEND_EVIS( round, UNARY_ROUND ) __END_DECLS
diff --git a/src/tim/vx/internal/src/kernel/evis/erf_evis.c b/src/tim/vx/internal/src/kernel/evis/erf_evis.c
new file mode 100644
index 0000000..7753349
--- /dev/null
+++ b/src/tim/vx/internal/src/kernel/evis/erf_evis.c
@@ -0,0 +1,428 @@
+/****************************************************************************
+*
+* Copyright (c) 2020 Vivante Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include "vsi_nn_types.h"
+#include "vsi_nn_tensor.h"
+#include "vsi_nn_graph.h"
+#include "vsi_nn_log.h"
+#include "vsi_nn_error.h"
+#include "vsi_nn_prv.h"
+#include "vsi_nn_tensor_util.h"
+#include "utils/vsi_nn_util.h"
+#include "kernel/vsi_nn_kernel.h"
+#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
+
+__BEGIN_DECLS
+
+/*
+ * Define kernel meta.
+ */ +#define HASH_UNARY_KEY(_input_type, _output_type, _image_2d) \ + ( (_input_type << 12) | (_output_type << 4) | (_image_2d)) + +#define KERNEL_SOURCE "erf", + +#define HASH_UNARY_SH_KERNEL_NAME( SRC_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.erf_"#SRC_TYPE"to"#DST_TYPE) + +#define TENSOR_UNARY_KERNELS(SRC_TYPE, OUT_TYPE) \ + { HASH_UNARY_KEY(SRC_TYPE, OUT_TYPE, 0), \ + HASH_UNARY_SH_KERNEL_NAME(SRC_TYPE, OUT_TYPE), \ + KERNEL_SOURCE }, + +#define HASH_UNARY_SH_KERNEL_2D_NAME(SRC_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.erf_"#SRC_TYPE"to"#DST_TYPE"_2D") + +#define TENSOR_UNARY_KERNELS_2D(SRC_TYPE, OUT_TYPE) \ + { HASH_UNARY_KEY(SRC_TYPE, OUT_TYPE, 1), \ + HASH_UNARY_SH_KERNEL_2D_NAME(SRC_TYPE, OUT_TYPE), \ + KERNEL_SOURCE }, + + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _erf_kernel_map[] = +{ + // Register kernel here + TENSOR_UNARY_KERNELS(F16, F16 ) + TENSOR_UNARY_KERNELS(F16, I16 ) + TENSOR_UNARY_KERNELS(F16, U8 ) + TENSOR_UNARY_KERNELS(F16, I8 ) + TENSOR_UNARY_KERNELS(I16, I16 ) + TENSOR_UNARY_KERNELS(I16, F16 ) + TENSOR_UNARY_KERNELS(U8, U8 ) + TENSOR_UNARY_KERNELS(U8, F16 ) + TENSOR_UNARY_KERNELS(I8, I8 ) + TENSOR_UNARY_KERNELS(I8, F16 ) + TENSOR_UNARY_KERNELS(BF16, BF16) + + TENSOR_UNARY_KERNELS_2D(F16, F16 ) + TENSOR_UNARY_KERNELS_2D(F16, I16 ) + TENSOR_UNARY_KERNELS_2D(F16, U8 ) + TENSOR_UNARY_KERNELS_2D(F16, I8 ) + TENSOR_UNARY_KERNELS_2D(I16, I16 ) + TENSOR_UNARY_KERNELS_2D(I16, F16 ) + TENSOR_UNARY_KERNELS_2D(U8, U8 ) + TENSOR_UNARY_KERNELS_2D(U8, F16 ) + TENSOR_UNARY_KERNELS_2D(I8, I8 ) + TENSOR_UNARY_KERNELS_2D(I8, F16 ) + TENSOR_UNARY_KERNELS_2D(BF16, BF16) +}; + + +/* + * Kernel params + */ +static vx_param_description_t _erf_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _ERF_PARAM_NUM _cnt_of_array( _erf_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_erf_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL, NULL }; + vsi_int_array_t * out_shape = NULL; + float inputScale = 1.0f; + float inputTail = 0; + float outputScale = 1.0f; + float outputZP = 0; + uint32_t pack_key; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + out_shape = attr[1]->shape; + + if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + int32_t fl = attr[0]->dfp.fl; + if (fl > 0) + { + inputScale = 1.0f / (float) ((int64_t)1 << fl); + } + else + { + inputScale = (float)((int64_t)1 << -fl); + } + } + else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + { + inputScale = attr[0]->asymm.scale; + inputTail = 0 - attr[0]->asymm.zero_point * inputScale; + } + + if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + int32_t fl = attr[1]->dfp.fl; + if (fl > 0) + { + outputScale = (float)((int64_t)1 << fl); + } + else + { + outputScale = (float)1.0f / (float) ((int64_t)1 << -fl); + } + } + else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + { + outputScale = 
(float)1.0f / attr[1]->asymm.scale; + outputZP = (float)attr[1]->asymm.zero_point; + } + +#define _PACK_SELECT_KEY( IN_TYPE, OUT_TYPE ) \ + ( ( IN_TYPE << 16) | ( OUT_TYPE << 8)) + + pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[1]->dtype ); + + gpu_param.global_scale[0] = 4; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = gpu_align_p2( + (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1; + + switch ( pack_key ) + { + case _PACK_SELECT_KEY( BF16, BF16 ): + { + gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractOddData_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x07050301, 0x07050301, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtractOddData_2x8", &uniExtractOddData_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + default: + { + gpu_dp_inst_t uniExtractHalf8_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractInteger_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniDatatoFp32Part0_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param( node, + "uniDatatoFp32Part0_4x4", &uniDatatoFp32Part0_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "inputScale", &inputScale ); + status |= vsi_nn_kernel_gpu_add_param( node, + "inputTail", &inputTail ); + status |= vsi_nn_kernel_gpu_add_param( node, + "outputScale", &outputScale ); + status |= vsi_nn_kernel_gpu_add_param( node, + "outputZP", &outputZP ); + + if (attr[1]->dtype == F16) + { + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtract8Data_2x8", &uniExtractHalf8_2x8 ); + } + else + { + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtract8Data_2x8", &uniExtractInteger_2x8 ); + } + 
CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + } + +#undef _PACK_SELECT_KEY + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(attr[0]); + SAFE_FREE_TENSOR_ATTR(attr[1]); +#undef SAFE_FREE_TENSOR_ATTR + + return status; +} /* _erf_initializer() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool image_2d + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _erf_kernel_map; + size_t kernel_map_size = _cnt_of_array( _erf_kernel_map ); + vx_param_description_t * param_def = _erf_kernel_param_def; + vx_kernel_initialize_f initializer = _erf_initializer; + + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = HASH_UNARY_KEY( in_dtype, out_dtype, image_2d ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _erf_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_ERF_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_tensor_t* rs_tensors[2] = { NULL }; + int32_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + int32_t new_rank = 0; + vsi_bool image_2d = FALSE; + vsi_bool ret = FALSE; + + ret = vsi_nn_kernel_optimize_element_shape( + (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, + shape, &new_rank ); + if ( ret ) + { + rs_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], (uint32_t*)shape, new_rank ); + rs_tensors[1] = vsi_nn_reshape_tensor( graph, + outputs[0], (uint32_t*)shape, new_rank ); + } + + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)rs_tensors[0]->attr.size, + rs_tensors[0]->attr.dim_num ) ) + { + goto OnError; + } + + image_2d = (rs_tensors[0]->attr.dim_num == 2 || rs_tensors[0]->attr.size[2] == 1); + + status = _query_kernel( kernel, inputs, outputs, image_2d ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _ERF_PARAM_NUM, + rs_tensors, 1, &rs_tensors[1], 1 ); + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _ERF_PARAM_NUM ); + } + } + +OnError: + if (rs_tensors[0]) + { + vsi_nn_ReleaseTensor( &rs_tensors[0] ); + } + + if (rs_tensors[1]) + { + vsi_nn_ReleaseTensor( &rs_tensors[1] ); + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( erf, _setup ) diff --git a/src/tim/vx/internal/src/kernel/evis/gather_evis.c b/src/tim/vx/internal/src/kernel/evis/gather_evis.c index 2ef5977..ae35694 100644 --- a/src/tim/vx/internal/src/kernel/evis/gather_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/gather_evis.c @@ -64,39 +64,60 @@ __BEGIN_DECLS #define VX_KERNEL_NAME_GATHER_AXIS0_U8TOF16 CVIVANTE_NAMESPACE("evis.gather_U8toF16_axis0") #define VX_KERNEL_NAME_GATHER_AXIS0_F16TOU8 CVIVANTE_NAMESPACE("evis.gather_F16toU8_axis0") +#define VX_KERNEL_NAME_GATHER_ARRAY_U8TOU8 CVIVANTE_NAMESPACE("evis.gather_U8toU8_array") +#define VX_KERNEL_NAME_GATHER_ARRAY_I8TOI8 CVIVANTE_NAMESPACE("evis.gather_I8toI8_array") +#define VX_KERNEL_NAME_GATHER_ARRAY_I16TOI16 CVIVANTE_NAMESPACE("evis.gather_I16toI16_array") +#define VX_KERNEL_NAME_GATHER_ARRAY_F16TOF16 CVIVANTE_NAMESPACE("evis.gather_F16toF16_array") + +#define VX_KERNEL_NAME_GATHER_AXIS0_ARRAY_U8TOU8 CVIVANTE_NAMESPACE("evis.gather_U8toU8_axis0_array") +#define VX_KERNEL_NAME_GATHER_AXIS0_ARRAY_I8TOI8 CVIVANTE_NAMESPACE("evis.gather_I8toI8_axis0_array") +#define VX_KERNEL_NAME_GATHER_AXIS0_ARRAY_I16TOI16 CVIVANTE_NAMESPACE("evis.gather_I16toI16_axis0_array") +#define VX_KERNEL_NAME_GATHER_AXIS0_ARRAY_F16TOF16 CVIVANTE_NAMESPACE("evis.gather_F16toF16_axis0_array") + #define KERNEL_SOURCE_1 "gather" #define KERNEL_SOURCE_2 "gather_mix" +#define KERNEL_SOURCE_3 "gather_array" // Add kernel hashtable here -#define HASH_GATHER_KEY(_input0_type, _input1_type, _output_type, _is_axis0) \ - ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_is_axis0)) +#define HASH_GATHER_KEY(_input0_type, _input1_type, _output_type, _is_axis0, _is_max) \ + ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_is_axis0 << 4) | (_is_max)) #define TENSOR_GATHER_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \ - { HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 0), \ + { HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 0, 0), \ VX_KERNEL_NAME_GATHER_##IN0_TYPE##TO##OUT_TYPE, \ SOURCE }, #define TENSOR_GATHER_AXIS0_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \ - { HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 1), \ + { HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 1, 0), \ VX_KERNEL_NAME_GATHER_AXIS0_##IN0_TYPE##TO##OUT_TYPE, \ SOURCE }, +#define TENSOR_GATHER_ARRAY_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \ + { HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 0, 1), \ + VX_KERNEL_NAME_GATHER_ARRAY_##IN0_TYPE##TO##OUT_TYPE, \ + SOURCE }, + +#define TENSOR_GATHER_AXIS0_ARRAY_KERNELS(IN0_TYPE, IN1TYPE, OUT_TYPE, SOURCE) \ + { HASH_GATHER_KEY(IN0_TYPE, IN1TYPE, OUT_TYPE, 1, 1), \ + VX_KERNEL_NAME_GATHER_AXIS0_ARRAY_##IN0_TYPE##TO##OUT_TYPE, \ + SOURCE }, + static const struct { uint32_t key; char* function_name; const char* source_name; } gather_map[] = { - TENSOR_GATHER_KERNELS(U8, I32, U8, KERNEL_SOURCE_1) - TENSOR_GATHER_KERNELS(I8, I32, I8, KERNEL_SOURCE_1) - TENSOR_GATHER_KERNELS(I16, I32, I16, KERNEL_SOURCE_1) - TENSOR_GATHER_KERNELS(F16, I32, F16, KERNEL_SOURCE_1) - TENSOR_GATHER_KERNELS(I8, I32, F16, KERNEL_SOURCE_2) - TENSOR_GATHER_KERNELS(I16, I32, F16, KERNEL_SOURCE_2) - TENSOR_GATHER_KERNELS(F16, I32, I8, KERNEL_SOURCE_2) - TENSOR_GATHER_KERNELS(F16, I32, I16, 
KERNEL_SOURCE_2) - TENSOR_GATHER_KERNELS(U8, I32, F16, KERNEL_SOURCE_2) - TENSOR_GATHER_KERNELS(F16, I32, U8, KERNEL_SOURCE_2) + TENSOR_GATHER_KERNELS(U8, I32, U8, KERNEL_SOURCE_1) + TENSOR_GATHER_KERNELS(I8, I32, I8, KERNEL_SOURCE_1) + TENSOR_GATHER_KERNELS(I16, I32, I16, KERNEL_SOURCE_1) + TENSOR_GATHER_KERNELS(F16, I32, F16, KERNEL_SOURCE_1) + TENSOR_GATHER_KERNELS(I8, I32, F16, KERNEL_SOURCE_2) + TENSOR_GATHER_KERNELS(I16, I32, F16, KERNEL_SOURCE_2) + TENSOR_GATHER_KERNELS(F16, I32, I8, KERNEL_SOURCE_2) + TENSOR_GATHER_KERNELS(F16, I32, I16, KERNEL_SOURCE_2) + TENSOR_GATHER_KERNELS(U8, I32, F16, KERNEL_SOURCE_2) + TENSOR_GATHER_KERNELS(F16, I32, U8, KERNEL_SOURCE_2) TENSOR_GATHER_AXIS0_KERNELS(U8, I32, U8, KERNEL_SOURCE_1) TENSOR_GATHER_AXIS0_KERNELS(I8, I32, I8, KERNEL_SOURCE_1) TENSOR_GATHER_AXIS0_KERNELS(I16, I32, I16, KERNEL_SOURCE_1) @@ -107,6 +128,14 @@ static const struct { TENSOR_GATHER_AXIS0_KERNELS(F16, I32, I16, KERNEL_SOURCE_2) TENSOR_GATHER_AXIS0_KERNELS(U8, I32, F16, KERNEL_SOURCE_2) TENSOR_GATHER_AXIS0_KERNELS(F16, I32, U8, KERNEL_SOURCE_2) + TENSOR_GATHER_ARRAY_KERNELS(U8, I32, U8, KERNEL_SOURCE_3) + TENSOR_GATHER_ARRAY_KERNELS(I8, I32, I8, KERNEL_SOURCE_3) + TENSOR_GATHER_ARRAY_KERNELS(I16, I32, I16, KERNEL_SOURCE_3) + TENSOR_GATHER_ARRAY_KERNELS(F16, I32, F16, KERNEL_SOURCE_3) + TENSOR_GATHER_AXIS0_ARRAY_KERNELS(U8, I32, U8, KERNEL_SOURCE_3) + TENSOR_GATHER_AXIS0_ARRAY_KERNELS(I8, I32, I8, KERNEL_SOURCE_3) + TENSOR_GATHER_AXIS0_ARRAY_KERNELS(I16, I32, I16, KERNEL_SOURCE_3) + TENSOR_GATHER_AXIS0_ARRAY_KERNELS(F16, I32, F16, KERNEL_SOURCE_3) }; /* @@ -129,7 +158,8 @@ static vsi_status get_gather_tensor_reshape_size vsi_nn_tensor_t ** inputs, int32_t sizes[VSI_NN_MAX_DIM_NUM], uint32_t block_size, - uint32_t idxFlg + uint32_t idxFlg, + int32_t* arrayFlg ) { vsi_status status = VSI_FAILURE; @@ -157,12 +187,13 @@ static vsi_status get_gather_tensor_reshape_size } else { - if ((elementCnt / block_size) < VSI_NN_MAX_IMAGE_WIDTH) + sizes[0] = block_size; + sizes[1] = elementCnt / block_size; + if ((elementCnt / block_size) > VSI_NN_MAX_IMAGE_WIDTH) { - sizes[0] = block_size; - sizes[1] = elementCnt / block_size; - status = VSI_SUCCESS; + arrayFlg[0] = 1; } + status = VSI_SUCCESS; } #undef VSI_NN_MAX_IMAGE_WIDTH @@ -535,10 +566,11 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; - case _PACK_SELECT_KEY( I16, I16): - case _PACK_SELECT_KEY( I8, I8): - case _PACK_SELECT_KEY( U8, U8): - case _PACK_SELECT_KEY( F16, F16): + case _PACK_SELECT_KEY( I16, I16): + case _PACK_SELECT_KEY( I8, I8): + case _PACK_SELECT_KEY( U8, U8): + case _PACK_SELECT_KEY( F16, F16): + case _PACK_SELECT_KEY( BF16, BF16): { status = vsi_nn_kernel_gpu_add_param( node, "uniExtraCopyDpKeepinEvis_2x8", &uniExtraCopyDpKeepinEvis_2x8 ); @@ -583,7 +615,8 @@ static vsi_status _query_kernel vsi_nn_tensor_t* const* const outputs, vsi_nn_kernel_t* kernel, const vsi_nn_kernel_param_t * params, - int32_t axis + int32_t axis, + int32_t is_array ) { vsi_status status = VSI_FAILURE; @@ -595,7 +628,16 @@ static vsi_status _query_kernel input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - key = HASH_GATHER_KEY( input0_dtype, I32, output_dtype, axis ); + if (input0_dtype == BF16) + { + input0_dtype = F16; + } + if (output_dtype == BF16) + { + output_dtype = F16; + } + + key = HASH_GATHER_KEY( input0_dtype, I32, output_dtype, axis, is_array); for( i = 0; i < 
_cnt_of_array(gather_map); i ++ ) { @@ -640,6 +682,7 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_t * kernel ) { +#define VSI_NN_MAX_BLOCK_SIZE (65536) vsi_status status = VSI_FAILURE; vsi_nn_kernel_node_param_t tmp_params[_GATHER_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_t node = NULL; @@ -649,21 +692,23 @@ static vsi_nn_kernel_node_t _setup int32_t axis_num = vsi_nn_kernel_param_get_int32( params, "axis_num" ); int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); int32_t axis0_flg = 0; + int32_t is_array = block_size > VSI_NN_MAX_BLOCK_SIZE ? 1 : 0; if (axis == 0) { - status = get_gather_tensor_reshape_size(&inputs[0], shapes[0], axis_num, 0); - status |= get_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, 1); - status |= get_gather_tensor_reshape_size(&outputs[0], shapes[2], shapes[1][0], 0); + status = get_gather_tensor_reshape_size(&inputs[0], shapes[0], axis_num, 0, &is_array); + status |= get_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, 1, &is_array); + status |= get_gather_tensor_reshape_size(&outputs[0], shapes[2], shapes[1][0], 0, &is_array); axis0_flg = 1; } else { - status = get_gather_tensor_reshape_size(&inputs[0], shapes[0], block_size, 0); - status |= get_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, 1); - status |= get_gather_tensor_reshape_size(&outputs[0], shapes[2], block_size, 0); + status = get_gather_tensor_reshape_size(&inputs[0], shapes[0], block_size, 0, &is_array); + status |= get_gather_tensor_reshape_size(&inputs[1], shapes[1], 1, 1, &is_array); + status |= get_gather_tensor_reshape_size(&outputs[0], shapes[2], block_size, 0, &is_array); axis0_flg = 0; } +#undef VSI_NN_MAX_BLOCK_SIZE if (status != VSI_SUCCESS) { return NULL; @@ -675,7 +720,7 @@ static vsi_nn_kernel_node_t _setup return NULL; } - status = _query_kernel( inputs, outputs, kernel, params, axis0_flg); + status = _query_kernel( inputs, outputs, kernel, params, axis0_flg, is_array); if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); diff --git a/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c b/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c index c38b90e..8595e5a 100644 --- a/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c @@ -387,6 +387,15 @@ static vsi_status _query_kernel input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + if (input0_dtype == BF16) + { + input0_dtype = F16; + } + if (output_dtype == BF16) + { + output_dtype = F16; + } + if(coord_dim == 1) { coord_type = _1D; diff --git a/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c b/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c new file mode 100644 index 0000000..89f0c4c --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c @@ -0,0 +1,1219 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and 
this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ + +typedef enum +{ + INTERNAL_KERNEL_SUM_SQR, + INTERNAL_KERNEL_MEAN_VARI, + INTERNAL_KERNEL_NORM, +} _internal_kernel_e; + +#define KERNEL_SOURCE_1 "group_normalization_i8" +#define KERNEL_SOURCE_2 "group_normalization_u8" +#define KERNEL_SOURCE_3 "group_normalization_i16" +#define KERNEL_SOURCE_4 "group_normalization_f16" +#define KERNEL_SOURCE_5 "group_normalization_u8_f16" + +#define HASH_GROUPNORM_SUM_SQR_SH_KERNEL_NAME(SRC0_TYPE) \ + CVIVANTE_NAMESPACE("evis.group_norm_sumsqr_"#SRC0_TYPE) + +#define HASH_GROUPNORM_SUM_SQR_SH_KERNEL_2D_NAME(SRC0_TYPE) \ + CVIVANTE_NAMESPACE("evis.group_norm_sumsqr_"#SRC0_TYPE"_2D") + +#define HASH_GROUPNORM_MEAN_VARI_SH_KERNEL_NAME \ + CVIVANTE_NAMESPACE("evis.group_norm_meanvari") + +#define HASH_GROUPNORM_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.group_norm_"#SRC0_TYPE"to"#DST_TYPE) + +#define HASH_GROUPNORM_SH_KERNEL_2D_NAME(SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.group_norm_"#SRC0_TYPE"to"#DST_TYPE"_2D") + +// Add kernel hashtable here +// Sum Sqr +#define HASH_GROUPNORM_SUM_SQR_KEY(_input0_type, _output_type, _reshape_flag) \ + ((_input0_type << 24) | (_output_type << 16) | (_reshape_flag << 8)) + +#define TENSOR_GROUPNORM_SUM_SQR_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_GROUPNORM_SUM_SQR_KEY(IN0_TYPE, OUT_TYPE, 0), \ + HASH_GROUPNORM_SUM_SQR_SH_KERNEL_NAME(IN0_TYPE), \ + SOURCE }, + +#define TENSOR_GROUPNORM_SUM_SQR_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_GROUPNORM_SUM_SQR_KEY(IN0_TYPE, OUT_TYPE, 1), \ + HASH_GROUPNORM_SUM_SQR_SH_KERNEL_2D_NAME(IN0_TYPE), \ + SOURCE }, + +#define HASH_GROUPNORM_MEAN_VARI_KEY(_input0_type, _output_type) \ + ((_input0_type << 24) | (_output_type << 16)) + +#define TENSOR_GROUPNORM_MEAN_VARI_KERNELS(SOURCE) \ + { HASH_GROUPNORM_MEAN_VARI_KEY(F32, F32), \ + HASH_GROUPNORM_MEAN_VARI_SH_KERNEL_NAME, \ + SOURCE }, + +// normalization +#define HASH_GROUPNORM_KEY(_input0_type, _output_type, _reshape_flag) \ + ((_input0_type << 24) | (_output_type << 16) | (_reshape_flag << 8)) + +#define TENSOR_GROUPNORM_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_GROUPNORM_KEY(IN0_TYPE, OUT_TYPE, 0), \ + HASH_GROUPNORM_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \ + SOURCE }, + +#define TENSOR_GROUPNORM_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_GROUPNORM_KEY(IN0_TYPE, OUT_TYPE, 1), \ + HASH_GROUPNORM_SH_KERNEL_2D_NAME(IN0_TYPE, OUT_TYPE), \ + SOURCE }, + +typedef struct +{ + uint32_t key; + char * 
function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _groupnorm_sum_sqr_kernel_map[] = +{ + // Register kernel here + TENSOR_GROUPNORM_SUM_SQR_KERNELS( I8, F32, KERNEL_SOURCE_1 ) + TENSOR_GROUPNORM_SUM_SQR_KERNELS_2D( I8, F32, KERNEL_SOURCE_1 ) + TENSOR_GROUPNORM_SUM_SQR_KERNELS( U8, F32, KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_SUM_SQR_KERNELS_2D( U8, F32, KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_SUM_SQR_KERNELS( I16, F32, KERNEL_SOURCE_3 ) + TENSOR_GROUPNORM_SUM_SQR_KERNELS_2D( I16, F32, KERNEL_SOURCE_3 ) + TENSOR_GROUPNORM_SUM_SQR_KERNELS( F16, F32, KERNEL_SOURCE_4 ) + TENSOR_GROUPNORM_SUM_SQR_KERNELS_2D( F16, F32, KERNEL_SOURCE_4 ) +}; + +static const _kernel_map_type _groupnorm_mean_vari_kernel_map[] = +{ + // Register kernel here + TENSOR_GROUPNORM_MEAN_VARI_KERNELS( KERNEL_SOURCE_2 ) +}; + +static const _kernel_map_type _groupnorm_kernel_map[] = +{ + // Register kernel here + TENSOR_GROUPNORM_KERNELS( I8, I8, KERNEL_SOURCE_1 ) + TENSOR_GROUPNORM_KERNELS_2D( I8, I8, KERNEL_SOURCE_1 ) + TENSOR_GROUPNORM_KERNELS( I8, F16, KERNEL_SOURCE_1 ) + TENSOR_GROUPNORM_KERNELS_2D( I8, F16, KERNEL_SOURCE_1 ) + + TENSOR_GROUPNORM_KERNELS( U8, U8, KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_KERNELS_2D( U8, U8, KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_KERNELS( U8, F16, KERNEL_SOURCE_5 ) + TENSOR_GROUPNORM_KERNELS_2D( U8, F16, KERNEL_SOURCE_5 ) + + TENSOR_GROUPNORM_KERNELS( I16, I16, KERNEL_SOURCE_3 ) + TENSOR_GROUPNORM_KERNELS_2D( I16, I16, KERNEL_SOURCE_3 ) + TENSOR_GROUPNORM_KERNELS( I16, F16, KERNEL_SOURCE_3 ) + TENSOR_GROUPNORM_KERNELS_2D( I16, F16, KERNEL_SOURCE_3 ) + + TENSOR_GROUPNORM_KERNELS( F16, F16, KERNEL_SOURCE_4 ) + TENSOR_GROUPNORM_KERNELS_2D( F16, F16, KERNEL_SOURCE_4 ) + TENSOR_GROUPNORM_KERNELS( F16, U8, KERNEL_SOURCE_4 ) + TENSOR_GROUPNORM_KERNELS_2D( F16, U8, KERNEL_SOURCE_4 ) +}; + +/* + * Kernel params + */ +static vx_param_description_t _groupnorm_sum_sqr_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _GROUPNORM_SUM_SQR_PARAM_NUM _cnt_of_array( _groupnorm_sum_sqr_kernel_param_def ) + +static vx_param_description_t _groupnorm_mean_vari_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _GROUPNORM_MEAN_VARI_PARAM_NUM _cnt_of_array( _groupnorm_mean_vari_kernel_param_def ) + +static vx_param_description_t _groupnorm_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _GROUPNORM_PARAM_NUM _cnt_of_array( _groupnorm_kernel_param_def ) + +/* + * Kernel initializer + */ 
+DEF_KERNEL_INITIALIZER(_groupnorm_sum_sqr_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL}; + vsi_int_array_t * input_shape = NULL; + float scaleIn = 1; + int32_t input_zp = 0; + vx_uint32 iter = 0; + int32_t sumInZp = 0; + int32_t tmpZp1 = 0; + float tmpZp2 = 0; + float e2InScale = 0; + float rowSumScale = 0; + int32_t is2D = 0; + int32_t width = 0; + int32_t height = 0; + int32_t chn = 0; + float in_scale_fl = 1, inFlScale_s2 = 1; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &is2D); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + input_shape = attr[0]->shape; + + if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + input_zp = attr[0]->asymm.zero_point; + scaleIn = attr[0]->asymm.scale; + } + else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) + { + if (attr[0]->dfp.fl > 0) + { + in_scale_fl = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); + } + else + { + in_scale_fl = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); + } + inFlScale_s2 = in_scale_fl * in_scale_fl; + } + + width = input_shape->data[0]; + height = input_shape->data[1]; + chn = attr[1]->shape->data[1]; + if (is2D) + { + height = 1; + } + iter = height * 16; + + if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + sumInZp = input_zp * iter * (-1); + tmpZp1 = (-2) * input_zp; + e2InScale = scaleIn * scaleIn; + tmpZp2 = input_zp * input_zp * e2InScale; + rowSumScale = height * 16 * tmpZp2; + } + + shaderParam.global_scale[0] = 1; + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + shaderParam.local_size[0] = 16; + shaderParam.local_size[1] = 1; + shaderParam.local_size[2] = 1; + + if (attr[0]->dtype == I8 || attr[0]->dtype == U8) + { + shaderParam.global_size[0] = (width + 255) / 256 * 16; + } + else if (attr[0]->dtype == I16 || attr[0]->dtype == F16) + { + shaderParam.global_size[0] = (width + 127) / 128 * 16; + } + shaderParam.global_size[1] = chn; + shaderParam.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + if (attr[0]->dtype == U8) + { + gpu_dp_inst_t uniSumU8_16x1 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0xfedcba98, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniSqrSum_16x1 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0xfedcba98, // ABin + 0x55555555, // BSelt + 0x76543210, 0xfedcba98, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 
0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + status = vsi_nn_kernel_gpu_add_param(node, "uniSumU8_16x1", &uniSumU8_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "uniSqrSum_16x1", &uniSqrSum_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "sumInZp", &sumInZp); + status |= vsi_nn_kernel_gpu_add_param(node, "tmpZp1", &tmpZp1); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); + status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale); + status |= vsi_nn_kernel_gpu_add_param(node, "rowSumScale", &rowSumScale); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + else if (attr[0]->dtype == I8) + { + gpu_dp_inst_t uniSumInt8_16x1 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0xfedcba98, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniSqrSumInt8_16x1 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0xfedcba98, // ABin + 0x55555555, // BSelt + 0x76543210, 0xfedcba98, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param(node, "uniSumInt8_16x1", &uniSumInt8_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "uniSqrSumInt8_16x1", &uniSqrSumInt8_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "input_fl_scale", &in_scale_fl); + status |= vsi_nn_kernel_gpu_add_param(node, "inFlScale_s2", &inFlScale_s2); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + else if (attr[0]->dtype == I16) + { + gpu_dp_inst_t uniInt16SumSqr_dp8x2 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0x76543210, // ABin + 0x5555aaaa, // BSelt + 0x00000000, 0x76543210, // BBin + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + status = vsi_nn_kernel_gpu_add_param(node, "uniInt16SumSqr_dp8x2", &uniInt16SumSqr_dp8x2); + status |= vsi_nn_kernel_gpu_add_param(node, "input_fl_scale", &in_scale_fl); + status |= vsi_nn_kernel_gpu_add_param(node, "inFlScale_s2", &inFlScale_s2); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + else if (attr[0]->dtype == F16) + { + gpu_dp_inst_t uniFp16SumSqr_dp8x2 = {{ + 0x55555555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0x76543210, // ABin + 0x5555aaaa, // BSelt + 0x00000000, 0x76543210, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x3c003c00, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + status = vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", &uniFp16SumSqr_dp8x2); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + + status = vsi_nn_kernel_gpu_add_param(node, "width", &width); + status |= vsi_nn_kernel_gpu_add_param(node, "height", &height); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + + return status; +} + +DEF_KERNEL_INITIALIZER(_groupnorm_mean_vari_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + 
) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + vsi_nn_kernel_tensor_attr_t* attr[1] = {NULL}; + int32_t chn = 0; + int32_t group_stride = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + + chn = attr[0]->shape->data[1]; + group_stride = attr[0]->shape->data[0]; + + shaderParam.global_scale[0] = 4; + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + shaderParam.local_size[0] = 16; + shaderParam.local_size[1] = 1; + shaderParam.local_size[2] = 1; + shaderParam.global_size[0] = 16; + shaderParam.global_size[1] = chn; + shaderParam.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + { + gpu_dp_inst_t uniResetFp32_4x4 = {{ + 0x09090909, // TCfg + 0x00000000, // ASelt + 0x00110000, 0x00330022, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000700, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + status = vsi_nn_kernel_gpu_add_param(node, "uniResetFp32_4x4", &uniResetFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "group_stride", &group_stride); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + + return status; +} + +DEF_KERNEL_INITIALIZER(_groupnorm_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL}; + vsi_int_array_t * input_shape = NULL; + float scaleIn = 1.0f; + float scaleOut = 1.0f; + float reScaleOut_u8 = 1.0f; + float scale_inOut = 1.0f; + int32_t output_zp = 0; + int32_t input_zp = 0; + float in_scale_fl = 1, out_scale_fl = 1, inOut_fl_scale = 1; + int32_t height = 0, width = 0, chn = 0; + int32_t is2D = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); + attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[4] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &is2D); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + input_shape = attr[0]->shape; + + if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + input_zp = attr[0]->asymm.zero_point; + scaleIn = attr[0]->asymm.scale; + } + else if (attr[0]->quant == 
VSI_NN_KERNEL_QUANT_DFP) + { + if (attr[0]->dfp.fl > 0) + { + in_scale_fl = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); + } + else + { + in_scale_fl = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); + } + input_zp = 0; + } + + if (attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + output_zp = attr[2]->asymm.zero_point; + scaleOut = attr[2]->asymm.scale; + reScaleOut_u8 = 1 / scaleOut; + } + else if (attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP) + { + if (attr[2]->dfp.fl > 0) + { + out_scale_fl = (float)((int64_t)1 << attr[2]->dfp.fl); + } + else + { + out_scale_fl = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl)); + } + output_zp = 0; + } + + if ((attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) + && (attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP)) + { + inOut_fl_scale = in_scale_fl * out_scale_fl; + } + + width = input_shape->data[0]; + height = input_shape->data[1]; + chn = attr[1]->shape->data[1]; + if (is2D) + { + height = 1; + } + + shaderParam.global_scale[0] = 16; + if (attr[0]->dtype == I16 || attr[0]->dtype == F16) + { + shaderParam.global_scale[0] = 8; + } + + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0], 4); + shaderParam.global_size[1] = (height + shaderParam.global_scale[1] - 1) + / shaderParam.global_scale[1]; + shaderParam.global_size[2] = chn; + if (is2D) + { + shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0], 4); + shaderParam.global_size[1] = (chn + shaderParam.global_scale[1] - 1) + / shaderParam.global_scale[1]; + shaderParam.global_size[2] = 1; + } + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + { + gpu_dp_inst_t UniFP16toFP32Lo4_dp4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertEndInt16Fp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvert1stUint8SubZpToFp32_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvert2ndUint8SubZpToFp32_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00050004, 0x00070006, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 
0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvert3rdUint8SubZpToFp32_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00090008, 0x000b000a, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvert4thUint8SubZpToFp32_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x000d000c, 0x000f000e, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertInt16Fp32Fst_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertInt16Fp32Secd_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertInt32toInt16_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertDirUint8Fp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertEndUint8Fp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertTrdUint8Fp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00090008, 0x000b000a, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertFthUint8Fp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x000d000c, 0x000f000e, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertHalfToFp16_2x8 = {{ + 
0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16 }; + + uint32_t pack_key = 0; +#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE ) \ + (IN0_TYPE | (OUT_TYPE << 8)) + + pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[2]->dtype ); + + status = vsi_nn_kernel_gpu_add_param(node, "height", &height); + status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", &UniFP16toFP32Lo4_dp4x4); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + switch( pack_key ) + { + case _PACK_SELECT_KEY( I8, I8 ): + case _PACK_SELECT_KEY( I8, F16 ): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", + &uniConvertInt32toUint8_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertDirInt8Fp32_4x4", + &uniConvertDirUint8Fp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertEndInt8Fp32_4x4", + &uniConvertEndUint8Fp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertTrdInt8Fp32_4x4", + &uniConvertTrdUint8Fp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertFthInt8Fp32_4x4", + &uniConvertFthUint8Fp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", + &uniConvertHalfToFp16_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "input_fl_scale", &in_scale_fl); + + status |= vsi_nn_kernel_gpu_add_param(node, "output_fl_scale", &out_scale_fl); + status |= vsi_nn_kernel_gpu_add_param(node, "inOut_fl_scale", &inOut_fl_scale); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( U8, U8 ): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", + &uniConvertInt32toUint8_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", + &uniConvert1stUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4", + &uniConvert2ndUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert3rdUint8SubZpToFp32_4x4", + &uniConvert3rdUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert4thUint8SubZpToFp32_4x4", + &uniConvert4thUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); + + status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP", &output_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &reScaleOut_u8); + + scale_inOut = reScaleOut_u8 * scaleIn; + status |= vsi_nn_kernel_gpu_add_param(node, "scale_inOut", &scale_inOut); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( U8, F16 ): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", + &uniConvert1stUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4", + &uniConvert2ndUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert3rdUint8SubZpToFp32_4x4", + &uniConvert3rdUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert4thUint8SubZpToFp32_4x4", + &uniConvert4thUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", + &uniConvertHalfToFp16_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); + 
status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( I16, I16 ): + case _PACK_SELECT_KEY( I16, F16 ): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt16Fp32Fst_4x4", + &uniConvertInt16Fp32Fst_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt16Fp32Secd_4x4", + &uniConvertInt16Fp32Secd_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "input_fl_scale", &in_scale_fl); + + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toInt16_2x8", + &uniConvertInt32toInt16_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", + &uniConvertHalfToFp16_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "output_fl_scale", &out_scale_fl); + + status |= vsi_nn_kernel_gpu_add_param(node, "inOut_fl_scale", &inOut_fl_scale); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( F16, F16 ): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvertEndInt16Fp32_4x4", + &uniConvertEndInt16Fp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", + &uniConvertHalfToFp16_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( F16, U8 ): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvertEndInt16Fp32_4x4", + &uniConvertEndInt16Fp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", + &uniConvertInt32toUint8_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP", &output_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &reScaleOut_u8); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + default: + VSI_ASSERT( FALSE ); + return VSI_FAILURE; + } +#undef _PACK_SELECT_KEY + } + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + if (attr[2]) + { + vsi_nn_kernel_tensor_attr_release( &attr[2] ); + attr[2] = NULL; + } + + return status; +} + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + const uint32_t hashkey, + _internal_kernel_e kernel_id + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + vx_kernel_initialize_f initializer = NULL; + vx_param_description_t * param_def = NULL; + const _kernel_map_type* kernel_map; + size_t kernel_map_size = 0; + size_t param_size = 0; + uint32_t i = 0; + + switch( kernel_id ) + { + case INTERNAL_KERNEL_SUM_SQR: + initializer = _groupnorm_sum_sqr_initializer; + kernel_map = _groupnorm_sum_sqr_kernel_map; + kernel_map_size = _cnt_of_array( _groupnorm_sum_sqr_kernel_map ); + param_def = _groupnorm_sum_sqr_kernel_param_def; + param_size = _GROUPNORM_SUM_SQR_PARAM_NUM; + break; + case INTERNAL_KERNEL_MEAN_VARI: + initializer = _groupnorm_mean_vari_initializer; + kernel_map = _groupnorm_mean_vari_kernel_map; + kernel_map_size = _cnt_of_array( _groupnorm_mean_vari_kernel_map ); + param_def = _groupnorm_mean_vari_kernel_param_def; + param_size = _GROUPNORM_MEAN_VARI_PARAM_NUM; + break; + case INTERNAL_KERNEL_NORM: + initializer = _groupnorm_initializer; + kernel_map = _groupnorm_kernel_map; + kernel_map_size = _cnt_of_array( _groupnorm_kernel_map ); + param_def = _groupnorm_kernel_param_def; + param_size = _GROUPNORM_PARAM_NUM; + break; + default: + VSI_ASSERT( FALSE ); + return VSI_FAILURE; + } + + for( i = 0; i < kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == 
hashkey ) + { + break; + } + } + if ( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + +static int32_t _optimize_gn_shape + ( + vsi_nn_tensor_t ** inputs, + int32_t group_size, + int32_t group_num, + int32_t* opt_shape, + int32_t* is2D_flg + ) +{ + vsi_status status = VSI_SUCCESS; + int32_t group_shape[VSI_NN_MAX_DIM_NUM] = {0}; + int32_t new_rank = 0; + group_shape[0] = inputs[0]->attr.size[0]; + group_shape[1] = inputs[0]->attr.size[1]; + group_shape[2] = group_size; + + vsi_nn_kernel_optimize_element_shape( group_shape, 3, opt_shape, &new_rank ); + + if (opt_shape[1] == 1) + { + opt_shape[1] = group_num; + opt_shape[2] = 1; + opt_shape[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1; + is2D_flg[0] = 1; + } + else if (new_rank == 2) + { + opt_shape[2] = group_num; + opt_shape[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1; + } + else + { + status = VSI_FAILURE; + } + + return status; +} + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ +#define INTERNAL_KERNEL_SIZE (2) +#define SUM_SQR_INDEX (0) +#define MEAN_VARI_INDEX (1) + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t sum_sqr_node_params[_GROUPNORM_SUM_SQR_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t mean_vari_node_params[_GROUPNORM_MEAN_VARI_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t node_params[_GROUPNORM_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_t tmp_node = NULL, tmp_node1 = NULL; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_kernel_dtype_e in0_dtype = U8; + vsi_nn_kernel_dtype_e out_dtype = U8; + vsi_nn_tensor_attr_t attr; + vsi_nn_kernel_t * ikernels[INTERNAL_KERNEL_SIZE] = { NULL }; + vsi_nn_tensor_t * tensors[INTERNAL_KERNEL_SIZE] = { NULL }; + vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL; + int32_t new_shape[VSI_NN_MAX_DIM_NUM] = { 1, 1, 1, 1 }; + int32_t is2D_flg = 0; + uint32_t hashkeys[INTERNAL_KERNEL_SIZE] = { 0 }; + uint32_t hashkey = 0; + int32_t i = 0; + float rSpaceOrg = 1.0f / (inputs[0]->attr.size[0] * inputs[0]->attr.size[1]); + float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); + int32_t group_num = vsi_nn_kernel_param_get_int32( params, "group_num" ); + int32_t group_size = inputs[0]->attr.size[2] / group_num; + float group_ratio = 1.0f / (inputs[0]->attr.size[0] * inputs[0]->attr.size[1] * group_size); + + // Check if gpu can support the size + if ( !vsi_nn_kernel_gpu_check_shape( + (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + status = _optimize_gn_shape(inputs, group_size, group_num, new_shape, &is2D_flg); + if ( VSI_SUCCESS != status ) + { + goto final; + } + rs_input = vsi_nn_kernel_tensor_reshape(inputs[0]->t, new_shape, 4); + rs_output = vsi_nn_kernel_tensor_reshape(outputs[0]->t, new_shape, 4); + + for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ ) + { + 
ikernels[i] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS ); + // Assign unique_id + ikernels[i]->unique_id = kernel->unique_id; + } + + in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + hashkeys[SUM_SQR_INDEX]= HASH_GROUPNORM_SUM_SQR_KEY( in0_dtype, F32, is2D_flg ); + hashkeys[MEAN_VARI_INDEX]= HASH_GROUPNORM_MEAN_VARI_KEY( F32, F32 ); + hashkey = HASH_GROUPNORM_KEY( in0_dtype, out_dtype, is2D_flg ); + + status = _query_kernel( ikernels[SUM_SQR_INDEX], hashkeys[SUM_SQR_INDEX], INTERNAL_KERNEL_SUM_SQR ); + if ( VSI_SUCCESS != status ) + { + goto final; + } + status = _query_kernel( ikernels[MEAN_VARI_INDEX], hashkeys[MEAN_VARI_INDEX], INTERNAL_KERNEL_MEAN_VARI ); + if ( VSI_SUCCESS != status ) + { + goto final; + } + status = _query_kernel( kernel, hashkey, INTERNAL_KERNEL_NORM ); + if ( VSI_SUCCESS != status ) + { + goto final; + } + + memset( &attr, 0, sizeof(vsi_nn_tensor_attr_t) ); + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; + attr.is_const = FALSE; + attr.vtl = TRUE; + attr.size[0] = ((new_shape[0] + 255) / 256) * 4; + if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16 + || inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16) + { + attr.size[0] = ((new_shape[0] + 127) / 128) * 4; + } + attr.size[1] = group_num; + attr.size[2] = 1; + attr.size[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1; + attr.dim_num = 4; + tensors[SUM_SQR_INDEX] = vsi_nn_CreateTensor( graph, &attr ); + + attr.size[0] = 4; + tensors[MEAN_VARI_INDEX] = vsi_nn_CreateTensor( graph, &attr ); + + // Sum Sqr + tmp_node = vsi_nn_kernel_create_node( graph, ikernels[SUM_SQR_INDEX] ); + if (tmp_node) + { + uint32_t index = 0; + sum_sqr_node_params[index++] = rs_input; + sum_sqr_node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[SUM_SQR_INDEX]->t; + sum_sqr_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); + sum_sqr_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &is2D_flg ); + + status = vsi_nn_kernel_node_pass_param( tmp_node, sum_sqr_node_params, + _GROUPNORM_SUM_SQR_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &sum_sqr_node_params[2] ); + vsi_nn_kernel_scalar_release( &sum_sqr_node_params[3] ); + { + // Set default border mode. + vx_border_t border; + border.mode = VX_BORDER_CONSTANT; + border.constant_value.U8 = 0; + border.constant_value.U16 = 0; + if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) + { + border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; + } + status = vxSetNodeAttribute( (vx_node)tmp_node, VX_NODE_BORDER, &border, sizeof(border) ); + CHECK_STATUS(status); + } + } + + // mean vari + tmp_node1 = vsi_nn_kernel_create_node( graph, ikernels[MEAN_VARI_INDEX] ); + if (tmp_node1) + { + uint32_t index = 0; + mean_vari_node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[SUM_SQR_INDEX]->t; + mean_vari_node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[MEAN_VARI_INDEX]->t; + mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); + mean_vari_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &group_ratio ); + + status = vsi_nn_kernel_node_pass_param( tmp_node1, mean_vari_node_params, + _GROUPNORM_MEAN_VARI_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &mean_vari_node_params[2] ); + vsi_nn_kernel_scalar_release( &mean_vari_node_params[3] ); + { + // Set default border mode. 
+ vx_border_t border; + border.mode = VX_BORDER_CONSTANT; + border.constant_value.U8 = 0; + border.constant_value.U16 = 0; + if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) + { + border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; + } + status = vxSetNodeAttribute( (vx_node)tmp_node1, VX_NODE_BORDER, &border, sizeof(border) ); + CHECK_STATUS(status); + } + } + + // Nomalization + node = vsi_nn_kernel_create_node( graph, kernel ); + if (node) + { + uint32_t index = 0; + int32_t pStride = 0; + if (!is2D_flg) + { + pStride = inputs[1]->attr.size[0] / new_shape[1]; + rSpaceOrg = 1.0f / (new_shape[0] / pStride); + } + node_params[index++] = rs_input; + node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[1]->t; + node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[2]->t; + node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[MEAN_VARI_INDEX]->t; + node_params[index++] = rs_output; + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &is2D_flg ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &rSpaceOrg ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pStride ); + + status = vsi_nn_kernel_node_pass_param( node, node_params, + _GROUPNORM_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &node_params[5] ); + vsi_nn_kernel_scalar_release( &node_params[6] ); + vsi_nn_kernel_scalar_release( &node_params[7] ); + vsi_nn_kernel_scalar_release( &node_params[8] ); + { + // Set default border mode. + vx_border_t border; + border.mode = VX_BORDER_CONSTANT; + border.constant_value.U8 = 0; + border.constant_value.U16 = 0; + if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) + { + border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; + } + status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); + CHECK_STATUS(status); + } + } + + /* Pass parameters to node. 
*/ +final: + if (rs_input) + { + vsi_nn_kernel_tensor_release( &rs_input ); + } + if (rs_output) + { + vsi_nn_kernel_tensor_release( &rs_output ); + } + for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ ) + { + if ( ikernels[i] ) + { + vsi_nn_kernel_release( &ikernels[i] ); + } + if ( tensors[i] ) + { + vsi_nn_ReleaseTensor( &tensors[i] ); + } + } + if (tmp_node) {vsi_nn_kernel_node_release( &tmp_node );} + if (tmp_node1) {vsi_nn_kernel_node_release( &tmp_node1 );} +#undef INTERNAL_KERNEL_SIZE +#undef SUM_SQR_INDEX +#undef MEAN_VARI_INDEX + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( group_norm, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c b/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c index b893e74..ecb7014 100644 --- a/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c @@ -53,6 +53,10 @@ typedef enum #define KERNEL_SOURCE_2 "instance_normalization_u8" #define KERNEL_SOURCE_3 "instance_normalization_i16" #define KERNEL_SOURCE_4 "instance_normalization_f16" +#define KERNEL_SOURCE_5 "instance_normalization_u8_f16" +#define KERNEL_SOURCE_6 "instance_normalization_scale_f32" +#define KERNEL_SOURCE_7 "instance_normalization_scale_f32_f16" +#define KERNEL_SOURCE_8 "instance_normalization_scale_f32_bf16" #define HASH_INSTANCENORM_MEAN_VARI_SH_KERNEL_NAME(SRC0_TYPE) \ CVIVANTE_NAMESPACE("evis.instance_norm_meanvari_"#SRC0_TYPE) @@ -66,6 +70,12 @@ typedef enum #define HASH_INSTANCENORM_SH_KERNEL_2D_NAME(SRC0_TYPE, DST_TYPE) \ CVIVANTE_NAMESPACE("evis.instance_norm_"#SRC0_TYPE"to"#DST_TYPE"_2D") +#define HASH_INSTANCENORM_SCALE_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.instance_norm_"#SRC0_TYPE"F32to"#DST_TYPE) + +#define HASH_INSTANCENORM_SCALE_SH_KERNEL_2D_NAME(SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.instance_norm_"#SRC0_TYPE"F32to"#DST_TYPE"_2D") + // Add kernel hashtable here // mean vari #define HASH_INSTANCENORM_MEAN_VARI_KEY(_input0_type, _output_type, _reshape_flag) \ @@ -82,19 +92,29 @@ typedef enum SOURCE }, // normalization -#define HASH_INSTANCENORM_KEY(_input0_type, _output_type, _reshape_flag) \ - ((_input0_type << 24) | (_output_type << 16) | (_reshape_flag << 8)) +#define HASH_INSTANCENORM_KEY(_input0_type, _input1_type, _output_type, _reshape_flag) \ + ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_reshape_flag << 4)) #define TENSOR_INSTANCENORM_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_INSTANCENORM_KEY(IN0_TYPE, OUT_TYPE, 0), \ + { HASH_INSTANCENORM_KEY(IN0_TYPE, F16, OUT_TYPE, 0), \ HASH_INSTANCENORM_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \ SOURCE }, #define TENSOR_INSTANCENORM_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_INSTANCENORM_KEY(IN0_TYPE, OUT_TYPE, 1), \ + { HASH_INSTANCENORM_KEY(IN0_TYPE, F16, OUT_TYPE, 1), \ HASH_INSTANCENORM_SH_KERNEL_2D_NAME(IN0_TYPE, OUT_TYPE), \ SOURCE }, +#define TENSOR_INSTANCENORM_SCALE_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_INSTANCENORM_KEY(IN0_TYPE, F32, OUT_TYPE, 0), \ + HASH_INSTANCENORM_SCALE_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \ + SOURCE }, + +#define TENSOR_INSTANCENORM_SCALE_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_INSTANCENORM_KEY(IN0_TYPE, F32, OUT_TYPE, 1), \ + HASH_INSTANCENORM_SCALE_SH_KERNEL_2D_NAME(IN0_TYPE, OUT_TYPE), \ + SOURCE }, + typedef struct { uint32_t key; @@ -113,6 +133,8 @@ static const _kernel_map_type _instancenorm_mean_vari_kernel_map[] = 
TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D( I16, F32, KERNEL_SOURCE_3 ) TENSOR_INSTANCENORM_MEAN_VARI_KERNELS( F16, F32, KERNEL_SOURCE_4 ) TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D( F16, F32, KERNEL_SOURCE_4 ) + TENSOR_INSTANCENORM_MEAN_VARI_KERNELS( BF16, F32, KERNEL_SOURCE_8 ) + TENSOR_INSTANCENORM_MEAN_VARI_KERNELS_2D( BF16, F32, KERNEL_SOURCE_8 ) }; static const _kernel_map_type _instancenorm_kernel_map[] = @@ -125,8 +147,8 @@ static const _kernel_map_type _instancenorm_kernel_map[] = TENSOR_INSTANCENORM_KERNELS( U8, U8, KERNEL_SOURCE_2 ) TENSOR_INSTANCENORM_KERNELS_2D( U8, U8, KERNEL_SOURCE_2 ) - TENSOR_INSTANCENORM_KERNELS( U8, F16, KERNEL_SOURCE_2 ) - TENSOR_INSTANCENORM_KERNELS_2D( U8, F16, KERNEL_SOURCE_2 ) + TENSOR_INSTANCENORM_KERNELS( U8, F16, KERNEL_SOURCE_5 ) + TENSOR_INSTANCENORM_KERNELS_2D( U8, F16, KERNEL_SOURCE_5 ) TENSOR_INSTANCENORM_KERNELS( I16, I16, KERNEL_SOURCE_3 ) TENSOR_INSTANCENORM_KERNELS_2D( I16, I16, KERNEL_SOURCE_3 ) @@ -135,6 +157,21 @@ static const _kernel_map_type _instancenorm_kernel_map[] = TENSOR_INSTANCENORM_KERNELS( F16, F16, KERNEL_SOURCE_4 ) TENSOR_INSTANCENORM_KERNELS_2D( F16, F16, KERNEL_SOURCE_4 ) + + TENSOR_INSTANCENORM_SCALE_KERNELS( U8, U8, KERNEL_SOURCE_6 ) + TENSOR_INSTANCENORM_SCALE_KERNELS_2D( U8, U8, KERNEL_SOURCE_6 ) + + TENSOR_INSTANCENORM_SCALE_KERNELS( I8, I8, KERNEL_SOURCE_6 ) + TENSOR_INSTANCENORM_SCALE_KERNELS_2D( I8, I8, KERNEL_SOURCE_6 ) + + TENSOR_INSTANCENORM_SCALE_KERNELS( I16, I16, KERNEL_SOURCE_6 ) + TENSOR_INSTANCENORM_SCALE_KERNELS_2D( I16, I16, KERNEL_SOURCE_6 ) + + TENSOR_INSTANCENORM_SCALE_KERNELS( F16, F16, KERNEL_SOURCE_7 ) + TENSOR_INSTANCENORM_SCALE_KERNELS_2D( F16, F16, KERNEL_SOURCE_7 ) + + TENSOR_INSTANCENORM_SCALE_KERNELS( BF16, BF16, KERNEL_SOURCE_8 ) + TENSOR_INSTANCENORM_SCALE_KERNELS_2D( BF16, BF16, KERNEL_SOURCE_8 ) }; /* @@ -254,7 +291,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer) { shaderParam.global_size[0] = (width + 255) / 256 * 16; } - else if (attr[0]->dtype == I16 || attr[0]->dtype == F16) + else if (attr[0]->dtype == I16 || attr[0]->dtype == F16 || attr[0]->dtype == BF16) { shaderParam.global_size[0] = (width + 127) / 128 * 16; } @@ -350,6 +387,32 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer) status = vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", &uniFp16SumSqr_dp8x2); CHECK_STATUS_FAIL_GOTO(status, OnError ); } + else if (attr[0]->dtype == BF16) + { + gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x05050404, 0x07070606, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + status = vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } status = vsi_nn_kernel_gpu_add_param(node, "width", &width); status |= vsi_nn_kernel_gpu_add_param(node, "height", 
&height); @@ -385,15 +448,13 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) {0, 0, 0}, // localWorkSize: local group size in thread {0, 0, 0}}; // globalWorkSize: image size in thread - vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL}; + vsi_nn_kernel_tensor_attr_t* attr[4] = {NULL, NULL}; vsi_int_array_t * input_shape = NULL; float scaleIn = 1.0f; float scaleOut = 1.0f; - float reScaleOut_u8 = 1.0f; float scale_inOut = 1.0f; int32_t output_zp = 0; int32_t input_zp = 0; - float in_scale_fl = 1, out_scale_fl = 1, inOut_fl_scale = 1; float dimRatio = 0; vx_uint32 group_num = 0; vx_int32 height = 0, width = 0, chn = 0; @@ -401,10 +462,12 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); - attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); - attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[4] ); + attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError ); + attr[3] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[4] ); + CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", OnError ); status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &rsFlg); CHECK_STATUS_FAIL_GOTO(status, OnError ); @@ -420,43 +483,39 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) { if (attr[0]->dfp.fl > 0) { - in_scale_fl = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); + scaleIn = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); } else { - in_scale_fl = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); + scaleIn = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); } input_zp = 0; } - if (attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + if (attr[3]->quant == VSI_NN_KERNEL_QUANT_ASYMM) { - output_zp = attr[2]->asymm.zero_point; - scaleOut = attr[2]->asymm.scale; - reScaleOut_u8 = 1 / scaleOut; + output_zp = attr[3]->asymm.zero_point; + scaleOut = attr[3]->asymm.scale; + scaleOut = 1 / scaleOut; } - else if (attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP) + else if (attr[3]->quant == VSI_NN_KERNEL_QUANT_DFP) { - if (attr[2]->dfp.fl > 0) + if (attr[3]->dfp.fl > 0) { - out_scale_fl = (float)((int64_t)1 << attr[2]->dfp.fl); + scaleOut = (float)((int64_t)1 << attr[3]->dfp.fl); } else { - out_scale_fl = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl)); + scaleOut = (1.0f / (float)((int64_t)1 << -attr[3]->dfp.fl)); } output_zp = 0; } - if ((attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) - && (attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP)) - { - inOut_fl_scale = in_scale_fl * out_scale_fl; - } + scale_inOut = scaleIn * scaleOut; width = input_shape->data[0]; height = input_shape->data[1]; - chn = attr[1]->shape->data[1]; + chn = attr[2]->shape->data[1]; if (rsFlg) { height = height / chn; @@ -467,7 +526,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) group_num = (width + 255) / 256; shaderParam.global_scale[0] = 16; - if (attr[0]->dtype == I16 || attr[0]->dtype == F16) + if (attr[0]->dtype == I16 || attr[0]->dtype == F16 || attr[0]->dtype == BF16) { shaderParam.global_scale[0] = 8; group_num = (width + 127) / 128; @@ -630,23 +689,52 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) 0x00003c00, 0x00003c00, 0x00003c00, 
0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant }, GPU_DP_TYPE_16 }; - uint32_t pack_key = 0; -#define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE ) \ - (IN0_TYPE | (OUT_TYPE << 8)) + gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x05050404, 0x07070606, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniExtractOddData_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x07050301, 0x07050301, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; - pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[2]->dtype ); + uint32_t pack_key = 0; +#define _PACK_SELECT_KEY( IN0_TYPE, IN1_TYPE, OUT_TYPE ) \ + (IN0_TYPE | (IN1_TYPE << 8) | (OUT_TYPE << 16)) + + pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[1]->dtype, attr[3]->dtype ); status = vsi_nn_kernel_gpu_add_param(node, "height", &height); status |= vsi_nn_kernel_gpu_add_param(node, "dimRatio", &dimRatio); status |= vsi_nn_kernel_gpu_add_param(node, "group_num", &group_num); - status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", &UniFP16toFP32Lo4_dp4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", &uniConvertHalfToFp16_2x8); CHECK_STATUS_FAIL_GOTO(status, OnError ); switch( pack_key ) { - case _PACK_SELECT_KEY( I8, I8 ): - case _PACK_SELECT_KEY( I8, F16 ): + case _PACK_SELECT_KEY( I8, F16, I8 ): + case _PACK_SELECT_KEY( I8, F16, F16 ): { status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8); @@ -658,15 +746,42 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) &uniConvertTrdUint8Fp32_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertFthInt8Fp32_4x4", &uniConvertFthUint8Fp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "input_fl_scale", &in_scale_fl); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", + &uniConvertHalfToFp16_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", + &UniFP16toFP32Lo4_dp4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "input_fl_scale", &scaleIn); - status |= vsi_nn_kernel_gpu_add_param(node, "output_fl_scale", &out_scale_fl); - status |= vsi_nn_kernel_gpu_add_param(node, "inOut_fl_scale", &inOut_fl_scale); + status |= vsi_nn_kernel_gpu_add_param(node, "output_fl_scale", &scaleOut); + status |= vsi_nn_kernel_gpu_add_param(node, "inOut_fl_scale", &scale_inOut); CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; - case _PACK_SELECT_KEY( U8, U8 ): - case _PACK_SELECT_KEY( U8, F16 ): + case _PACK_SELECT_KEY( U8, F16, U8 ): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", + &uniConvertInt32toUint8_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", 
+ &uniConvert1stUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4", + &uniConvert2ndUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert3rdUint8SubZpToFp32_4x4", + &uniConvert3rdUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert4thUint8SubZpToFp32_4x4", + &uniConvert4thUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", + &UniFP16toFP32Lo4_dp4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); + + status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP", &output_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &scaleOut); + status |= vsi_nn_kernel_gpu_add_param(node, "scale_inOut", &scale_inOut); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( U8, F32, U8 ): + case _PACK_SELECT_KEY( I8, F32, I8 ): { status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8); @@ -679,37 +794,85 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert4thUint8SubZpToFp32_4x4", &uniConvert4thUint8SubZpToFp32_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); - status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP", &output_zp); - status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &reScaleOut_u8); - - scale_inOut = reScaleOut_u8 * scaleIn; + status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &scaleOut); status |= vsi_nn_kernel_gpu_add_param(node, "scale_inOut", &scale_inOut); CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; - case _PACK_SELECT_KEY( I16, I16 ): - case _PACK_SELECT_KEY( I16, F16 ): + case _PACK_SELECT_KEY( U8, F16, F16 ): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", + &uniConvert1stUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4", + &uniConvert2ndUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert3rdUint8SubZpToFp32_4x4", + &uniConvert3rdUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert4thUint8SubZpToFp32_4x4", + &uniConvert4thUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", + &uniConvertHalfToFp16_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", + &UniFP16toFP32Lo4_dp4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( I16, F16, I16 ): + case _PACK_SELECT_KEY( I16, F16, F16 ): { status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt16Fp32Fst_4x4", &uniConvertInt16Fp32Fst_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt16Fp32Secd_4x4", &uniConvertInt16Fp32Secd_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "input_fl_scale", &in_scale_fl); + status |= vsi_nn_kernel_gpu_add_param(node, "input_fl_scale", &scaleIn); status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toInt16_2x8", &uniConvertInt32toInt16_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "output_fl_scale", &out_scale_fl); - - status |= vsi_nn_kernel_gpu_add_param(node, "inOut_fl_scale", &inOut_fl_scale); 
+ status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", + &uniConvertHalfToFp16_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", + &UniFP16toFP32Lo4_dp4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "output_fl_scale", &scaleOut); + status |= vsi_nn_kernel_gpu_add_param(node, "inOut_fl_scale", &scale_inOut); CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; - case _PACK_SELECT_KEY( F16, F16 ): + case _PACK_SELECT_KEY( I16, F32, I16 ): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt16Fp32Fst_4x4", + &uniConvertInt16Fp32Fst_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt16Fp32Secd_4x4", + &uniConvertInt16Fp32Secd_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toInt16_2x8", + &uniConvertInt32toInt16_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "output_fl_scale", &scaleOut); + status |= vsi_nn_kernel_gpu_add_param(node, "inOut_fl_scale", &scale_inOut); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( F16, F16, F16 ): + case _PACK_SELECT_KEY( F16, F32, F16 ): { status = vsi_nn_kernel_gpu_add_param(node, "uniConvertEndInt16Fp32_4x4", &uniConvertEndInt16Fp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertHalfToFp16_2x8", + &uniConvertHalfToFp16_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", + &UniFP16toFP32Lo4_dp4x4); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( BF16, F32, BF16 ): + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtractOddData_2x8", &uniExtractOddData_2x8 ); CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; @@ -736,6 +899,11 @@ OnError: vsi_nn_kernel_tensor_attr_release( &attr[2] ); attr[2] = NULL; } + if (attr[3]) + { + vsi_nn_kernel_tensor_attr_release( &attr[3] ); + attr[3] = NULL; + } return status; } @@ -826,11 +994,13 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t tmp_node = NULL; vsi_nn_kernel_node_t node = NULL; vsi_nn_kernel_dtype_e in0_dtype = U8; + vsi_nn_kernel_dtype_e in1_dtype = F16; vsi_nn_kernel_dtype_e out_dtype = U8; vsi_nn_tensor_attr_t attr; vsi_nn_kernel_t * ikernels[INTERNAL_KERNEL_SIZE] = { NULL }; vsi_nn_tensor_t * tensors[INTERNAL_KERNEL_SIZE] = { NULL }; vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL, rs_gamma = NULL, rs_beta = NULL; + int32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; uint32_t hashkeys[INTERNAL_KERNEL_SIZE] = { 0 }; uint32_t hashkey = 0; int32_t i = 0; @@ -851,29 +1021,12 @@ static vsi_nn_kernel_node_t _setup ikernels[i]->unique_id = kernel->unique_id; } - memset( &attr, 0, sizeof(vsi_nn_tensor_attr_t) ); - attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; - attr.is_const = FALSE; - attr.vtl = TRUE; - - attr.size[0] = ((inputs[0]->attr.size[0] + 255) / 256) * 4; - - if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16 - || inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16) - { - attr.size[0] = ((inputs[0]->attr.size[0] + 127) / 128) * 4; - } - attr.size[1] = inputs[0]->attr.dim_num > 2 ? inputs[0]->attr.size[2] : 1; - attr.size[2] = 1; - attr.size[3] = inputs[0]->attr.dim_num > 3 ? 
inputs[0]->attr.size[3] : 1; - attr.dim_num = 4; - tensors[MEAN_VARI_INDEX] = vsi_nn_CreateTensor( graph, &attr ); - in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in1_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); hashkeys[MEAN_VARI_INDEX]= HASH_INSTANCENORM_MEAN_VARI_KEY( in0_dtype, F32, reshape_flg ); - hashkey = HASH_INSTANCENORM_KEY( in0_dtype, out_dtype, reshape_flg ); + hashkey = HASH_INSTANCENORM_KEY( in0_dtype, in1_dtype, out_dtype, reshape_flg ); status = _query_kernel( ikernels[MEAN_VARI_INDEX], hashkeys[MEAN_VARI_INDEX], INTERNAL_KERNEL_MEAN_VARI ); if ( VSI_SUCCESS != status ) @@ -888,22 +1041,54 @@ static vsi_nn_kernel_node_t _setup if (reshape_flg) { - int32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; shape[0] = inputs[0]->attr.size[0]; shape[1] = inputs[0]->attr.size[1] * inputs[0]->attr.size[2]; shape[2] = 1; shape[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1; rs_input = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shape, 4 ); - - shape[0] = outputs[0]->attr.size[0]; - shape[1] = outputs[0]->attr.size[1] * outputs[0]->attr.size[2]; - shape[2] = 1; - shape[3] = outputs[0]->attr.dim_num > 3 ? outputs[0]->attr.size[3] : 1; rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shape, 4 ); } + else if (inputs[0]->attr.size[0] * inputs[0]->attr.size[1] < GPU_TENSOR_MAX_WIDTH) + { + shape[0] = inputs[0]->attr.size[0] * inputs[0]->attr.size[1]; + shape[1] = 1; + shape[2] = inputs[0]->attr.size[2]; + shape[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1; + rs_input = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shape, 4 ); + rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shape, 4 ); + } + else if (inputs[0]->attr.size[0] < inputs[0]->attr.size[1]) + { + shape[0] = inputs[0]->attr.size[1]; + shape[1] = inputs[0]->attr.size[0]; + shape[2] = inputs[0]->attr.size[2]; + shape[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1; + rs_input = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shape, 4 ); + rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shape, 4 ); + } + else + { + shape[0] = inputs[0]->attr.size[0]; + } + + memset( &attr, 0, sizeof(vsi_nn_tensor_attr_t) ); + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; + attr.is_const = FALSE; + attr.vtl = TRUE; + attr.size[0] = ((shape[0] + 255) / 256) * 4; + if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16 + || inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16) + { + attr.size[0] = ((shape[0] + 127) / 128) * 4; + } + attr.size[1] = inputs[0]->attr.dim_num > 2 ? inputs[0]->attr.size[2] : 1; + attr.size[2] = 1; + attr.size[3] = inputs[0]->attr.dim_num > 3 ? 
inputs[0]->attr.size[3] : 1; + attr.dim_num = 4; + tensors[MEAN_VARI_INDEX] = vsi_nn_CreateTensor( graph, &attr ); + if (inputs[1]->attr.dim_num < 2) { - int32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; shape[0] = inputs[1]->attr.size[0]; shape[1] = 1; shape[2] = 1; @@ -912,7 +1097,6 @@ static vsi_nn_kernel_node_t _setup } if (inputs[2]->attr.dim_num < 2) { - int32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; shape[0] = inputs[2]->attr.size[0]; shape[1] = 1; shape[2] = 1; @@ -925,7 +1109,7 @@ static vsi_nn_kernel_node_t _setup if (tmp_node) { uint32_t index = 0; - if (reshape_flg) + if (rs_input) { mean_vari_node_params[index++] = rs_input; vsi_nn_kernel_node_pack_io( &mean_vari_node_params[index], @@ -967,7 +1151,7 @@ static vsi_nn_kernel_node_t _setup if (node) { uint32_t index = 0; - if (reshape_flg) + if (rs_input) { node_params[index++] = rs_input; } @@ -992,7 +1176,7 @@ static vsi_nn_kernel_node_t _setup node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[2]->t; } node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[MEAN_VARI_INDEX]->t; - if (reshape_flg) + if (rs_output) { node_params[index++] = rs_output; } @@ -1034,9 +1218,12 @@ final: { vsi_nn_kernel_tensor_release( &rs_gamma ); } - if (reshape_flg) + if (rs_input) { vsi_nn_kernel_tensor_release( &rs_input ); + } + if (rs_output) + { vsi_nn_kernel_tensor_release( &rs_output ); } for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ ) diff --git a/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c b/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c index 238eb23..d6c4b8a 100644 --- a/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c @@ -60,6 +60,9 @@ __BEGIN_DECLS #define KERNEL_SOURCE_5 "layer_normalization_wh_f16" #define KERNEL_SOURCE_6 "layer_normalization_i16" #define KERNEL_SOURCE_7 "layer_normalization_wh_i16" +#define KERNEL_SOURCE_8 "layer_normalization_scale_f32" +#define KERNEL_SOURCE_9 "layer_normalization_scale_f32_2d" +#define KERNEL_SOURCE_10 "layer_normalization_scale_f32_bf16" #define HASH_LAYERNORM_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \ @@ -68,20 +71,36 @@ __BEGIN_DECLS #define HASH_LAYERNORM_SH_KERNEL_2D_NAME(SRC0_TYPE, DST_TYPE) \ CVIVANTE_NAMESPACE("evis.layer_norm_"#SRC0_TYPE"to"#DST_TYPE"_2D") +#define HASH_LAYERNORM_SH_KERNEL_SCALE_NAME(SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.layer_norm_"#SRC0_TYPE"F32to"#DST_TYPE) + +#define HASH_LAYERNORM_SH_KERNEL_SCALE_2D_NAME(SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.layer_norm_"#SRC0_TYPE"F32to"#DST_TYPE"_2D") + // normalization -#define HASH_LAYERNORM_KEY(_input0_type, _output_type, _reshape_flag) \ - ((_input0_type << 24) | (_output_type << 16) | (_reshape_flag << 8)) +#define HASH_LAYERNORM_KEY(_input0_type, _input2_type, _output_type, _reshape_flag) \ + ((_input0_type << 24) | (_input2_type << 16) | (_output_type << 8) | _reshape_flag) #define TENSOR_LAYERNORM_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_LAYERNORM_KEY(IN0_TYPE, OUT_TYPE, LAYERNORM_KERNEL), \ + { HASH_LAYERNORM_KEY(IN0_TYPE, F16, OUT_TYPE, LAYERNORM_KERNEL), \ HASH_LAYERNORM_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \ SOURCE }, #define TENSOR_LAYERNORM_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_LAYERNORM_KEY(IN0_TYPE, OUT_TYPE, LAYERNORM_2D_KERNEL), \ + { HASH_LAYERNORM_KEY(IN0_TYPE, F16, OUT_TYPE, LAYERNORM_2D_KERNEL), \ HASH_LAYERNORM_SH_KERNEL_2D_NAME(IN0_TYPE, OUT_TYPE), \ SOURCE }, +#define TENSOR_LAYERNORM_SCALE_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_LAYERNORM_KEY(IN0_TYPE, 
F32, OUT_TYPE, LAYERNORM_KERNEL), \ + HASH_LAYERNORM_SH_KERNEL_SCALE_NAME(IN0_TYPE, OUT_TYPE), \ + SOURCE }, + +#define TENSOR_LAYERNORM_SCALE_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_LAYERNORM_KEY(IN0_TYPE, F32, OUT_TYPE, LAYERNORM_2D_KERNEL), \ + HASH_LAYERNORM_SH_KERNEL_SCALE_2D_NAME(IN0_TYPE, OUT_TYPE), \ + SOURCE }, + // greater than max size #define HASH_SUMSQR_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \ CVIVANTE_NAMESPACE("evis.layernorm_wh_sumSqr_"#SRC0_TYPE"to"#DST_TYPE) @@ -96,22 +115,22 @@ __BEGIN_DECLS CVIVANTE_NAMESPACE("evis.layernorm_wh_"#SRC0_TYPE"to"#DST_TYPE"_2D") #define TENSOR_SUMSQR_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_LAYERNORM_KEY(IN0_TYPE, OUT_TYPE, SUMSQR_KERNEL), \ + { HASH_LAYERNORM_KEY(IN0_TYPE, F16, OUT_TYPE, SUMSQR_KERNEL), \ HASH_SUMSQR_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \ SOURCE }, #define TENSOR_SUMSQR_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_LAYERNORM_KEY(IN0_TYPE, OUT_TYPE, SUMSQR_2D_KERNEL), \ + { HASH_LAYERNORM_KEY(IN0_TYPE, F16, OUT_TYPE, SUMSQR_2D_KERNEL), \ HASH_SUMSQR_SH_KERNEL_2D_NAME(IN0_TYPE, OUT_TYPE), \ SOURCE }, #define TENSOR_LAYERNORM_WH_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_LAYERNORM_KEY(IN0_TYPE, OUT_TYPE, LAYERNORM_WH_KERNEL), \ + { HASH_LAYERNORM_KEY(IN0_TYPE, F16, OUT_TYPE, LAYERNORM_WH_KERNEL), \ HASH_LAYERNORM_WH_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \ SOURCE }, #define TENSOR_LAYERNORM_WH_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_LAYERNORM_KEY(IN0_TYPE, OUT_TYPE, LAYERNORM_WH_2D_KERNEL), \ + { HASH_LAYERNORM_KEY(IN0_TYPE, F16, OUT_TYPE, LAYERNORM_WH_2D_KERNEL), \ HASH_LAYERNORM_WH_SH_KERNEL_2D_NAME(IN0_TYPE, OUT_TYPE), \ SOURCE }, @@ -136,6 +155,17 @@ static const _kernel_map_type _layernorm_kernel_map[] = TENSOR_LAYERNORM_KERNELS_2D( F16, U8, KERNEL_SOURCE_2 ) TENSOR_LAYERNORM_KERNELS( I16, I16, KERNEL_SOURCE_6 ) TENSOR_LAYERNORM_KERNELS_2D( I16, I16, KERNEL_SOURCE_6 ) + + TENSOR_LAYERNORM_SCALE_KERNELS( U8, U8, KERNEL_SOURCE_8 ) + TENSOR_LAYERNORM_SCALE_KERNELS_2D( U8, U8, KERNEL_SOURCE_9 ) + TENSOR_LAYERNORM_SCALE_KERNELS( I8, I8, KERNEL_SOURCE_8 ) + TENSOR_LAYERNORM_SCALE_KERNELS_2D( I8, I8, KERNEL_SOURCE_9 ) + TENSOR_LAYERNORM_SCALE_KERNELS( I16, I16, KERNEL_SOURCE_8 ) + TENSOR_LAYERNORM_SCALE_KERNELS_2D( I16, I16, KERNEL_SOURCE_9 ) + TENSOR_LAYERNORM_SCALE_KERNELS( F16, F16, KERNEL_SOURCE_8 ) + TENSOR_LAYERNORM_SCALE_KERNELS_2D( F16, F16, KERNEL_SOURCE_9 ) + TENSOR_LAYERNORM_SCALE_KERNELS( BF16, BF16, KERNEL_SOURCE_10 ) + TENSOR_LAYERNORM_SCALE_KERNELS_2D( BF16, BF16, KERNEL_SOURCE_10 ) }; static const _kernel_map_type _sumsqr_kernel_map[] = @@ -295,8 +325,7 @@ DEF_KERNEL_INITIALIZER(_layernorm_initializer) shaderParam.global_scale[1] = 1; shaderParam.global_scale[2] = 1; shaderParam.global_size[0] = 1; - shaderParam.global_size[1] = gpu_align_p2((height + shaderParam.global_scale[1] - 1) - / shaderParam.global_scale[1], 4); + shaderParam.global_size[1] = height; shaderParam.global_size[2] = chn; status = vsi_nn_kernel_gpu_config( node, &shaderParam ); @@ -424,6 +453,37 @@ DEF_KERNEL_INITIALIZER(_layernorm_initializer) 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + 
}, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x05050404, 0x07070606, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniExtractOddData_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x07050301, 0x07050301, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + uint32_t pack_key = 0; #define _PACK_SELECT_KEY( IN0_TYPE, IN1_TYPE, OUT_TYPE ) \ (IN0_TYPE | (IN1_TYPE << 16) | (OUT_TYPE << 8)) @@ -432,9 +492,6 @@ DEF_KERNEL_INITIALIZER(_layernorm_initializer) status = vsi_nn_kernel_gpu_add_param(node, "width", &width); status |= vsi_nn_kernel_gpu_add_param(node, "dimRatio", &dimRatio); - status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", &UniFP16toFP32Lo4_dp4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecFp16Fp32_4x4", &uniConvertSecFp16Fp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale); CHECK_STATUS_FAIL_GOTO(status, OnError ); switch( pack_key ) @@ -453,6 +510,11 @@ DEF_KERNEL_INITIALIZER(_layernorm_initializer) &uniConvert3rdUint8SubZpToFp32_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert4thUint8SubZpToFp32_4x4", &uniConvert4thUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecFp16Fp32_4x4", + &uniConvertSecFp16Fp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", + &UniFP16toFP32Lo4_dp4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale); status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); status |= vsi_nn_kernel_gpu_add_param(node, "sumInZp", &sumInZp); @@ -481,6 +543,11 @@ DEF_KERNEL_INITIALIZER(_layernorm_initializer) &uniConvert3rdUint8SubZpToFp32_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert4thUint8SubZpToFp32_4x4", &uniConvert4thUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecFp16Fp32_4x4", + &uniConvertSecFp16Fp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", + &UniFP16toFP32Lo4_dp4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale); status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); status |= vsi_nn_kernel_gpu_add_param(node, "sumInZp", &sumInZp); @@ -501,7 +568,11 @@ DEF_KERNEL_INITIALIZER(_layernorm_initializer) &uniConvert2ndUint8SubZpToFp32_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8); - + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecFp16Fp32_4x4", + &uniConvertSecFp16Fp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", + &UniFP16toFP32Lo4_dp4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale); status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); @@ -510,6 +581,70 @@ 
DEF_KERNEL_INITIALIZER(_layernorm_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; + case _PACK_SELECT_KEY( U8, F32, U8 ): + case _PACK_SELECT_KEY( F16, F32, F16 ): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniFp16SumSqr_dp8x2", + &uniFp16SumSqr_dp8x2); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractHalf4_dp4x4", + &uniExtractHalf4_dp4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", + &uniConvertInt32toUint8_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniSumU8_16x1", &uniSumU8_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "uniSqrSum_16x1", &uniSqrSum_16x1); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", + &uniConvert1stUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4", + &uniConvert2ndUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert3rdUint8SubZpToFp32_4x4", + &uniConvert3rdUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert4thUint8SubZpToFp32_4x4", + &uniConvert4thUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", + &UniFP16toFP32Lo4_dp4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale); + status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); + status |= vsi_nn_kernel_gpu_add_param(node, "sumInZp", &sumInZp); + status |= vsi_nn_kernel_gpu_add_param(node, "tmpZp1", &tmpZp1); + status |= vsi_nn_kernel_gpu_add_param(node, "tmpZp2", &tmpZp2); + status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &scaleOut); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( I16, F32, I16 ): + { + status = vsi_nn_kernel_gpu_add_param(node, "uniInt16SumSqr_dp8x2", + &uniInt16SumSqr_dp8x2); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert1stUint8SubZpToFp32_4x4", + &uniConvert1stUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert2ndUint8SubZpToFp32_4x4", + &uniConvert2ndUint8SubZpToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", + &uniConvertInt32toUint8_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", + &UniFP16toFP32Lo4_dp4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale); + status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); + status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &scaleOut); + status |= vsi_nn_kernel_gpu_add_param(node, "dimRatio_scale", &dimRatio_scale); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case _PACK_SELECT_KEY( BF16, F32, BF16 ): + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtractOddData_2x8", &uniExtractOddData_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; default: VSI_ASSERT( FALSE ); return VSI_FAILURE; @@ -949,6 +1084,7 @@ static vsi_status _query_kernel { vsi_status status = VSI_FAILURE; vsi_nn_kernel_dtype_e input0_dtype = U8; + 
vsi_nn_kernel_dtype_e input2_dtype = F16; vsi_nn_kernel_dtype_e output_dtype = U8; uint32_t key = 0; int i = 0; @@ -960,9 +1096,10 @@ static vsi_status _query_kernel } input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + input2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - key = HASH_LAYERNORM_KEY( input0_dtype, output_dtype, kernel_type ); + key = HASH_LAYERNORM_KEY( input0_dtype, input2_dtype, output_dtype, kernel_type ); for( i = 0; i < _cnt_of_array(_layernorm_kernel_map); i ++ ) { @@ -1000,14 +1137,16 @@ static vsi_status _query_kernel_wh { vsi_status status = VSI_FAILURE; vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e input2_dtype = F16; vsi_nn_kernel_dtype_e output_dtype = U8; uint32_t key = 0; int i = 0; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + input2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - key = HASH_LAYERNORM_KEY( input0_dtype, F32, is2D_sumsqr ); + key = HASH_LAYERNORM_KEY( input0_dtype, input2_dtype, F32, is2D_sumsqr ); for( i = 0; i < _cnt_of_array(_sumsqr_kernel_map); i ++ ) { @@ -1031,7 +1170,7 @@ static vsi_status _query_kernel_wh } - key = HASH_LAYERNORM_KEY( input0_dtype, output_dtype, is2D_wh ); + key = HASH_LAYERNORM_KEY( input0_dtype, input2_dtype, output_dtype, is2D_wh ); for( i = 0; i < _cnt_of_array(_sumsqr_kernel_map); i ++ ) { @@ -1256,17 +1395,25 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t node_params[_LAYERNORM_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_t node = NULL; vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL, rs_gamma = NULL, rs_beta = NULL; - int32_t rs_flg = vsi_nn_kernel_param_get_int32( params, "reshape_flg" ); - int32_t wh_flg = vsi_nn_kernel_param_get_int32( params, "wh_flg" ); - int32_t optFlg = rs_flg || (outputs[0]->attr.dim_num < 3); float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); + uint32_t *input_size = inputs[0]->attr.size; + uint32_t dims_num = inputs[0]->attr.dim_num; + int32_t rs_flg = 0; + int32_t optFlg = 0; - if (wh_flg) + if (input_size[0] >= GPU_TENSOR_MAX_WIDTH) { node = _setup_wh(graph, inputs, input_num, outputs, output_num, params, kernel); goto final; } + if ((input_size[1] * input_size[2] < GPU_TENSOR_MAX_WIDTH) + && dims_num > 2) + { + rs_flg = 1; + } + optFlg = rs_flg || (outputs[0]->attr.dim_num < 3); + status = _query_kernel( inputs, outputs, kernel, optFlg); if (VSI_SUCCESS != status) { diff --git a/src/tim/vx/internal/src/kernel/evis/maximum_evis.c b/src/tim/vx/internal/src/kernel/evis/maximum_evis.c index 5b896c6..68dc6e8 100644 --- a/src/tim/vx/internal/src/kernel/evis/maximum_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/maximum_evis.c @@ -241,7 +241,7 @@ DEF_KERNEL_INITIALIZER(_maximum_initializer) pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[1]->dtype, attr[2]->dtype ); - if ( (attr[2]->dtype == F16 || attr[2]->dtype == I16) + if ( (attr[2]->dtype == F16 || attr[2]->dtype == I16 || attr[2]->dtype == BF16) || ((attr[2]->dtype == I8 || attr[2]->dtype == U8) && (attr[0]->dtype == F16 && attr[1]->dtype == F16)) ) { gpu_param.global_scale[0] = 8; diff --git a/src/tim/vx/internal/src/kernel/evis/minimum_evis.c b/src/tim/vx/internal/src/kernel/evis/minimum_evis.c index 356b93f..02d7523 100644 --- a/src/tim/vx/internal/src/kernel/evis/minimum_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/minimum_evis.c @@ 
-241,7 +241,7 @@ DEF_KERNEL_INITIALIZER(_minimum_initializer) pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[1]->dtype, attr[2]->dtype ); - if ( (attr[2]->dtype == F16 || attr[2]->dtype == I16) + if ( (attr[2]->dtype == F16 || attr[2]->dtype == I16 || attr[2]->dtype == BF16) || ((attr[2]->dtype == I8 || attr[2]->dtype == U8) && (attr[0]->dtype == F16 && attr[1]->dtype == F16)) ) { gpu_param.global_scale[0] = 8; diff --git a/src/tim/vx/internal/src/kernel/evis/one_hot_evis.c b/src/tim/vx/internal/src/kernel/evis/one_hot_evis.c new file mode 100644 index 0000000..f1798c2 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/one_hot_evis.c @@ -0,0 +1,460 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_dtype_util.h" + +__BEGIN_DECLS + +#define _ONE_HOT_KERNEL_SOURCE "one_hot" + +// Add kernel hashtable here +#define HASH_ONE_HOT_SH_KERNEL_NAME(SRC_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.one_hot_"#SRC_TYPE"to"#DST_TYPE) + +#define ONE_HOT_HASH_KEY( IN_DTYPE, OUT_DTYPE, IMG_2D ) \ + (( IN_DTYPE << 9 ) | ( OUT_DTYPE << 1) | (IMG_2D)) + +#define PACK_ONE_HOT_KERNEL_3D( IN_DTYPE, OUT_DTYPE ) \ +{ ONE_HOT_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0 ), \ + CVIVANTE_NAMESPACE("evis.one_hot_"#IN_DTYPE"to"#OUT_DTYPE), \ + _ONE_HOT_KERNEL_SOURCE } + +#define PACK_ONE_HOT_KERNEL_2D( IN_DTYPE, OUT_DTYPE ) \ +{ ONE_HOT_HASH_KEY( IN_DTYPE, OUT_DTYPE, 1 ), \ + CVIVANTE_NAMESPACE("evis.one_hot_"#IN_DTYPE"to"#OUT_DTYPE"_2D"), \ + _ONE_HOT_KERNEL_SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _one_hot_kernel_map[] = +{ + // Register kernel here + PACK_ONE_HOT_KERNEL_3D( U8, U8 ), + PACK_ONE_HOT_KERNEL_3D( U8, F16 ), + PACK_ONE_HOT_KERNEL_3D( I8, I8 ), + PACK_ONE_HOT_KERNEL_3D( I8, F16 ), + PACK_ONE_HOT_KERNEL_3D( I16, I16 ), + PACK_ONE_HOT_KERNEL_3D( I16, F16 ), + PACK_ONE_HOT_KERNEL_3D( F16, F16 ), + PACK_ONE_HOT_KERNEL_3D( F16, I16 ), + PACK_ONE_HOT_KERNEL_3D( F16, U8 ), + PACK_ONE_HOT_KERNEL_3D( F16, I8 ), + + PACK_ONE_HOT_KERNEL_2D( U8, U8 ), + PACK_ONE_HOT_KERNEL_2D( U8, F16 ), + PACK_ONE_HOT_KERNEL_2D( I8, I8 ), + PACK_ONE_HOT_KERNEL_2D( I8, F16 ), + PACK_ONE_HOT_KERNEL_2D( I16, I16 ), + PACK_ONE_HOT_KERNEL_2D( I16, F16 ), + PACK_ONE_HOT_KERNEL_2D( F16, F16 ), + PACK_ONE_HOT_KERNEL_2D( F16, I16 ), + PACK_ONE_HOT_KERNEL_2D( F16, U8 ), + PACK_ONE_HOT_KERNEL_2D( F16, I8 ), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _one_hot_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define SCALAR_INPUT_SUFFIX_SIZE (2) +#define SCALAR_INPUT_ON_VALUE (3) +#define SCALAR_INPUT_OFF_VALUE (4) +#define _ONE_HOT_PARAM_NUM _cnt_of_array( _one_hot_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_one_hot_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 2, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0} // globalWorkSize: image size in thread + }; + + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; + vsi_int_array_t * in_shape = NULL; + int32_t suffix_size = 0; + int32_t depth = 0; + int32_t input_zp = 0; + float scaleIn = 1.0f; + int32_t srcFixPointPos = 0; + vsi_nn_kernel_dtype_e input_dtype = F16; + + attr[0] = vsi_nn_kernel_tensor_attr_create( 
(vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_SUFFIX_SIZE], &(suffix_size)); + + in_shape = attr[0]->shape; + depth = attr[1]->shape->data[1]; + input_dtype = attr[0]->dtype; + + if (VSI_NN_KERNEL_QUANT_DFP == attr[0]->quant) + { + srcFixPointPos = attr[0]->dfp.fl; + } + else if (VSI_NN_KERNEL_QUANT_ASYMM == attr[0]->quant) + { + input_zp = attr[0]->asymm.zero_point; + scaleIn = attr[0]->asymm.scale; + } + + if (suffix_size == 1) + { + gpu_param.global_scale[0] = 4; + gpu_param.global_scale[1] = 1; + + depth = attr[1]->shape->data[0]; + } + else + { + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + } + + gpu_param.global_size[0] = gpu_align_p2( + (in_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = in_shape->data[1]; + + switch (input_dtype) + { + case I16: + case I8: + case F16: + { + gpu_dp_inst_t uniDataConvert_0_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniDataConvert_1_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniExtractInteger_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_update_postshfit( &uniDataConvert_0_4x4, srcFixPointPos ); + gpu_dp_inst_update_postshfit( &uniDataConvert_1_4x4, srcFixPointPos ); + + status = vsi_nn_kernel_gpu_add_param( node, + "uniDataConvert_0_4x4", &uniDataConvert_0_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniDataConvert_1_4x4", &uniDataConvert_1_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtract8Data_2x8", &uniExtractInteger_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "depth", &depth ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + case U8: + { + gpu_dp_inst_t uniDataConvert_0_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniDataConvert_1_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 
0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniExtractInteger_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + float input_tail = 0 - (float)input_zp * scaleIn; + + status = vsi_nn_kernel_gpu_add_param( node, + "uniDataConvert_0_4x4", &uniDataConvert_0_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniDataConvert_1_4x4", &uniDataConvert_1_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtract8Data_2x8", &uniExtractInteger_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "input_scale", &scaleIn ); + status |= vsi_nn_kernel_gpu_add_param( node, + "input_tail", &input_tail ); + status |= vsi_nn_kernel_gpu_add_param( node, + "depth", &depth ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + default: + break; + } + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if ( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(attr[0]); + SAFE_FREE_TENSOR_ATTR(attr[1]); +#undef SAFE_FREE_TENSOR_ATTR + + return status; +} /* _one_hot_initializer() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool image_2d + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _one_hot_kernel_map; + size_t kernel_map_size = _cnt_of_array( _one_hot_kernel_map ); + vx_param_description_t * param_def = _one_hot_kernel_param_def; + vx_kernel_initialize_f initializer = _one_hot_initializer; + + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = ONE_HOT_HASH_KEY( in_dtype, out_dtype, image_2d ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _one_hot_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_ONE_HOT_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_tensor_t* rs_tensors[2] = { NULL }; + int32_t shape[2][VSI_NN_MAX_DIM_NUM] = {{ 0 }}; + int32_t i = 0; + vsi_bool image_2d = FALSE; + int32_t num_elements = vsi_nn_vxGetTensorElementNum(&inputs[0]->attr); + int32_t 
prefix_dim_size = 1; + int32_t suffix_dim_size = 0; + int32_t depth = vsi_nn_kernel_param_get_int32( params, "depth" ); + uint32_t data_u32[2] = {0}; + float on_value = vsi_nn_kernel_param_get_float32( params, "on_value" ); + float off_value = vsi_nn_kernel_param_get_float32( params, "off_value" ); + int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); + + vsi_nn_Float32ToDtype(on_value, (uint8_t*)&data_u32[0], &outputs[0]->attr.dtype); + vsi_nn_Float32ToDtype(off_value, (uint8_t*)&data_u32[1], &outputs[0]->attr.dtype); + + axis = axis == -1 ? (int32_t)inputs[0]->attr.dim_num : (int32_t)inputs[0]->attr.dim_num - axis; + for (i = 0; i < axis; i++) + { + prefix_dim_size *= inputs[0]->attr.size[i]; + } + + suffix_dim_size = num_elements / prefix_dim_size; + + if (suffix_dim_size == 1) + { + shape[0][0] = prefix_dim_size; + shape[0][1] = 1; + shape[1][0] = depth; + shape[1][1] = prefix_dim_size; + shape[1][2] = 1; + } + else + { + shape[0][0] = suffix_dim_size; + shape[0][1] = prefix_dim_size; + shape[1][0] = suffix_dim_size; + shape[1][1] = depth; + shape[1][2] = prefix_dim_size; + } + + rs_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], (uint32_t*)shape[0], 2 ); + rs_tensors[1] = vsi_nn_reshape_tensor( graph, + outputs[0], (uint32_t*)shape[1], 3 ); + + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)rs_tensors[1]->attr.size, + rs_tensors[1]->attr.dim_num ) ) + { + return NULL; + } + + image_2d = suffix_dim_size == 1; + + status = _query_kernel( kernel, inputs, outputs, image_2d ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _ONE_HOT_PARAM_NUM, + &rs_tensors[0], input_num, &rs_tensors[1], output_num ); + node_params[SCALAR_INPUT_SUFFIX_SIZE] = vsi_nn_kernel_scalar_create( + graph, I32, &suffix_dim_size ); + node_params[SCALAR_INPUT_ON_VALUE] = vsi_nn_kernel_scalar_create( + graph, U32, &data_u32[0] ); + node_params[SCALAR_INPUT_OFF_VALUE] = vsi_nn_kernel_scalar_create( + graph, U32, &data_u32[1] ); + + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _ONE_HOT_PARAM_NUM ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + } +final: + if (rs_tensors[0]) + { + vsi_nn_ReleaseTensor( &rs_tensors[0] ); + } + + if (rs_tensors[1]) + { + vsi_nn_ReleaseTensor( &rs_tensors[1] ); + } + + for (i = SCALAR_INPUT_SUFFIX_SIZE; i < _ONE_HOT_PARAM_NUM; i++) + { + if (node_params[i]) + { + vsi_nn_kernel_scalar_release( &node_params[i] ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( one_hot, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb_evis.c index 09f55a6..a7a6cb1 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb_evis.c @@ -202,8 +202,28 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer) }, GPU_DP_TYPE_16 }; gpu_dp_inst_t uniExtractRtoF32_part1_4x4 = {{ 0x01010101, // TCfg - 0x01010100, // ASelt - 0x0000000c, 0x00060003, // ABin + 0x01010000, // ASelt + 0x000f000c, 0x00050002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractRtoF32_part2_4x4 = {{ + 0x01010101, // TCfg + 0x01000000, // ASelt + 0x000b0008, 0x0001000e, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractRtoF32_part3_4x4 = {{ + 0x01010101, // TCfg + 0x01010101, // ASelt + 0x00070004, 0x000d000a, // ABin 0x02020202, // BSelt 0x00000000, 0x00000000, // BBin 0x00000600, // AccumType, ConstantType, and PostShift @@ -223,7 +243,27 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer) gpu_dp_inst_t uniExtractGtoF32_part1_4x4 = {{ 0x01010101, // TCfg 0x01010100, // ASelt - 0x0001000d, 0x00070004, // ABin + 0x0000000d, 0x00060003, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractGtoF32_part2_4x4 = {{ + 0x01010101, // TCfg + 0x01000000, // ASelt + 0x000c0009, 0x0002000f, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractGtoF32_part3_4x4 = {{ + 0x01010101, // TCfg + 0x01010101, // ASelt + 0x00080005, 0x000e000b, // ABin 0x02020202, // BSelt 0x00000000, 0x00000000, // BBin 0x00000600, // AccumType, ConstantType, and PostShift @@ -243,7 +283,27 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer) gpu_dp_inst_t uniExtractBtoF32_part1_4x4 = {{ 0x01010101, // TCfg 0x01010100, // ASelt - 0x0002000e, 0x00080005, // ABin + 0x0001000e, 0x00070004, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t 
uniExtractBtoF32_part2_4x4 = {{ + 0x01010101, // TCfg + 0x01010000, // ASelt + 0x000d000a, 0x00030000, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractBtoF32_part3_4x4 = {{ + 0x01010101, // TCfg + 0x01010101, // ASelt + 0x00090006, 0x000f000c, // ABin 0x02020202, // BSelt 0x00000000, 0x00000000, // BBin 0x00000600, // AccumType, ConstantType, and PostShift @@ -358,7 +418,14 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer) case _PACK_SELECT_KEY( 1, 0, 0): // copy case _PACK_SELECT_KEY( 1, 2, 0): // copy reorder { - shaderParam.global_scale[0] = 8; + if (attr[0]->dtype == I8 || attr[0]->dtype == U8) + { + shaderParam.global_scale[0] = 16; + } + else + { + shaderParam.global_scale[0] = 8; + } shaderParam.global_scale[1] = 1; shaderParam.global_scale[2] = 1; shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1) @@ -366,7 +433,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer) shaderParam.global_size[1] = height; shaderParam.global_size[2] = 1; - if(attr[0]->dtype == F16) + if (attr[0]->dtype == F16) { status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractHalf8_2x8); } @@ -376,10 +443,16 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer) } status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractRtoF32_part0_4x4", &uniExtractRtoF32_part0_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractRtoF32_part1_4x4", &uniExtractRtoF32_part1_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractRtoF32_part2_4x4", &uniExtractRtoF32_part2_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractRtoF32_part3_4x4", &uniExtractRtoF32_part3_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractGtoF32_part0_4x4", &uniExtractGtoF32_part0_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractGtoF32_part1_4x4", &uniExtractGtoF32_part1_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractGtoF32_part2_4x4", &uniExtractGtoF32_part2_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractGtoF32_part3_4x4", &uniExtractGtoF32_part3_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractBtoF32_part0_4x4", &uniExtractBtoF32_part0_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractBtoF32_part1_4x4", &uniExtractBtoF32_part1_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractBtoF32_part2_4x4", &uniExtractBtoF32_part2_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractBtoF32_part3_4x4", &uniExtractBtoF32_part3_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "r_order", &reorder); status |= vsi_nn_kernel_gpu_add_param(node, "b_order", &order1); CHECK_STATUS_FAIL_GOTO(status, OnError); diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv420_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv420_evis.c index 2d32371..7ab900b 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv420_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv420_evis.c @@ -43,6 +43,7 @@ __BEGIN_DECLS #define VX_KERNEL_NAME_PRE_PROCESS_YUV420_SCALE_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_scale_U8toU8") #define VX_KERNEL_NAME_PRE_PROCESS_YUV420_SCALE_U8TOI8 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_scale_U8toI8") #define VX_KERNEL_NAME_PRE_PROCESS_YUV420_COPY_U8TOU8 
CVIVANTE_NAMESPACE("evis.pre_process_yuv420_copy_U8toU8") +#define VX_KERNEL_NAME_PRE_PROCESS_YUV420_COPY_U8TOF16 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_copy_U8toF16") #define KERNEL_SOURCE_1 "pre_process_yuv420_scale_u8", #define KERNEL_SOURCE_2 "pre_process_yuv420_copy_u8", @@ -77,6 +78,7 @@ static const struct { TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_1) TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, I8, SCALE, KERNEL_SOURCE_5) TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, U8, COPY, KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, F16, COPY, KERNEL_SOURCE_2) }; static vx_param_description_t vxPreProcessYuv420Kernel_param_def[] = @@ -155,10 +157,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_copy_initializer) } shaderParam.global_scale[0] = 16; - if (attr[0]->dtype == I16 || attr[0]->dtype == F16) - { - shaderParam.global_scale[0] = 8; - } shaderParam.global_scale[1] = 1; shaderParam.global_scale[2] = 1; shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1) @@ -418,6 +416,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_copy_initializer) switch( attr[0]->dtype ) { case U8: + case F16: { // R status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpR1st_4x4", &uniCalculateTmpR1st_4x4); @@ -866,7 +865,7 @@ static vsi_status _query_kernel input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - if (enable_copy && output_dtype == U8) + if (enable_copy && (output_dtype == U8 || output_dtype == F16)) { convert_type = COPY; } @@ -890,7 +889,7 @@ static vsi_status _query_kernel kernel->info.parameters = vxPreProcessYuv420Kernel_param_def; kernel->info.numParams = _cnt_of_array( vxPreProcessYuv420Kernel_param_def ); - if (enable_copy && output_dtype == U8) + if (enable_copy && (output_dtype == U8 || output_dtype == F16)) { kernel->info.initialize = _pre_process_yuv420_copy_initializer; } diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv444_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv444_evis.c index 7d51d43..262aa5d 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv444_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv444_evis.c @@ -43,6 +43,7 @@ __BEGIN_DECLS #define VX_KERNEL_NAME_PRE_PROCESS_YUV444_SCALE_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv444_scale_U8toU8") #define VX_KERNEL_NAME_PRE_PROCESS_YUV444_SCALE_U8TOI8 CVIVANTE_NAMESPACE("evis.pre_process_yuv444_scale_U8toI8") #define VX_KERNEL_NAME_PRE_PROCESS_YUV444_COPY_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv444_copy_U8toU8") +#define VX_KERNEL_NAME_PRE_PROCESS_YUV444_COPY_U8TOF16 CVIVANTE_NAMESPACE("evis.pre_process_yuv444_copy_U8toF16") #define KERNEL_SOURCE_1 "pre_process_yuv444_scale", #define KERNEL_SOURCE_3 "pre_process_yuv444_scale_fp16", @@ -75,6 +76,7 @@ static const struct { TENSOR_PRE_PROCESS_YUV444_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_1) TENSOR_PRE_PROCESS_YUV444_KERNELS(U8, I8, SCALE, KERNEL_SOURCE_1) TENSOR_PRE_PROCESS_YUV444_KERNELS(U8, U8, COPY, KERNEL_SOURCE_4) + TENSOR_PRE_PROCESS_YUV444_KERNELS(U8, F16, COPY, KERNEL_SOURCE_4) }; static vx_param_description_t vxPreProcessYuv444Kernel_param_def[] = @@ -145,10 +147,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_copy_initializer) } shaderParam.global_scale[0] = 16; - if (attr[0]->dtype == I16 || attr[0]->dtype == F16) - { - shaderParam.global_scale[0] = 8; - } shaderParam.global_scale[1] = 1; shaderParam.global_scale[2] = 1; shaderParam.global_size[0] = 
gpu_align_p2((width + shaderParam.global_scale[0] - 1) @@ -400,6 +398,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_copy_initializer) switch( attr[0]->dtype ) { case U8: + case F16: { // R status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpR1st_4x4", &uniCalculateTmpR1st_4x4); @@ -841,7 +840,7 @@ static vsi_status _query_kernel input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - if (enable_copy && output_dtype == U8) + if (enable_copy && (output_dtype == U8 || output_dtype == F16)) { convert_type = COPY; } @@ -865,7 +864,7 @@ static vsi_status _query_kernel kernel->info.parameters = vxPreProcessYuv444Kernel_param_def; kernel->info.numParams = _cnt_of_array( vxPreProcessYuv444Kernel_param_def ); - if (enable_copy && output_dtype == U8) + if (enable_copy && (output_dtype == U8 || output_dtype == F16)) { kernel->info.initialize = _pre_process_yuv444_copy_initializer; } diff --git a/src/tim/vx/internal/src/kernel/evis/repeat_evis.c b/src/tim/vx/internal/src/kernel/evis/repeat_evis.c new file mode 100644 index 0000000..421caca --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/repeat_evis.c @@ -0,0 +1,609 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ + +#define KERNEL_SOURCE_1 "repeat" +#define KERNEL_SOURCE_2 "repeat_axis1" + +#define HASH_PREPROCESS_STARTID_SH_KERNEL_NAME \ + CVIVANTE_NAMESPACE("evis.preprocess_start_idx") + +#define HASH_REPEAT_SH_KERNEL_1D_NAME(SRC0_TYPE) \ + CVIVANTE_NAMESPACE("evis.repeat_"#SRC0_TYPE"_1D") + +#define HASH_REPEAT_SH_KERNEL_NAME(SRC0_TYPE, AXIS) \ + CVIVANTE_NAMESPACE("evis.repeat_"#SRC0_TYPE"_axis"#AXIS) + +// Add kernel hashtable here +#define HASH_PREPROCESS_KEY(_input0_type, _output_type) \ + ((_input0_type << 24) | (_output_type << 16)) + +#define HASH_REPEAT_KEY(_input0_type, _output_type, _is1d, _axis) \ + ((_input0_type << 24) | (_output_type << 16) | (_is1d << 8) | _axis) + +#define TENSOR_PREPROCESS_STARTID_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_PREPROCESS_KEY(IN0_TYPE, OUT_TYPE), \ + HASH_PREPROCESS_STARTID_SH_KERNEL_NAME, \ + SOURCE }, + +#define TENSOR_REPEAT_KERNELS(IN0_TYPE, OUT_TYPE, AXIS, SOURCE) \ + { HASH_REPEAT_KEY(IN0_TYPE, OUT_TYPE, 0, AXIS), \ + HASH_REPEAT_SH_KERNEL_NAME(IN0_TYPE, AXIS), \ + SOURCE }, + +#define TENSOR_REPEAT_1D_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_REPEAT_KEY(IN0_TYPE, OUT_TYPE, 1, 0), \ + HASH_REPEAT_SH_KERNEL_1D_NAME(IN0_TYPE), \ + SOURCE }, + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _preprocess_kernel_map[] = +{ + // Register kernel here + TENSOR_PREPROCESS_STARTID_KERNELS( I32, I32, KERNEL_SOURCE_1 ) +}; + +static const _kernel_map_type _repeat_kernel_map[] = +{ + // Register kernel here + TENSOR_REPEAT_KERNELS( U8, U8, 0, KERNEL_SOURCE_1 ) + TENSOR_REPEAT_KERNELS( U8, U8, 1, KERNEL_SOURCE_2 ) + TENSOR_REPEAT_KERNELS( U8, U8, 2, KERNEL_SOURCE_1 ) + TENSOR_REPEAT_KERNELS( I16, I16, 0, KERNEL_SOURCE_1 ) + TENSOR_REPEAT_KERNELS( I16, I16, 1, KERNEL_SOURCE_2 ) + TENSOR_REPEAT_KERNELS( I16, I16, 2, KERNEL_SOURCE_1 ) + + TENSOR_REPEAT_1D_KERNELS( U8, U8, KERNEL_SOURCE_1 ) + TENSOR_REPEAT_1D_KERNELS( I16, I16, KERNEL_SOURCE_1 ) +}; + +/* + * Kernel params + */ +static vx_param_description_t _preprocess_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _REPEAT_PREPROCESS_PARAM_NUM _cnt_of_array( _preprocess_kernel_param_def ) + +static vx_param_description_t _repeat_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _REPEAT_PARAM_NUM _cnt_of_array( _repeat_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_preprocess_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + vsi_nn_kernel_tensor_attr_t* attr[1] = {NULL}; + int32_t width = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( 
attr[0], "Create tensor attr buffer fail.", OnError ); + + width = attr[0]->shape->data[0]; + + shaderParam.global_scale[0] = 16; + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + shaderParam.local_size[0] = 32; + shaderParam.local_size[1] = 1; + shaderParam.local_size[2] = 1; + shaderParam.global_size[0] = 32; + shaderParam.global_size[1] = 1; + shaderParam.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + { + gpu_dp_inst_t uniIntegralHorAcc_4x4 = {{ + 0xff3f0f03, // TCfg + 0x00000000, // ASelt + 0x00100000, 0x32100210, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + status = vsi_nn_kernel_gpu_add_param(node, "uniIntegralHorAcc_4x4", &uniIntegralHorAcc_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "width", &width); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + + return status; +} + +DEF_KERNEL_INITIALIZER(_repeat_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + vsi_nn_kernel_tensor_attr_t* attr[1] = {NULL}; + vsi_int_array_t * input_shape = NULL; + int32_t height = 0, width = 0, chn = 0; + int32_t is1d = 0; + int32_t axis = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &axis); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + input_shape = attr[0]->shape; + width = input_shape->data[0]; + height = input_shape->data[1]; + if (height == 1 && input_shape->size == 2) + { + is1d = 1; + } + chn = input_shape->size > 2 ? 
input_shape->data[2] : 1; + + if ((axis == 0 && is1d == 0) || axis == 2) + { + shaderParam.global_scale[0] = 16; + if (attr[0]->dtype == I16 || attr[0]->dtype == F16) + { + shaderParam.global_scale[0] = 8; + } + + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + } + else if (is1d) + { + shaderParam.global_scale[0] = 1; + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + } + else if (axis == 1) + { + shaderParam.global_scale[0] = 1; + shaderParam.global_scale[1] = 8; + shaderParam.global_scale[2] = 1; + } + shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0], 4); + shaderParam.global_size[1] = (height + shaderParam.global_scale[1] - 1) + / shaderParam.global_scale[1]; + shaderParam.global_size[2] = chn; + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + { + gpu_dp_inst_t uniExtract1to8Short_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x00000000, 0x00000000, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract1to8Short_2x8", &uniExtract1to8Short_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + + return status; +} + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel_preprocess, + vsi_nn_kernel_t* kernel, + int32_t axis + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e input1_dtype = I32; + vsi_nn_kernel_dtype_e output_dtype = U8; + uint32_t key = 0; + int32_t is1d = inputs[0]->attr.dim_num == 1 ? 
1 : 0; + int i = 0; + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (input0_dtype == F16) + { + input0_dtype = I16; + } + if (output_dtype == F16) + { + output_dtype = I16; + } + + if (input0_dtype == I8) + { + input0_dtype = U8; + } + if (output_dtype == I8) + { + output_dtype = U8; + } + + key = HASH_PREPROCESS_KEY( input1_dtype, I32 ); + + for( i = 0; i < _cnt_of_array(_preprocess_kernel_map); i ++ ) + { + if ( _preprocess_kernel_map[i].key == key ) + { + break; + } + } + if ( i < _cnt_of_array(_preprocess_kernel_map) ) + { + snprintf( kernel_preprocess->info.name, VX_MAX_KERNEL_NAME, "%s", _preprocess_kernel_map[i].function_name ); + kernel_preprocess->info.parameters = _preprocess_kernel_param_def; + kernel_preprocess->info.numParams = _REPEAT_PREPROCESS_PARAM_NUM; + kernel_preprocess->info.initialize = _preprocess_initializer; + + vsi_nn_kernel_add_source( kernel_preprocess, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + _preprocess_kernel_map[i].source_name ); + vsi_nn_kernel_add_source( kernel_preprocess, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + _preprocess_kernel_map[i].source_name ); + } + + + key = HASH_REPEAT_KEY( input0_dtype, output_dtype, is1d, axis ); + + for( i = 0; i < _cnt_of_array(_repeat_kernel_map); i ++ ) + { + if ( _repeat_kernel_map[i].key == key ) + { + break; + } + } + if ( i < _cnt_of_array(_repeat_kernel_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _repeat_kernel_map[i].function_name ); + kernel->info.parameters = _repeat_kernel_param_def; + kernel->info.numParams = _REPEAT_PARAM_NUM; + kernel->info.initialize = _repeat_initializer; + + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + _repeat_kernel_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + _repeat_kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static int32_t _optimize_repeat_shape + ( + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + int32_t* axis, + int32_t* opt_shape_in, + int32_t* opt_shape_out, + int32_t* new_rank + ) +{ + vsi_status status = VSI_SUCCESS; + + if (inputs[0]->attr.dim_num == 1) + { + opt_shape_in[0] = inputs[0]->attr.size[0]; + opt_shape_in[1] = 1; + opt_shape_out[0] = outputs[0]->attr.size[0]; + opt_shape_out[1] = 1; + new_rank[0] = 2; + new_rank[1] = 2; + } + else if (axis[0] == 3) + { + vsi_nn_kernel_optimize_element_shape( (int32_t*)inputs[0]->attr.size, 3, opt_shape_in, new_rank ); + if (opt_shape_in[1] == 1) + { + opt_shape_in[1] = inputs[0]->attr.size[3]; + opt_shape_out[0] = opt_shape_in[0]; + opt_shape_out[1] = outputs[0]->attr.size[3]; + axis[0] = 0; + new_rank[0] = 2; + new_rank[1] = 2; + } + else if (new_rank[0] == 2) + { + opt_shape_in[2] = inputs[0]->attr.size[3]; + opt_shape_out[0] = opt_shape_in[0]; + opt_shape_out[1] = opt_shape_in[1]; + opt_shape_out[2] = outputs[0]->attr.size[3]; + axis[0] = 2; + new_rank[0] = 3; + new_rank[1] = 3; + } + else + { + status = VSI_FAILURE; + } + } + + return status; +} + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + 
vsi_nn_kernel_node_param_t preprocess_node_params[_REPEAT_PREPROCESS_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t node_params[_REPEAT_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_t tmp_node = NULL; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_tensor_attr_t attr; + vsi_nn_kernel_t * kernel_preprocess = NULL; + vsi_nn_tensor_t * tensor_preprocess = NULL; + vsi_nn_kernel_tensor_t rs_input = NULL, rs_input1 = NULL, rs_output = NULL; + int32_t new_shape[2][VSI_NN_MAX_DIM_NUM] = {{ 1, 1, 1, 1 }, { 1, 1, 1, 1 }}; + int32_t new_rank[2] = {0, 0}; + int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); + + // Check if gpu can support the size + if ( !vsi_nn_kernel_gpu_check_shape( + (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + if (axis > 2 || outputs[0]->attr.dim_num == 1) + { + status = _optimize_repeat_shape(inputs, outputs, &axis, new_shape[0], new_shape[1], new_rank); + if ( VSI_SUCCESS != status ) + { + goto final; + } + rs_input = vsi_nn_kernel_tensor_reshape(inputs[0]->t, new_shape[0], new_rank[0]); + rs_output = vsi_nn_kernel_tensor_reshape(outputs[0]->t, new_shape[1], new_rank[1]); + } + + if (inputs[1]->attr.dim_num == 1) + { + new_shape[0][0] = inputs[1]->attr.size[0]; + new_shape[0][1] = 1; + rs_input1 = vsi_nn_kernel_tensor_reshape(inputs[1]->t, new_shape[0], 2); + } + + kernel_preprocess = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS ); + // Assign unique_id + kernel_preprocess->unique_id = kernel->unique_id; + + status = _query_kernel( inputs, outputs, kernel_preprocess, kernel, axis ); + if ( VSI_SUCCESS != status ) + { + goto final; + } + + memset( &attr, 0, sizeof(vsi_nn_tensor_attr_t) ); + attr.dtype.vx_type = VSI_NN_TYPE_INT32; + attr.is_const = FALSE; + attr.vtl = TRUE; + attr.size[0] = inputs[1]->attr.size[0]; + attr.size[1] = 1; + attr.dim_num = 2; + tensor_preprocess = vsi_nn_CreateTensor( graph, &attr ); + + // preprocess + tmp_node = vsi_nn_kernel_create_node( graph, kernel_preprocess ); + if (tmp_node) + { + uint32_t index = 0; + if (rs_input1) + { + preprocess_node_params[index++] = rs_input1; + } + else + { + preprocess_node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[1]->t; + } + preprocess_node_params[index++] = (vsi_nn_kernel_node_param_t)tensor_preprocess->t; + + status = vsi_nn_kernel_node_pass_param( tmp_node, preprocess_node_params, + _REPEAT_PREPROCESS_PARAM_NUM ); + CHECK_STATUS(status); + { + // Set default border mode. 
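+            // Out-of-bounds reads use a constant border; for asymmetric U8
+            // input the constant is the tensor's zero point (a dequantized 0),
+            // while other integer types fall back to a constant of 0.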
+ vx_border_t border; + border.mode = VX_BORDER_CONSTANT; + border.constant_value.U8 = 0; + border.constant_value.U16 = 0; + border.constant_value.S32 = 0; + if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) + { + border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; + } + status = vxSetNodeAttribute( (vx_node)tmp_node, VX_NODE_BORDER, &border, sizeof(border) ); + CHECK_STATUS(status); + } + } + + // repeat + node = vsi_nn_kernel_create_node( graph, kernel ); + if (node) + { + uint32_t index = 0; + if (rs_input) + { + node_params[index++] = rs_input; + } + else + { + node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[0]->t; + } + if (rs_input1) + { + node_params[index++] = rs_input1; + } + else + { + node_params[index++] = (vsi_nn_kernel_node_param_t)inputs[1]->t; + } + node_params[index++] = (vsi_nn_kernel_node_param_t)tensor_preprocess->t; + if (rs_output) + { + node_params[index++] = rs_output; + } + else + { + node_params[index++] = (vsi_nn_kernel_node_param_t)outputs[0]->t; + } + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axis ); + + status = vsi_nn_kernel_node_pass_param( node, node_params, + _REPEAT_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &node_params[4] ); + { + // Set default border mode. + vx_border_t border; + border.mode = VX_BORDER_REPLICATE; + status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); + CHECK_STATUS(status); + } + } + + /* Pass parameters to node. */ +final: + if (rs_input) + { + vsi_nn_kernel_tensor_release( &rs_input ); + } + if (rs_input1) + { + vsi_nn_kernel_tensor_release( &rs_input1 ); + } + if (rs_output) + { + vsi_nn_kernel_tensor_release( &rs_output ); + } + if ( kernel_preprocess ) + { + vsi_nn_kernel_release( &kernel_preprocess ); + } + if ( tensor_preprocess ) + { + vsi_nn_ReleaseTensor( &tensor_preprocess ); + } + if (tmp_node) {vsi_nn_kernel_node_release( &tmp_node );} + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( repeat, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c index af3e06f..194fb3b 100644 --- a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c @@ -49,11 +49,13 @@ typedef enum UP, UP_OPT, UP_2X_HALF, + UP_3X_HALF, + UP_4X_HALF, } _internal_scale_e; #define _RESIZE_BILINEAR_KERNEL_SOURCE(_input_type) "resize_bilinear_"#_input_type #define _RESIZE_BILINEAR_KERNEL_SOURCE_OPT(_input_type) "resize_bilinear_"#_input_type"_opt" -#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_2X(_input_type) "resize_bilinear_"#_input_type"_UP_2X" +#define _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC(_input_type) "resize_bilinear_"#_input_type"_half_pixel_centers" #define STR(a) #a // Add kernel hashtable here @@ -77,8 +79,21 @@ typedef enum #define PACK_KERNEL_MAP_UP_2X_HALF( IN_DTYPE, OUT_DTYPE ) \ { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_2X_HALF ), \ - CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_UP_2X_half"), \ - _RESIZE_BILINEAR_KERNEL_SOURCE_UP_2X(IN_DTYPE) } + CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \ + "_SAME_2x_upsample_half_pixel_centers"), \ + _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC(IN_DTYPE) } + +#define PACK_KERNEL_MAP_UP_4X_HALF( IN_DTYPE, OUT_DTYPE ) \ + { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_4X_HALF ), \ + 
CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \ + "_SAME_4x_upsample_half_pixel_centers"), \ + _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC(IN_DTYPE) } + +#define PACK_KERNEL_MAP_UP_3X_HALF( IN_DTYPE, OUT_DTYPE ) \ + { RESIZE_BILINEAR_HASH_KEY( IN_DTYPE, OUT_DTYPE, UP_3X_HALF ), \ + CVIVANTE_NAMESPACE("evis.resize_bilinear_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \ + "_SAME_3x_upsample_half_pixel_centers"), \ + _RESIZE_BILINEAR_KERNEL_SOURCE_UP_HPC(IN_DTYPE) } typedef struct { @@ -103,6 +118,8 @@ static const _kernel_map_type _resize_bilinear_kernel_map[] = PACK_KERNEL_MAP_UP(BF16, BF16), PACK_KERNEL_MAP_UP_OPT(U8, U8), PACK_KERNEL_MAP_UP_2X_HALF(U8, U8), + PACK_KERNEL_MAP_UP_3X_HALF(U8, U8), + PACK_KERNEL_MAP_UP_4X_HALF(U8, U8), }; @@ -203,8 +220,10 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) uint32_t out_height; float half_pixel_value = 0.0f; vsi_bool is_use_scale_kernel = (vsi_bool)(_RESIZE_BILINEAR_PARAM_NUM == param_size); - vsi_bool is_use_2x_up_half_kernel = FALSE; - + vsi_bool is_half_pixel_centers = FALSE; + vsi_bool is_2x_up_kernel = FALSE; + vsi_bool is_3x_up_kernel = FALSE; + vsi_bool is_4x_up_kernel = FALSE; input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); @@ -254,11 +273,13 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) half_pixel_value = 0.0f; } - if ((U8 == input_dtype) && (_is_same_quant(input_attr, output_attr))) + is_half_pixel_centers = (!align_corners) && (half_pixel_centers); + + if ((U8 == input_dtype) && (_is_same_quant(input_attr, output_attr)) && is_half_pixel_centers) { - is_use_2x_up_half_kernel = (!align_corners) && (half_pixel_centers); - is_use_2x_up_half_kernel = is_use_2x_up_half_kernel && \ - (2 * in_width == out_width) && (2 * in_height == out_height); + is_2x_up_kernel = (2 * in_width == out_width) && (2 * in_height == out_height); + is_3x_up_kernel = (3 * in_width == out_width) && (3 * in_height == out_height); + is_4x_up_kernel = (4 * in_width == out_width) && (4 * in_height == out_height); } if (U8 == input_dtype && VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant ) @@ -309,11 +330,17 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) outputZP = 0; } - if (is_use_2x_up_half_kernel) + if (is_2x_up_kernel || is_4x_up_kernel) { - gpu_param.global_scale[0] = 8; + gpu_param.global_scale[0] = 16; gpu_param.global_scale[1] = 1; } + else if (is_3x_up_kernel) + { + gpu_param.global_scale[0] = 15; + gpu_param.global_scale[1] = 6; + gpu_param.global_scale[2] = 1; + } else { gpu_param.global_scale[0] = 4; @@ -321,28 +348,134 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) gpu_param.global_scale[2] = 1; } - if (is_use_2x_up_half_kernel) + if (is_2x_up_kernel) { - gpu_dp_inst_t uniResize2xUp_4x8 = {{ + gpu_dp_inst_t uniResize2xUp_0_4x8 = {{ 0x55555555, 0x55555555, // TCfg 0x4418c020, 0x29444194, 0xc8629c86, 0x83a4c839, 0xad0a4a4c, // BinSelect - 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000704, // AccumType, ConstantType, and PostShift 0x09030301, 0x03090103, 0x09030301, 0x03090103, 0x09030301, 0x03090103, 0x09030301, 0x03090103 // Constant }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniResize2xUpRound_2x8 = {{ - 0x55555555, // TCfg - 0x44444444, // ASelt - 0x03020100, 0x07060504, // ABin - 0xaaaaaaaa, // BSelt - 0x00000000, 0x00000000, // BBin + gpu_dp_inst_t uniResize2xUp_1_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x4c5ad0a4, 0x6b54c5b5, 0xd8e6bd8e, 0x07c5d07b, 0xce128c5d, // BinSelect 
0x00000704, // AccumType, ConstantType, and PostShift - 0x00010001, 0x00010001, 0x00010001, 0x00010001, - 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + 0x09030301, 0x03090103, 0x09030301, 0x03090103, + 0x09030301, 0x03090103, 0x09030301, 0x03090103 // Constant }, GPU_DP_TYPE_16}; - status = vsi_nn_kernel_gpu_add_param( node, "uniResize2xUp_4x8", &uniResize2xUp_4x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniResize2xUpRound_2x8", &uniResize2xUpRound_2x8); + status = vsi_nn_kernel_gpu_add_param( node, "uniResize2xUp_0_4x8", &uniResize2xUp_0_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize2xUp_1_4x8", &uniResize2xUp_1_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if (is_3x_up_kernel) + { + gpu_dp_inst_t uniResize3xUp_l00_2x8 = {{ + 0x15515515, // TCfg + 0x00000000, // ASelt + 0x21210110, 0x03323202, // ABin + 0x2aa2aa2a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000610, // AccumType, ConstantType, and PostShift + 0xaaaa5555, 0x0000ffff, 0x5555aaaa, 0xaaaa5555, + 0x0000ffff, 0x5555aaaa, 0xaaaa5555, 0x0000ffff // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize3xUp_l01_2x8 = {{ + 0x05155155, // TCfg + 0x00000000, // ASelt + 0x54044343, 0x00650554, // ABin + 0x0a2aa2aa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000610, // AccumType, ConstantType, and PostShift + 0x5555aaaa, 0xaaaa5555, 0x0000ffff, 0x5555aaaa, + 0xaaaa5555, 0x0000ffff, 0x5555aaaa, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize3xUp_l10_4x4 = {{ + 0x55551155, // TCfg + 0x50501050, // ASelt + 0x01011010, 0x21212121, // ABin + 0xaaaa22aa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x0000060f, // AccumType, ConstantType, and PostShift + 0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab, + 0x1c7238e4, 0x0e391c72, 0x38e41c72, 0x1c720e39 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize3xUp_l11_4x4 = {{ + 0x11555511, // TCfg + 0x10505010, // ASelt + 0x32320202, 0x03033232, // ABin + 0x22aaaa22, // BSelt + 0x00000000, 0x00000000, // BBin + 0x0000060f, // AccumType, ConstantType, and PostShift + 0x00005556, 0x00002aab, 0x1c7238e4, 0x0e391c72, + 0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize3xUp_l12_4x4 = {{ + 0x55115555, // TCfg + 0x50105050, // ASelt + 0x43434343, 0x54540404, // ABin + 0xaa22aaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x0000060f, // AccumType, ConstantType, and PostShift + 0x1c7238e4, 0x0e391c72, 0x38e41c72, 0x1c720e39, + 0x00005556, 0x00002aab, 0x1c7238e4, 0x0e391c72 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize3xUp_l13_4x4 = {{ + 0x00551155, // TCfg + 0x00501050, // ASelt + 0x05055454, 0x00006565, // ABin + 0x00aa22aa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x0000060f, // AccumType, ConstantType, and PostShift + 0x38e41c72, 0x1c720e39, 0x00005556, 0x00002aab, + 0x1c7238e4, 0x0e391c72, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l00_2x8", &uniResize3xUp_l00_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l01_2x8", &uniResize3xUp_l01_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l10_4x4", &uniResize3xUp_l10_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l11_4x4", &uniResize3xUp_l11_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l12_4x4", &uniResize3xUp_l12_4x4); + status |= 
vsi_nn_kernel_gpu_add_param( node, "uniResize3xUp_l13_4x4", &uniResize3xUp_l13_4x4); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if (is_4x_up_kernel) + { + gpu_dp_inst_t uniResize4xUp_l00_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x0208c020, 0x1944418c, 0x44419444, 0x62944419, 0x9c8629c8, // BinSelect + 0x00000406, // AccumType, ConstantType, and PostShift + 0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f, + 0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize4xUp_l01_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x8629c862, 0x3a4c839c, 0x4c83a4c8, 0xa4a4c83a, 0xad0a4ad0, // BinSelect + 0x00000406, // AccumType, ConstantType, and PostShift + 0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f, + 0x190f0f09, 0x23051503, 0x05230315, 0x0f19090f // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize4xUp_l10_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x0208c020, 0x1944418c, 0x44419444, 0x62944419, 0x9c8629c8, // BinSelect + 0x00000406, // AccumType, ConstantType, and PostShift + 0x23150503, 0x31070701, 0x07310107, 0x15230305, + 0x23150503, 0x31070701, 0x07310107, 0x15230305 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize4xUp_l11_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x8629c862, 0x3a4c839c, 0x4c83a4c8, 0xa4a4c83a, 0xad0a4ad0, // BinSelect + 0x00000406, // AccumType, ConstantType, and PostShift + 0x23150503, 0x31070701, 0x07310107, 0x15230305, + 0x23150503, 0x31070701, 0x07310107, 0x15230305 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l00_4x8", &uniResize4xUp_l00_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l01_4x8", &uniResize4xUp_l01_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l10_4x8", &uniResize4xUp_l10_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize4xUp_l11_4x8", &uniResize4xUp_l11_4x8); status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height); CHECK_STATUS_FAIL_GOTO(status, final ); } @@ -832,13 +965,13 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) goto final; } - if (!is_use_2x_up_half_kernel) + if (!is_2x_up_kernel && !is_3x_up_kernel && !is_4x_up_kernel) { status = vsi_nn_kernel_gpu_add_param( node, "half_pixel_value", &half_pixel_value); CHECK_STATUS_FAIL_GOTO(status, final ); } - if (is_use_2x_up_half_kernel) + if (is_2x_up_kernel || is_4x_up_kernel) { gpu_param.global_size[0] = gpu_align_p2((out_width + \ gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4); @@ -860,8 +993,6 @@ final: return status; } /* _resize_bilinear_initializer() */ - - /* * Query kernel */ @@ -872,7 +1003,8 @@ static vsi_status _query_kernel vsi_nn_tensor_t * const * const outputs, vsi_bool is_same_type, vsi_bool is_evis2, - vsi_bool is_2x_up_half, + int32_t align_corners, + int32_t half_pixel_centers, vsi_bool *is_run_opt_kernel ) { @@ -886,17 +1018,35 @@ static vsi_status _query_kernel vx_kernel_initialize_f initializer = _resize_bilinear_initializer; uint32_t key; uint32_t i; + vsi_bool is_2x_upsample =(2 * inputs[0]->attr.size[0] == outputs[0]->attr.size[0]) \ + && (2 * inputs[0]->attr.size[1] == outputs[0]->attr.size[1]); + vsi_bool is_3x_upsample =(3 * inputs[0]->attr.size[0] == outputs[0]->attr.size[0]) \ + && (3 * inputs[0]->attr.size[1] == outputs[0]->attr.size[1]); + vsi_bool is_4x_upsample =(4 * inputs[0]->attr.size[0] == outputs[0]->attr.size[0]) \ + && (4 * inputs[0]->attr.size[1] == outputs[0]->attr.size[1]); _internal_scale_e scale_flag = UP; in_dtype = 
vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + is_2x_upsample &= (in_dtype == U8); + is_3x_upsample &= (in_dtype == U8); + is_4x_upsample &= (in_dtype == U8); + if (outputs[0]->attr.size[0] > inputs[0]->attr.size[0]) { - if (is_2x_up_half) + if (is_same_type && (!align_corners) && (half_pixel_centers) && is_2x_upsample) { scale_flag = UP_2X_HALF; } + else if (is_same_type && (!align_corners) && (half_pixel_centers) && is_3x_upsample) + { + scale_flag = UP_3X_HALF; + } + else if (is_same_type && (!align_corners) && (half_pixel_centers) && is_4x_upsample) + { + scale_flag = UP_4X_HALF; + } else if (is_same_type && is_evis2) { scale_flag = UP_OPT; @@ -920,19 +1070,6 @@ static vsi_status _query_kernel } } - if ((UP_2X_HALF == scale_flag) && (i >= kernel_map_size) && is_same_type && is_evis2) - { - scale_flag = UP_OPT; - key = RESIZE_BILINEAR_HASH_KEY( in_dtype, out_dtype, scale_flag ); - for( i = 0; i < (uint32_t)kernel_map_size; i ++ ) - { - if( kernel_map[i].key == key ) - { - break; - } - } - } - if ((UP_OPT == scale_flag) && (i >= kernel_map_size)) { scale_flag = UP; @@ -1109,9 +1246,6 @@ OnError: return scale; } - - - static vsi_nn_kernel_node_t _setup ( vsi_nn_graph_t * graph, @@ -1131,14 +1265,10 @@ static vsi_nn_kernel_node_t _setup vsi_bool is_same_type = vsi_nn_is_same_type(inputs[0], outputs[0]); vsi_bool is_evis2 = (vsi_bool)(graph->ctx->config.evis.ver == VSI_NN_HW_EVIS_2); vsi_bool is_run_opt_kernel = FALSE; - vsi_bool is_2x_up_half = FALSE; vsi_nn_tensor_t* scale = NULL; - is_2x_up_half = is_same_type && (!align_corners) && (half_pixel_centers); - is_2x_up_half = is_2x_up_half && (2 * inputs[0]->attr.size[0] == outputs[0]->attr.size[0]) \ - && (2 * inputs[0]->attr.size[1] == outputs[0]->attr.size[1]); status = _query_kernel( kernel, inputs, outputs, is_same_type, is_evis2, - is_2x_up_half, &is_run_opt_kernel); + align_corners, half_pixel_centers, &is_run_opt_kernel); if( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); diff --git a/src/tim/vx/internal/src/kernel/evis/sequence_mask_evis.c b/src/tim/vx/internal/src/kernel/evis/sequence_mask_evis.c new file mode 100644 index 0000000..07c7266 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/sequence_mask_evis.c @@ -0,0 +1,393 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + +__BEGIN_DECLS + +#define KERNEL_NAME_SEQUENCE_MASK_U8TOU8 CVIVANTE_NAMESPACE("evis.sequence_mask_U8toU8") +#define KERNEL_NAME_SEQUENCE_MASK_U8TOU8_2D CVIVANTE_NAMESPACE("evis.sequence_mask_U8toU8_2D") +#define KERNEL_NAME_SEQUENCE_MASK_I8TOI8 CVIVANTE_NAMESPACE("evis.sequence_mask_I8toI8") +#define KERNEL_NAME_SEQUENCE_MASK_I8TOI8_2D CVIVANTE_NAMESPACE("evis.sequence_mask_I8toI8_2D") +#define KERNEL_NAME_SEQUENCE_MASK_I8TOU8 CVIVANTE_NAMESPACE("evis.sequence_mask_I8toU8") +#define KERNEL_NAME_SEQUENCE_MASK_I8TOU8_2D CVIVANTE_NAMESPACE("evis.sequence_mask_I8toU8_2D") +#define KERNEL_NAME_SEQUENCE_MASK_I16TOI16 CVIVANTE_NAMESPACE("evis.sequence_mask_I16toI16") +#define KERNEL_NAME_SEQUENCE_MASK_I16TOI16_2D CVIVANTE_NAMESPACE("evis.sequence_mask_I16toI16_2D") +#define KERNEL_NAME_SEQUENCE_MASK_I16TOU8 CVIVANTE_NAMESPACE("evis.sequence_mask_I16toU8") +#define KERNEL_NAME_SEQUENCE_MASK_I16TOU8_2D CVIVANTE_NAMESPACE("evis.sequence_mask_I16toU8_2D") +#define KERNEL_NAME_SEQUENCE_MASK_F16TOF16 CVIVANTE_NAMESPACE("evis.sequence_mask_F16toF16") +#define KERNEL_NAME_SEQUENCE_MASK_F16TOF16_2D CVIVANTE_NAMESPACE("evis.sequence_mask_F16toF16_2D") +#define KERNEL_NAME_SEQUENCE_MASK_F16TOU8 CVIVANTE_NAMESPACE("evis.sequence_mask_F16toU8") +#define KERNEL_NAME_SEQUENCE_MASK_F16TOU8_2D CVIVANTE_NAMESPACE("evis.sequence_mask_F16toU8_2D") + +#define KERNEL_SOURCE_1 "sequence_mask" + +#define HASH_SEQUENCE_MASK_KEY(_input0_type, _output_type, _is2D) \ + ((_input0_type << 24) | (_output_type << 16) | (_is2D)) + +#define TENSOR_SEQUENCE_MASK_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_SEQUENCE_MASK_KEY(IN0_TYPE, OUT_TYPE, 0), \ + KERNEL_NAME_SEQUENCE_MASK_##IN0_TYPE##TO##OUT_TYPE, \ + SOURCE }, + +#define TENSOR_SEQUENCE_MASK_2D_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ + { HASH_SEQUENCE_MASK_KEY(IN0_TYPE, OUT_TYPE, 1), \ + KERNEL_NAME_SEQUENCE_MASK_##IN0_TYPE##TO##OUT_TYPE##_2D, \ + SOURCE }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } kernel_map[] = +{ + TENSOR_SEQUENCE_MASK_KERNELS(U8, U8, KERNEL_SOURCE_1) + TENSOR_SEQUENCE_MASK_2D_KERNELS(U8, U8, KERNEL_SOURCE_1) + TENSOR_SEQUENCE_MASK_KERNELS(I8, I8, KERNEL_SOURCE_1) + TENSOR_SEQUENCE_MASK_2D_KERNELS(I8, I8, KERNEL_SOURCE_1) + TENSOR_SEQUENCE_MASK_KERNELS(I16, I16, KERNEL_SOURCE_1) + TENSOR_SEQUENCE_MASK_2D_KERNELS(I16, I16, KERNEL_SOURCE_1) + TENSOR_SEQUENCE_MASK_KERNELS(F16, F16, KERNEL_SOURCE_1) + TENSOR_SEQUENCE_MASK_2D_KERNELS(F16, F16, KERNEL_SOURCE_1) + TENSOR_SEQUENCE_MASK_KERNELS(I16, U8, KERNEL_SOURCE_1) + TENSOR_SEQUENCE_MASK_2D_KERNELS(I16, U8, KERNEL_SOURCE_1) + TENSOR_SEQUENCE_MASK_KERNELS(I8, U8, KERNEL_SOURCE_1) + TENSOR_SEQUENCE_MASK_2D_KERNELS(I8, U8, KERNEL_SOURCE_1) + TENSOR_SEQUENCE_MASK_KERNELS(F16, U8, KERNEL_SOURCE_1) + TENSOR_SEQUENCE_MASK_2D_KERNELS(F16, U8, KERNEL_SOURCE_1) +}; + +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; 
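+// Parameter order must match the node setup below: reshaped lengths tensor,
+// reshaped output mask tensor, then the max_len scalar.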
+#define _EVIS_PARAM_NUM _cnt_of_array(kernel_param_def) + +DEF_KERNEL_INITIALIZER(_sequence_mask_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL, NULL }; + vsi_int_array_t * out_shape = NULL; + float scaleIn = 1.0f; + float scaleOut = 1.0f; + float outputVal1 = 1.0f; + int32_t output_zp = 0; + int32_t input_zp = 0; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + out_shape = attr[1]->shape; + + if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + input_zp = attr[0]->asymm.zero_point; + scaleIn = attr[0]->asymm.scale; + } + else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) + { + if (attr[0]->dfp.fl > 0) + { + scaleIn = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); + } + else + { + scaleIn = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); + } + input_zp = 0; + } + + if (attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM) + { + output_zp = attr[1]->asymm.zero_point; + scaleOut = 1.0f / attr[1]->asymm.scale; + } + else if (attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP) + { + if (attr[1]->dfp.fl > 0) + { + scaleOut = (float)((int64_t)1 << attr[1]->dfp.fl); + } + else + { + scaleOut = (1.0f / (float)((int64_t)1 << -attr[1]->dfp.fl)); + } + output_zp = 0; + } + + outputVal1 = scaleOut + (float)output_zp; + + gpu_param.global_scale[0] = 4; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = gpu_align_p2( + (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = out_shape->size > 2 ? 
out_shape->data[2] : 1; + + { + gpu_dp_inst_t uniConvert1stUint8SubZpToFp32_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t UniFP16toFP32Lo4_dp4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvert1stUint8SubZpToFp32_4x4", &uniConvert1stUint8SubZpToFp32_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "UniFP16toFP32Lo4_dp4x4", &UniFP16toFP32Lo4_dp4x4 ); + status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); + status |= vsi_nn_kernel_gpu_add_param(node, "output_ZP", &output_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "outputVal1", &outputVal1); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + + return status; +} /* _sequence_mask_initializer() */ + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel, + int32_t is2Dflg + ) +{ + vsi_nn_kernel_dtype_e input0_dtype = I32; + vsi_nn_kernel_dtype_e output_dtype = U8; + vsi_status status = VSI_FAILURE; + uint32_t key = 0; + int i = 0; + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + if (output_dtype == BOOL8) + { + output_dtype= U8; + } + + key = HASH_SEQUENCE_MASK_KEY( input0_dtype, output_dtype, is2Dflg); + + for ( i = 0; i < _cnt_of_array(kernel_map); i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < _cnt_of_array(kernel_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = kernel_param_def; + kernel->info.numParams = _cnt_of_array( kernel_param_def ); + kernel->info.initialize = _sequence_mask_initializer; + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static int32_t _optimize_mask_shape + ( + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + int32_t 
max_len, + int32_t* opt_shape_in, + int32_t* opt_shape_out, + int32_t* is2Dflg + ) +{ + vsi_status status = VSI_SUCCESS; + int32_t in_shape[VSI_NN_MAX_DIM_NUM] = {0}; + int32_t new_rank = 0; + uint32_t i = 0; + + for(i = 0; i < inputs[0]->attr.dim_num; i++) + { + in_shape[i] = inputs[0]->attr.size[i]; + } + + vsi_nn_kernel_optimize_element_shape( in_shape, inputs[0]->attr.dim_num, opt_shape_in, &new_rank ); + if (new_rank > 2) + { + return VSI_FAILURE; + } + + opt_shape_out[0] = max_len; + for(i = 0; i < (uint32_t)new_rank; i++) + { + opt_shape_out[i + 1] = opt_shape_in[i]; + } + if (opt_shape_out[2] == 1) + { + is2Dflg[0] = 1; + } + + return status; +} + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t tmp_params[_EVIS_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL; + int32_t new_shape[2][VSI_NN_MAX_DIM_NUM] = {{ 1, 1, 1, 1 }, { 1, 1, 1, 1 }}; + int32_t max_len = vsi_nn_kernel_param_get_int32( params, "max_len" ); + int32_t is2Dflg = 0; + + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + status = _optimize_mask_shape(inputs, outputs, max_len, new_shape[0], new_shape[1], &is2Dflg); + if ( VSI_SUCCESS != status ) + { + goto final; + } + rs_input = vsi_nn_kernel_tensor_reshape(inputs[0]->t, new_shape[0], 2); + rs_output = vsi_nn_kernel_tensor_reshape(outputs[0]->t, new_shape[1], 4); + + status = _query_kernel( inputs, outputs, kernel, is2Dflg ); + if ( VSI_SUCCESS == status ) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + uint32_t index = 0; + /* Pass parameters to node. */ + tmp_params[index++] = rs_input; + tmp_params[index++] = rs_output; + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &max_len ); + status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &tmp_params[2] ); + } + } + +final: + if (rs_input) + { + vsi_nn_kernel_tensor_release( &rs_input ); + } + if (rs_output) + { + vsi_nn_kernel_tensor_release( &rs_output ); + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( sequence_mask, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/slice_evis.c b/src/tim/vx/internal/src/kernel/evis/slice_evis.c new file mode 100644 index 0000000..35b8b99 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/slice_evis.c @@ -0,0 +1,451 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_dtype_util.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + +__BEGIN_DECLS + + +#define _SLICE_KERNEL_SOURCE "slice" +#define _SLICE_KERNEL_NAME CVIVANTE_NAMESPACE("evis.slice") + + // Add kernel hashtable here +#define SLICE_SH_KERNEL_NAME(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ + CVIVANTE_NAMESPACE("evis.slice_"#IN0_DTYPE"_"#IN1_DTYPE"to"#OUT_DTYPE) + + // Add kernel hashtable here +#define SLICE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE , _IMAGE_2D, _SAMEFL) \ + (( IN1_DTYPE << 18 ) | ( IN0_DTYPE << 10 ) | ( OUT_DTYPE << 2 ) | (_IMAGE_2D << 1) | (_SAMEFL)) + +#define PACK_KERNEL_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, SOURCE ) \ +{ SLICE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0, 0 ), \ + SLICE_SH_KERNEL_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), SOURCE } + +#define SLICE_SH_KERNEL_2D_NAME(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ + CVIVANTE_NAMESPACE("evis.slice_"#IN0_DTYPE"_"#IN1_DTYPE"to"#OUT_DTYPE"_2D") + +#define PACK_KERNEL_MAP_2D( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, SOURCE ) \ +{ SLICE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1, 0 ), \ + SLICE_SH_KERNEL_2D_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), SOURCE } + +#define SLICE_SH_KERNEL_SAMEFL_NAME(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ + CVIVANTE_NAMESPACE("evis.slice_"#IN0_DTYPE"_"#IN1_DTYPE"to"#OUT_DTYPE"_SAMEFL") + +#define PACK_KERNEL_MAP_SAMEFL( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, SOURCE ) \ +{ SLICE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0, 1 ), \ + SLICE_SH_KERNEL_SAMEFL_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), SOURCE } + +#define SLICE_SH_KERNEL_SAMEFL_2D_NAME(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ + CVIVANTE_NAMESPACE("evis.slice_"#IN0_DTYPE"_"#IN1_DTYPE"to"#OUT_DTYPE"_SAMEFL_2D") + +#define PACK_KERNEL_MAP_SAMEFL_2D( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, SOURCE ) \ +{ SLICE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1, 1 ), \ + SLICE_SH_KERNEL_SAMEFL_2D_NAME( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ), SOURCE } + + typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _slice_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( F16, I32, F16, _SLICE_KERNEL_SOURCE ), + PACK_KERNEL_MAP( I16, I32, I16, _SLICE_KERNEL_SOURCE ), + PACK_KERNEL_MAP( U8, I32, U8, _SLICE_KERNEL_SOURCE ), + PACK_KERNEL_MAP( I8, I32, I8, _SLICE_KERNEL_SOURCE ), + + PACK_KERNEL_MAP_2D( F16, I32, F16, _SLICE_KERNEL_SOURCE ), + PACK_KERNEL_MAP_2D( I16, I32, I16, _SLICE_KERNEL_SOURCE ), + PACK_KERNEL_MAP_2D( U8, I32, U8, _SLICE_KERNEL_SOURCE ), + PACK_KERNEL_MAP_2D( I8, I32, I8, _SLICE_KERNEL_SOURCE ), + + PACK_KERNEL_MAP_SAMEFL( I16, I32, I16, _SLICE_KERNEL_SOURCE ), + PACK_KERNEL_MAP_SAMEFL( U8, I32, U8, 
_SLICE_KERNEL_SOURCE ), + + PACK_KERNEL_MAP_SAMEFL_2D( I16, I32, I16, _SLICE_KERNEL_SOURCE ), + PACK_KERNEL_MAP_SAMEFL_2D( U8, I32, U8, _SLICE_KERNEL_SOURCE ), +}; + +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (1) +#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) +/* +* Kernel params +*/ +static vx_param_description_t _slice_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _SLICE_PARAM_NUM _cnt_of_array( _slice_kernel_param_def ) +#define SCALAR_SAMLEFL_VALUE (3) +/* +* Kernel initializer +*/ +DEF_KERNEL_INITIALIZER(_slice_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ +#define _PACK_SLICE_KEY( IN0_TYPE, OUT_TYPE, SAMLEFL) \ + (IN0_TYPE | (OUT_TYPE << 8) | (SAMLEFL << 16)) + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * output_attr = NULL; + vsi_nn_kernel_tensor_attr_t * input_attr = NULL; + vsi_int_array_t * out_shape = NULL; + vsi_nn_kernel_dtype_e input_dtype = F16; + vsi_nn_kernel_dtype_e output_dtype = F16; + float scaleIn = 1.0f; + float scaleOut = 1.0f; + int32_t output_ZP = 0; + int32_t input_ZP = 0; + int32_t srcFixPointPos = 0; + int32_t dstFixPointPos = 0; + int32_t is_samefl = 0; + uint32_t pack_key = 0; + + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_SAMLEFL_VALUE], &is_samefl); + CHECK_STATUS_FAIL_GOTO(status, final ); + + out_shape = output_attr->shape; + input_dtype = input_attr->dtype; + output_dtype = output_attr->dtype; + + pack_key = _PACK_SLICE_KEY( input_dtype, output_dtype, is_samefl); + + if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant) + { + srcFixPointPos = input_attr->dfp.fl; + if (srcFixPointPos > 0) + { + scaleIn = (1.0f / ((float) ((int64_t)1 << srcFixPointPos))); + } + else + { + scaleIn = ((float) ((int64_t)1 << -srcFixPointPos)); + } + } + else if (VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant) + { + input_ZP = input_attr->asymm.zero_point; + scaleIn = input_attr->asymm.scale; + } + + if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant) + { + dstFixPointPos = output_attr->dfp.fl; + if (dstFixPointPos > 0) + { + scaleOut = (1.0f / ((float) ((int64_t)1 << dstFixPointPos))); + } + else + { + scaleOut = ((float) ((int64_t)1 << -dstFixPointPos)); + } + } + else if (VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant) + { + output_ZP = output_attr->asymm.zero_point; + scaleOut = output_attr->asymm.scale; + } + + if ((F16 == input_dtype) + || (I16 == input_dtype) + || (BF16 == input_dtype) + ) + { + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + } + else + { + gpu_param.global_scale[0] = 16; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + } + + gpu_param.dim = out_shape->size < 3 ? 
2 : 3; + gpu_param.global_size[0] = gpu_align_p2( + (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1; + + switch (pack_key) + { + case _PACK_SLICE_KEY(I16, I16, 0): + case _PACK_SLICE_KEY(U8, U8, 0): + case _PACK_SLICE_KEY(I8, I8, 0): + case _PACK_SLICE_KEY(I16, F16, 0): + case _PACK_SLICE_KEY(U8, F16, 0): + case _PACK_SLICE_KEY(I8, F16, 0): + case _PACK_SLICE_KEY(F16, I16, 0): + case _PACK_SLICE_KEY(F16, U8, 0): + case _PACK_SLICE_KEY(F16, I8, 0): + { + float uint8Scale = scaleIn / scaleOut; + uint16_t M0 = 0; + int32_t postShift = 0; + uint32_t multAndoutZP[2] = {0}; + gpu_dp_inst_t uniU8MulAndPostShift_Lo_2x8 = {{ + 0xdddddddd, // TCfg + 0x44444444, // ASelt + 0x13121110, 0x17161514, // ABin + 0x11111111, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniU8MulAndPostShift_Hi_2x8 = {{ + 0xdddddddd, // TCfg + 0x44444444, // ASelt + 0x1b1a1918, 0x1f1e1d1c, // ABin + 0x11111111, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + gpu_quantize_multiplier_16bit(uint8Scale, &M0, &postShift); + multAndoutZP[0] = (uint32_t)(M0); + multAndoutZP[1] = (uint32_t)((output_ZP << postShift) - input_ZP * M0); + + uniU8MulAndPostShift_Lo_2x8.data[7] |= (postShift & 0x1F); + uniU8MulAndPostShift_Hi_2x8.data[7] |= (postShift & 0x1F); + + status = vsi_nn_kernel_gpu_add_param( node, "uniU8MulAndPostShift_Lo_2x8", &uniU8MulAndPostShift_Lo_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniU8MulAndPostShift_Hi_2x8", &uniU8MulAndPostShift_Hi_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "multAndoutZP", multAndoutZP); + } + break; + default: + break; + } + CHECK_STATUS_FAIL_GOTO(status, final ); + +#undef _PACK_SLICE_KEY + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(output_attr); + SAFE_FREE_TENSOR_ATTR(input_attr); + return status; +} /* _slice_initializer() */ + +static vsi_bool _is_same_quant + ( + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_dtype_t *src_dtype = NULL,*dst_dtype = NULL; + + src_dtype = &inputs[0]->attr.dtype; + dst_dtype = &outputs[0]->attr.dtype; + + if (vsi_nn_DtypeCompare(src_dtype, dst_dtype) == FALSE) + { + return FALSE; + } + + return TRUE; +} /* _is_same_quant */ + +/* +* Query kernel +*/ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const* const inputs, + vsi_nn_tensor_t * const* const outputs, + vsi_bool image_2d, + vsi_bool is_same_quant + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in0_dtype; + vsi_nn_kernel_dtype_e in1_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _slice_kernel_map; + size_t kernel_map_size = _cnt_of_array( _slice_kernel_map ); + vx_param_description_t * param_def = _slice_kernel_param_def; + size_t param_def_size = _cnt_of_array( _slice_kernel_param_def ); + 
vx_kernel_initialize_f initializer = _slice_initializer; + + uint32_t key; + uint32_t i; + + in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (is_same_quant && (F16 == in0_dtype || BF16 == in0_dtype) ) + { + in0_dtype = I16; + out_dtype = I16; + } + else if (is_same_quant && (I8 == in0_dtype || BOOL8 == in0_dtype) ) + { + in0_dtype = U8; + out_dtype = U8; + } + + key = SLICE_HASH_KEY( in0_dtype, in1_dtype, out_dtype, image_2d, is_same_quant ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_SLICE_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + vsi_bool image_2d = FALSE; + uint32_t rank[_IO_NUM] = {0}; + int32_t shapes[_IO_NUM][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; + vsi_nn_tensor_t* reshape_tensors[_IO_NUM] = { NULL }; + int32_t i = 0; + int32_t input_batch = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1; + int32_t output_batch = outputs[0]->attr.dim_num > 3 ? outputs[0]->attr.size[3] : 1; + vsi_bool is_same_quant = FALSE; + + vsi_nn_kernel_optimize_1d_tensor_shape( (const int32_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num, + shapes[0], &rank[0]); + vsi_nn_kernel_optimize_1d_tensor_shape( (const int32_t*)inputs[1]->attr.size, inputs[1]->attr.dim_num, + shapes[1], &rank[1]); + vsi_nn_kernel_optimize_1d_tensor_shape( (const int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num, + shapes[2], &rank[2]); + + for (i = 0; i < _INPUT_NUM; i++) + { + reshape_tensors[i] = vsi_nn_reshape_tensor( graph, + inputs[i], (uint32_t*)shapes[i], rank[i] ); + } + reshape_tensors[_INPUT_NUM] = vsi_nn_reshape_tensor( graph, + outputs[0], (uint32_t*)shapes[_INPUT_NUM], rank[_INPUT_NUM] ); + + if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)reshape_tensors[0]->attr.size, + reshape_tensors[0]->attr.dim_num ) || input_batch != output_batch ) + { + return NULL; + } + + image_2d = (rank[0] < 3 || shapes[0][2] == 1); + is_same_quant = _is_same_quant(inputs, outputs); + + status = _query_kernel( kernel, inputs, outputs , image_2d, is_same_quant ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _SLICE_PARAM_NUM, + reshape_tensors, input_num, &reshape_tensors[_INPUT_NUM], output_num ); + node_params[SCALAR_SAMLEFL_VALUE] = vsi_nn_kernel_scalar_create( graph, I32, &is_same_quant ); + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _SLICE_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_SAMLEFL_VALUE] ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( slice, _setup ) diff --git a/src/tim/vx/internal/src/kernel/evis/tile_evis.c b/src/tim/vx/internal/src/kernel/evis/tile_evis.c index a076329..5f3465b 100644 --- a/src/tim/vx/internal/src/kernel/evis/tile_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/tile_evis.c @@ -41,51 +41,55 @@ __BEGIN_DECLS /* * Define kernel meta. */ -#define HASH_TILE_KEY(_input_type, _output_type, _image_2d, _remainder) \ - ((_input_type << 18) | (_output_type << 4) | (_image_2d << 3) | (_remainder)) +#define HASH_TILE_KEY(_input_type, _output_type, _image_2d, _is_size1, _remainder) \ + ((_input_type << 19) | (_output_type << 5) | (_image_2d << 4) | (_is_size1 << 3) | (_remainder)) #define KERNEL_SOURCE "tile", #define KERNEL_SOURCE1 "tile_mix", #define STR(a) #a + #define TENSOR_TILE_KEY_DIM0_IS1_2D(SRC_TYPE, OUT_TYPE) \ + { HASH_TILE_KEY(SRC_TYPE, OUT_TYPE, 1, 1, 1), \ + CVIVANTE_NAMESPACE("evis.tile_1toN_"#SRC_TYPE"to"#OUT_TYPE"_2D"), \ + KERNEL_SOURCE }, + #define HASH_TILE_SH_KERNEL_NAME(SRC_TYPE, DST_TYPE, REMAINDER) \ CVIVANTE_NAMESPACE("evis.tile_remain"STR(REMAINDER)"_"#SRC_TYPE"to"#DST_TYPE) -#define TENSOR_TILE_KERNELS(SRC_TYPE, OUT_TYPE, REMAINDER) \ - { HASH_TILE_KEY(SRC_TYPE, OUT_TYPE, 0, REMAINDER), \ +#define TENSOR_TILE_KERNELS(SRC_TYPE, OUT_TYPE, ISSIZE1, REMAINDER) \ + { HASH_TILE_KEY(SRC_TYPE, OUT_TYPE, 0, ISSIZE1, REMAINDER), \ HASH_TILE_SH_KERNEL_NAME(SRC_TYPE, OUT_TYPE, REMAINDER), \ KERNEL_SOURCE1 }, #define HASH_TILE_SH_KERNEL_2D_NAME(SRC_TYPE, DST_TYPE, REMAINDER) \ CVIVANTE_NAMESPACE("evis.tile_remain"STR(REMAINDER)"_"#SRC_TYPE"to"#DST_TYPE"_2D") -#define TENSOR_TILE_KERNELS_2D(SRC_TYPE, OUT_TYPE, REMAINDER) \ - { HASH_TILE_KEY(SRC_TYPE, OUT_TYPE, 1, REMAINDER), \ +#define TENSOR_TILE_KERNELS_2D(SRC_TYPE, OUT_TYPE, ISSIZE1, REMAINDER) \ + { HASH_TILE_KEY(SRC_TYPE, OUT_TYPE, 1, ISSIZE1, REMAINDER), \ HASH_TILE_SH_KERNEL_2D_NAME(SRC_TYPE, OUT_TYPE, REMAINDER), \ KERNEL_SOURCE1 }, #define TENSOR_TILE_8BITS_KERNELS(SRC_TYPE, OUT_TYPE, REMAINDER) \ - { HASH_TILE_KEY(SRC_TYPE, OUT_TYPE, 0, REMAINDER), \ + { HASH_TILE_KEY(SRC_TYPE, OUT_TYPE, 0, 0, REMAINDER), \ HASH_TILE_SH_KERNEL_NAME(U8, U8, REMAINDER), \ KERNEL_SOURCE }, #define TENSOR_TILE_16BITS_KERNELS(SRC_TYPE, OUT_TYPE, REMAINDER) \ - { HASH_TILE_KEY(SRC_TYPE, OUT_TYPE, 0, REMAINDER), \ + { HASH_TILE_KEY(SRC_TYPE, OUT_TYPE, 0, 0, REMAINDER), \ HASH_TILE_SH_KERNEL_NAME(I16, I16, REMAINDER), \ KERNEL_SOURCE }, #define TENSOR_TILE_8BITS_2D_KERNELS(SRC_TYPE, OUT_TYPE, REMAINDER) \ - { HASH_TILE_KEY(SRC_TYPE, OUT_TYPE, 1, REMAINDER), \ + { HASH_TILE_KEY(SRC_TYPE, OUT_TYPE, 1, 0, REMAINDER), \ HASH_TILE_SH_KERNEL_2D_NAME(U8, U8, REMAINDER), \ KERNEL_SOURCE }, #define TENSOR_TILE_16BITS_2D_KERNELS(SRC_TYPE, OUT_TYPE, REMAINDER) \ - { HASH_TILE_KEY(SRC_TYPE, OUT_TYPE, 1, REMAINDER), \ + { HASH_TILE_KEY(SRC_TYPE, OUT_TYPE, 1, 0, REMAINDER), \ HASH_TILE_SH_KERNEL_2D_NAME(I16, I16, REMAINDER), \ KERNEL_SOURCE }, - static const struct { uint32_t key; char* function_name; @@ -176,23 +180,44 @@ static const struct { TENSOR_TILE_16BITS_2D_KERNELS( BF16, BF16, 6) TENSOR_TILE_16BITS_2D_KERNELS( BF16, BF16, 7) - TENSOR_TILE_KERNELS( U8, F16, 0) - TENSOR_TILE_KERNELS( U8, F16, 1) - TENSOR_TILE_KERNELS( U8, F16, 2) - TENSOR_TILE_KERNELS( U8, F16, 3) - TENSOR_TILE_KERNELS( U8, F16, 4) - TENSOR_TILE_KERNELS( U8, F16, 5) - 
TENSOR_TILE_KERNELS( U8, F16, 6) - TENSOR_TILE_KERNELS( U8, F16, 7) + TENSOR_TILE_KERNELS( U8, F16, 0, 0) + TENSOR_TILE_KERNELS( U8, F16, 0, 1) + TENSOR_TILE_KERNELS( U8, F16, 0, 2) + TENSOR_TILE_KERNELS( U8, F16, 0, 3) + TENSOR_TILE_KERNELS( U8, F16, 0, 4) + TENSOR_TILE_KERNELS( U8, F16, 0, 5) + TENSOR_TILE_KERNELS( U8, F16, 0, 6) + TENSOR_TILE_KERNELS( U8, F16, 0, 7) - TENSOR_TILE_KERNELS_2D( U8, F16, 0) - TENSOR_TILE_KERNELS_2D( U8, F16, 1) - TENSOR_TILE_KERNELS_2D( U8, F16, 2) - TENSOR_TILE_KERNELS_2D( U8, F16, 3) - TENSOR_TILE_KERNELS_2D( U8, F16, 4) - TENSOR_TILE_KERNELS_2D( U8, F16, 5) - TENSOR_TILE_KERNELS_2D( U8, F16, 6) - TENSOR_TILE_KERNELS_2D( U8, F16, 7) + TENSOR_TILE_KERNELS( U8, F16, 1, 0) + TENSOR_TILE_KERNELS( U8, F16, 1, 1) + TENSOR_TILE_KERNELS( U8, F16, 1, 2) + TENSOR_TILE_KERNELS( U8, F16, 1, 3) + TENSOR_TILE_KERNELS( U8, F16, 1, 4) + TENSOR_TILE_KERNELS( U8, F16, 1, 5) + TENSOR_TILE_KERNELS( U8, F16, 1, 6) + TENSOR_TILE_KERNELS( U8, F16, 1, 7) + + TENSOR_TILE_KERNELS_2D( U8, F16, 0, 0) + TENSOR_TILE_KERNELS_2D( U8, F16, 0, 1) + TENSOR_TILE_KERNELS_2D( U8, F16, 0, 2) + TENSOR_TILE_KERNELS_2D( U8, F16, 0, 3) + TENSOR_TILE_KERNELS_2D( U8, F16, 0, 4) + TENSOR_TILE_KERNELS_2D( U8, F16, 0, 5) + TENSOR_TILE_KERNELS_2D( U8, F16, 0, 6) + TENSOR_TILE_KERNELS_2D( U8, F16, 0, 7) + + TENSOR_TILE_KERNELS_2D( U8, F16, 1, 0) + TENSOR_TILE_KERNELS_2D( U8, F16, 1, 1) + TENSOR_TILE_KERNELS_2D( U8, F16, 1, 2) + TENSOR_TILE_KERNELS_2D( U8, F16, 1, 3) + TENSOR_TILE_KERNELS_2D( U8, F16, 1, 4) + TENSOR_TILE_KERNELS_2D( U8, F16, 1, 5) + TENSOR_TILE_KERNELS_2D( U8, F16, 1, 6) + TENSOR_TILE_KERNELS_2D( U8, F16, 1, 7) + + TENSOR_TILE_KEY_DIM0_IS1_2D(U8, U8) + TENSOR_TILE_KEY_DIM0_IS1_2D(I16, I16) }; /* @@ -383,11 +408,24 @@ static vsi_status _query_kernel vsi_nn_kernel_dtype_e output_dtype; vsi_status status = VSI_FAILURE; uint32_t key; - int i; + int32_t i = 0; + int32_t dim0_size1 = inputs[0]->attr.size[0] == 1 ? 1 : 0; input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - key = HASH_TILE_KEY( input_dtype, output_dtype, image_2d, remainder); + + if (input_dtype == output_dtype && image_2d == TRUE && dim0_size1) + { + input_dtype = input_dtype == I8 ? U8 : input_dtype; + input_dtype = input_dtype == F16 ? I16 : input_dtype; + input_dtype = input_dtype == BF16 ? I16 : input_dtype; + output_dtype = input_dtype; + key = HASH_TILE_KEY(input_dtype, output_dtype, 1, 1, 1); + } + else + { + key = HASH_TILE_KEY( input_dtype, output_dtype, image_2d, dim0_size1, remainder); + } for( i = 0; i < _cnt_of_array(_tile_evis_kernel_map); i ++ ) { diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_param.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_param.c index 12adee6..20b4589 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_param.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_param.c @@ -42,6 +42,7 @@ typedef enum _PARAM_I64, _PARAM_F32, _PARAM_BUFFER, + _PARAM_CONST_BUFFER, _PARAM_STR, } _param_dtype_e; @@ -54,6 +55,7 @@ typedef struct int64_t int64; float float32; void* buffer; + const void* const_buffer; const char* str; } value; size_t size; @@ -164,6 +166,45 @@ void* vsi_nn_kernel_param_get_buffer return p->value.buffer; } /* vsi_nn_kernel_param_get_buffer() */ +vsi_bool vsi_nn_kernel_param_add_const_buffer + ( + vsi_nn_kernel_param_t * params, + const char * key, + const void * value, + size_t size + ) +{ + _param_type* p; + CHECK_PARAM_NULL( params, FALSE, "Params is null ptr." 
); + CHECK_PARAM_NULL( key, FALSE, "Param key is null ptr." ); + p = malloc( sizeof(_param_type) ); + CHECK_PARAM_NULL( p, FALSE, "Out of memory, add param fail." ); + p->type = _PARAM_CONST_BUFFER; + p->value.const_buffer = value; + p->size = size; + vsi_nn_hashmap_add( params, key, p ); + return TRUE; +} /* vsi_nn_kernel_param_add_const_buffer() */ + +const void* vsi_nn_kernel_param_get_const_buffer + ( const vsi_nn_kernel_param_t * params, const char * key, size_t * size) +{ + _param_type* p; + CHECK_PARAM_NULL( params, FALSE, "Params is null ptr." ); + CHECK_PARAM_NULL( key, FALSE, "Param key is null ptr." ); + p = vsi_nn_hashmap_get( params, key ); + CHECK_PARAM_NULL( p, 0, "Key %s not in params.", key ); + if( p->type != _PARAM_CONST_BUFFER ) + { + VSILOGW("Key %s is not \"const buffer\"", key ); + } + if( size != NULL ) + { + *size = p->size; + } + return p->value.const_buffer; +} /* vsi_nn_kernel_param_get_const_buffer() */ + vsi_nn_kernel_param_t* vsi_nn_kernel_param_create() { return (vsi_nn_kernel_param_t*)vsi_nn_hashmap_create(); diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c index 3b446b8..fd4d2e7 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c @@ -57,12 +57,27 @@ KERNEL_SELECTOR( depthwise_conv1d ) vsi_nn_kernel_selector_t * selector ) { + int32_t dilation = vsi_nn_kernel_param_get_int32( params, "dilation" ); + int32_t kernel = inputs[1]->attr.size[0]; + int32_t real_kernel = 0; + int32_t stride = vsi_nn_kernel_param_get_int32( params, "stride" ); vsi_nn_kernel_pirority_t pirority[] = { { VSI_NN_KERNEL_TYPE_VX, 0 }, { VSI_NN_KERNEL_TYPE_EVIS, 3 }, { VSI_NN_KERNEL_TYPE_CL, 2 }, { VSI_NN_KERNEL_TYPE_CPU, 1 }, }; + dilation = dilation == 0 ? 0 : dilation - 1; + real_kernel = (kernel - 1) * dilation + kernel; + + if (real_kernel < 16 && stride < 3) + { + pirority[0].fps = 3; + pirority[1].fps = 2; + pirority[2].fps = 1; + pirority[3].fps = 0; + } + return vsi_nn_kernel_pirority_set( selector, pirority, _cnt_of_array(pirority) ); } /* depthwise_conv1d */ @@ -111,5 +126,6 @@ REGISTER_VX_FIRST_KERNEL_SELECTOR(mish) REGISTER_VX_FIRST_KERNEL_SELECTOR(hard_sigmoid) REGISTER_VX_FIRST_KERNEL_SELECTOR(clip) REGISTER_VX_FIRST_KERNEL_SELECTOR(relu_keras) +REGISTER_VX_FIRST_KERNEL_SELECTOR(erf) __END_DECLS diff --git a/src/tim/vx/internal/src/kernel/vx/erf_vx.c b/src/tim/vx/internal/src/kernel/vx/erf_vx.c new file mode 100644 index 0000000..8daf0be --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vx/erf_vx.c @@ -0,0 +1,216 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include +#include "utils/vsi_nn_dtype_util_prv.h" +#include "vsi_nn_tensor_util.h" +#include "kernel/vsi_nn_kernel.h" + +typedef struct _sort_lut_s +{ + float index; + float val; +} sort_lut; + +static float erf_eval(float x) +{ + float res = 0; + float tmp = x; + float factorial = 1; /*n!*/ + float x_pow = x; + int32_t one = 1; + int32_t n = 1; + + while (vsi_abs(tmp) > 1e-5) + { + res += tmp; + + factorial *= n; + one *= -1; + x_pow *= x * x; + tmp = one / factorial * x_pow / ( 2 * n + 1); + + n ++; + } +#define MUL2_RSQRTPI (1.1283791670955126f) + + res *= MUL2_RSQRTPI; + + return res; +} + +#ifdef VX_USER_LOOKUP_TABLE_SUPPORT +static int32_t _lut_comparator(const void *pa, const void *pb) +{ + sort_lut a = *(sort_lut *)pa; + sort_lut b = *(sort_lut *)pb; + float diff = a.index - b.index; + if ( diff > 0 ) + { + return 1; + } + else if ( diff < 0 ) + { + return -1; + } + + return 0; +} + +static void _set_table_lookup(float func(float), float *index, float *value) +{ +#define VSI_NN_MAX_LUT_SIZE (1024) +#define FLT16_MAX (57344) +#define FLT16_MIN (-57344) + uint32_t i = 0; + sort_lut *lut = (sort_lut *)calloc(VSI_NN_MAX_LUT_SIZE, sizeof(sort_lut)); + + for ( i = 0; i < VSI_NN_MAX_LUT_SIZE; i++) + { + int16_t val = (int16_t)(i << 6); + lut[i].index = fp16_to_fp32(val); + lut[i].val = func(lut[i].index); + } + + for (i = 0x0; i < 0x10; i++) + { + lut[i].index = 0; + lut[i].val = func(lut[i].index); + } + + for (i = 0x1F0; i < 0x200; i++) + { + lut[i].index = FLT16_MAX; + lut[i].val = func(lut[i].index); + } + + for (i = 0x3F0; i < 0x400; i++) + { + lut[i].index = FLT16_MIN; + lut[i].val = func(lut[i].index); + } + + qsort(lut, VSI_NN_MAX_LUT_SIZE, sizeof(sort_lut), _lut_comparator); + + for ( i = 0; i < VSI_NN_MAX_LUT_SIZE; i++) + { + index[i] = lut[i].index; + value[i] = lut[i].val; + } + + vsi_nn_safe_free(lut); + +#undef VSI_NN_MAX_LUT_SIZE +#undef FLT16_MIN +#undef FLT16_MAX +} +#endif + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel, + float func(float) + ) +{ +#ifdef VX_USER_LOOKUP_TABLE_SUPPORT + vx_lut lut1 = NULL; + vx_lut lut2 = NULL; + vx_node node = NULL; + float index[1024] = {0}; + float value[1024] = {0}; + + if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 || + outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 ) + { + return NULL; + } + + _set_table_lookup(func, index, value); + + lut1 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, 1024); + lut2 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, 1024); + if( NULL == lut1 || NULL == lut2 ) + { + VSILOGE("create lut object fail."); + goto OnError; + } + + vxCopyLUT(lut1, (void*)&index, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST); + vxCopyLUT(lut2, 
(void*)&value, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST); + + node = vxTensorTableLookupLayer( graph->g, inputs[0]->t, lut1, lut2, outputs[0]->t); + if( NULL == node ) + { + VSILOGE("Call vxTensorTableLookupLayer fail."); + goto OnError; + } + +OnError: + if (lut1) + { + vxReleaseLUT(&lut1); + lut1 = NULL; + } + if (lut2) + { + vxReleaseLUT(&lut2); + lut2 = NULL; + } + return (vsi_nn_kernel_node_t)node; +#else + return NULL; +#endif +} /* _setup() */ + +#define REGISTER_CLIP_OPENVX_KERNEL(KERNEL_NAME, UNARY_FUNC) \ + static vsi_nn_kernel_node_t _##KERNEL_NAME##_setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num, \ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) \ + { \ + return _setup(graph, inputs, input_num, outputs, output_num, \ + params, kernel, UNARY_FUNC); \ + } \ + REGISTER_BACKEND_OPENVX( KERNEL_NAME, _##KERNEL_NAME##_setup ) + +REGISTER_CLIP_OPENVX_KERNEL( erf, erf_eval ) + +#undef REGISTER_CLIP_OPENVX_KERNEL diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_ops_helper.cl b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_ops_helper.cl index cd255bb..9a322a5 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_ops_helper.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_ops_helper.cl @@ -1,6 +1,62 @@ #pragma OPENCL EXTENSION CL_VIV_asm : enable #pragma OPENCL EXTENSION cl_viv_vx_extension : enable +typedef struct Image +{ + __global uchar *ptr; + int stride_x; + int stride_y; +} Image; + +inline uchar* get_image_ptr_from_coord(Image img, int2 coord) +{ + return img.ptr + coord.x * img.stride_x + coord.y * img.stride_y; +} + +inline Image create_image_from_image2d(image2d_t input, int stride_x) +{ + int8 desc; + _viv_asm(COPY, desc, input, sizeof(desc)); + + Image img = + { + .ptr = (uchar*)desc.s0, + .stride_x = stride_x, + .stride_y = desc.s1 + }; + + return img; +} + +typedef struct Tensor +{ + __global uchar *ptr; + int stride_x; + int stride_y; + int stride_z; +} Tensor; + +inline uchar* create_tensor_ptr_from_coord(Tensor t, int4 coord) +{ + return t.ptr + coord.x * t.stride_x + coord.y * t.stride_y + coord.z * t.stride_z; +} + +inline Tensor create_tensor_from_image2d_array(image2d_array_t input, int stride_x) +{ + int8 desc; + _viv_asm(COPY, desc, input, sizeof(desc)); + + Tensor t = + { + .ptr = (uchar*)desc.s0, + .stride_x = stride_x, + .stride_y = desc.s1, + .stride_z = desc.s4 + }; + + return t; +} + #define readImage2DArray(Dest, Image, Coord) \ do { \ int8 desc; \ diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary.cl b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary.cl index 68febfb..5b23144 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary.cl @@ -64,6 +64,11 @@ float4 eltwise_unary_mish(float4 x, float alpha) return x; } +float4 eltwise_unary_round(float4 x, float alpha) +{ + return convert_float4(convert_int4_rte(x)); +} + #define ELTWISE_UNARY_F32(func_name) \ __kernel void func_name##_F32toF32 \ ( \ @@ -91,6 +96,7 @@ ELTWISE_UNARY_F32(elu) ELTWISE_UNARY_F32(neg) ELTWISE_UNARY_F32(mish) ELTWISE_UNARY_F32(hard_sigmoid) +ELTWISE_UNARY_F32(round) #define ELTWISE_UNARY_F32_2D(func_name) \ __kernel void func_name##_F32toF32_2D \ @@ -119,6 +125,7 @@ ELTWISE_UNARY_F32_2D(elu) ELTWISE_UNARY_F32_2D(neg) ELTWISE_UNARY_F32_2D(mish) ELTWISE_UNARY_F32_2D(hard_sigmoid) +ELTWISE_UNARY_F32_2D(round) #define ELTWISE_UNARY_U8(func_name) \ 
__kernel void func_name##_U8toU8 \ @@ -149,6 +156,7 @@ ELTWISE_UNARY_U8(elu) ELTWISE_UNARY_U8(neg) ELTWISE_UNARY_U8(mish) ELTWISE_UNARY_U8(hard_sigmoid) +ELTWISE_UNARY_U8(round) #define ELTWISE_UNARY_U8_2D(func_name) \ __kernel void func_name##_U8toU8_2D \ @@ -179,7 +187,7 @@ ELTWISE_UNARY_U8_2D(elu) ELTWISE_UNARY_U8_2D(neg) ELTWISE_UNARY_U8_2D(mish) ELTWISE_UNARY_U8_2D(hard_sigmoid) - +ELTWISE_UNARY_U8_2D(round) __kernel void neg_I32toI32 ( diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/erf.cl b/src/tim/vx/internal/src/libnnext/ops/cl/erf.cl new file mode 100644 index 0000000..9f38f95 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/erf.cl @@ -0,0 +1,113 @@ +#define MUL2_RSQRTPI (1.1283791670955126f) +float eltwise_unary_erf(float x) +{ + float res = 0; + float tmp = x; + float factorial = 1; + float x_pow = x; + float one = 1.0f; + float n = 1; + + while (fabs(tmp) > 1e-5) + { + res += tmp; + + factorial *= n; + one *= -1; + x_pow *= x * x; + tmp = one / factorial * x_pow / ( 2 * n + 1); + + n += 1.0f; + } + return res * MUL2_RSQRTPI; +} + +#define ELTWISE_UNARY_F32(func_name) \ +__kernel void func_name##_F32toF32 \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float inputScale, \ + float inputTail, \ + float outputScale, \ + float outputZP \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + float4 src = read_imagef(input, coord); \ + \ + float4 dst = 0; \ + dst.x = eltwise_unary_##func_name(src.x); \ + \ + write_imagef(output, coord, dst); \ +} +ELTWISE_UNARY_F32(erf) + +#define ELTWISE_UNARY_F32_2D(func_name) \ +__kernel void func_name##_F32toF32_2D \ + ( \ + __read_only image2d_t input, \ + __write_only image2d_t output, \ + float inputScale, \ + float inputTail, \ + float outputScale, \ + float outputZP \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + \ + float4 src = read_imagef(input, coord); \ + \ + float4 dst = 0; \ + dst.x = eltwise_unary_##func_name(src.x); \ + \ + write_imagef(output, coord, dst); \ +} +ELTWISE_UNARY_F32_2D(erf) + +#define ELTWISE_UNARY_U8(func_name) \ +__kernel void func_name##_U8toU8 \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float inputScale, \ + float inputTail, \ + float outputScale, \ + float outputZP \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + uint4 src = read_imageui(input, coord); \ + float4 data = convert_float4(src) * inputScale - inputTail; \ + \ + data.x = eltwise_unary_##func_name(data.x); \ + uint4 dst = convert_uint4(data * outputScale + outputZP); \ + \ + write_imageui(output, coord, dst); \ +} +ELTWISE_UNARY_U8(erf) + +#define ELTWISE_UNARY_U8_2D(func_name) \ +__kernel void func_name##_U8toU8_2D \ + ( \ + __read_only image2d_t input, \ + __write_only image2d_t output, \ + float inputScale, \ + float inputTail, \ + float outputScale, \ + float outputZP \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + \ + uint4 src = read_imageui(input, coord); \ + float4 data = convert_float4(src) * inputScale - inputTail; \ + \ + data.x = eltwise_unary_##func_name(data.x); \ + uint4 dst = convert_uint4(data * outputScale + outputZP); \ + \ + write_imageui(output, coord, dst); \ +} +ELTWISE_UNARY_U8_2D(erf) diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/floordiv.cl b/src/tim/vx/internal/src/libnnext/ops/cl/floordiv.cl index 2164ea2..581694a 100644 --- 
a/src/tim/vx/internal/src/libnnext/ops/cl/floordiv.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/floordiv.cl @@ -50,6 +50,44 @@ __kernel void floordiv_I32I32toI32_2D( write_imagei(output, coord, dst); } +__kernel void floordiv_I32I32toU8( + __read_only image2d_array_t input, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputTail ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 src0; + int4 src1; + readImage2DArray(src0, input, coord); + readImage2DArray(src1, input1, coord); + uint4 dst = convert_uint4(floor(convert_float4(src0) / convert_float4(src1)) * outputScale + outputTail); + write_imageui(output, coord, dst); +} + +__kernel void floordiv_I32I32toU8_2D( + __read_only image2d_t input, + __read_only image2d_t input1, + __write_only image2d_t output, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputTail ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int4 src0 = read_imagei(input, coord); + int4 src1 = read_imagei(input1, coord); + uint4 dst = convert_uint4(floor(convert_float4(src0) / convert_float4(src1)) * outputScale + outputTail); + write_imageui(output, coord, dst); +} + __kernel void floordiv_U8U8toU8( __read_only image2d_array_t input, __read_only image2d_array_t input1, @@ -94,3 +132,49 @@ __kernel void floordiv_U8U8toU8_2D( uint4 dst = convert_uint4(out); write_imageui(output, coord, dst); } + +__kernel void floordiv_U8I32toU8( + __read_only image2d_array_t input, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputTail ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + uint4 src0; + int4 src1; + float4 in0, in1, out; + readImage2DArray(src0, input, coord); + readImage2DArray(src1, input1, coord); + in0 = convert_float4(src0) * input0Scale + input0Tail; + in1 = convert_float4(src1); + out = floor(in0 / in1) * outputScale + outputTail; + uint4 dst = convert_uint4(out); + write_imageui(output, coord, dst); +} + +__kernel void floordiv_U8I32toU8_2D( + __read_only image2d_t input, + __read_only image2d_t input1, + __write_only image2d_t output, + float input0Scale, + float input0Tail, + float input1Scale, + float input1Tail, + float outputScale, + float outputTail ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + uint4 src0 = read_imageui(input, coord); + int4 src1 = read_imagei(input1, coord); + float4 in0, in1, out; + in0 = convert_float4(src0) * input0Scale + input0Tail; + in1 = convert_float4(src1); + out = floor(in0 / in1) * outputScale + outputTail; + uint4 dst = convert_uint4(out); + write_imageui(output, coord, dst); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/group_normalization_f32.cl b/src/tim/vx/internal/src/libnnext/ops/cl/group_normalization_f32.cl new file mode 100644 index 0000000..cfb6014 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/group_normalization_f32.cl @@ -0,0 +1,248 @@ +__kernel void group_norm_sumsqr_F32( + __read_only image2d_array_t input, + __write_only image2d_t output, + float eps, + int is2d, + float input_zp, + float input_scale, + int width, + int height + ) +{ + int gidx = get_global_id(0); + int gidz = get_global_id(1); + int lidx = get_local_id(0); + + 
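+    /* Partial reduction: each work-item accumulates the sum and
+     * sum-of-squares of one x column of its input slice; the partials are
+     * staged in the 16-entry local arrays below, and work-item 0 folds
+     * them as four float4 dot products before writing sum and sqr to
+     * x = group_id(0) * 4 and group_id(0) * 4 + 1 of the output row. */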
int4 coord = (int4)(gidx, 0, gidz, 0); + float4 data; + float sum = 0, sqr = 0; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + if(gidx < width) + { + for(coord.y = 0; coord.y < height;) + { + data = read_imagef(input, coord); + coord.y++; + sum += data.x; + sqr += data.x * data.x; + } + } + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 dst = (float4)(0); + dst.x = sum; + write_imagef(output, coord_out.xy, dst); + coord_out.x++; + dst.x = sqr; + write_imagef(output, coord_out.xy, dst); + } +} + +__kernel void group_norm_sumsqr_F32_2D( + __read_only image2d_t input, + __write_only image2d_t output, + float eps, + int is2d, + float input_zp, + float input_scale, + int width, + int height + ) +{ + int gidx = get_global_id(0); + int gidz = get_global_id(1); + int lidx = get_local_id(0); + + int2 coord = (int2)(gidx, gidz); + float4 data; + float sum = 0, sqr = 0; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + if(gidx < width) + { + data = read_imagef(input, coord); + sum = data.x; + sqr = data.x * data.x; + } + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 dst = (float4)(0); + dst.x = sum; + write_imagef(output, coord_out.xy, dst); + coord_out.x++; + dst.x = sqr; + write_imagef(output, coord_out.xy, dst); + } +} + +__kernel void group_norm_meanvari( + __read_only image2d_t input, + __write_only image2d_t output, + float eps, + float group_ratio, + int group_stride + ) +{ + int gidx = get_global_id(0); + int lidx = get_local_id(0); + int2 coord = (int2)(gidx, get_global_id(1)); + + float2 sum_sqr = (float2)(0); + float4 mean_vari = (float4)(0); + + __local float2 lcl_data[16]; + __local float2 lcl_sum[4]; + + for(; coord.x < group_stride;) + { + mean_vari.x += read_imagef(input, coord).x; + coord.x++; + mean_vari.y += read_imagef(input, coord).x; + coord.x+=63; + } + lcl_data[lidx] = mean_vari.xy; + barrier(CLK_LOCAL_MEM_FENCE); + if(lidx < 4) + { + float2 tmpSum = (float2)(0); + for(int i = lidx; i < 16; i+=4) + { + tmpSum += lcl_data[i]; + } + lcl_sum[lidx] = tmpSum; + } + barrier(CLK_LOCAL_MEM_FENCE); + if(lidx == 0) + { + for(int i = 0; i < 4; i++) + { + sum_sqr += lcl_sum[i]; + } + mean_vari.xy = sum_sqr * group_ratio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + coord.x = 0; + write_imagef(output, coord, mean_vari); + coord.x++; + float4 data; + data.x = mean_vari.y; + write_imagef(output, coord, data); + } +} + +__kernel void group_norm_F32toF32( + __read_only image2d_array_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __read_only image2d_t meanVari, + __write_only image2d_array_t output, + float eps, + int is2d, + float input_zp, + float input_scale, + float output_zp, + float output_scale, + float 
rSpaceOrg, + int width, + int height, + int pStride + ) +{ + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 1); + + float4 gamma = read_imagef(scale, coord_para.xy); + float4 beta = read_imagef(bias, coord_para.xy); + float4 mean_vari = read_imagef(meanVari, coord_para.zy); + mean_vari.y = read_imagef(meanVari, coord_para.wy).x; + float4 data = read_imagef(input, coord); + + float scale_vari, bias_val; + + scale_vari = gamma.s0 * mean_vari.s1; + bias_val = (beta.s0 - scale_vari * mean_vari.s0); + + float4 dst; + + dst.x = data.x * scale_vari + bias_val; + write_imagef(output, coord, dst); +} + +__kernel void group_norm_F32toF32_2D( + __read_only image2d_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __read_only image2d_t meanVari, + __write_only image2d_t output, + float eps, + int is2d, + float input_zp, + float input_scale, + float output_zp, + float output_scale, + float rSpaceOrg, + int width, + int height, + int pStride + ) +{ + int gidz = get_global_id(1); + int2 coord = (int2)(get_global_id(0), gidz); + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 1); + + float4 gamma = read_imagef(scale, coord_para.xy); + float4 beta = read_imagef(bias, coord_para.xy); + float4 mean_vari = read_imagef(meanVari, coord_para.zy); + mean_vari.y = read_imagef(meanVari, coord_para.wy).x; + float4 data = read_imagef(input, coord); + + float scale_vari, bias_val; + + scale_vari = gamma.s0 * mean_vari.s1; + bias_val = beta.s0 - scale_vari * mean_vari.s0; + + float4 dst; + + dst.x = data.x * scale_vari + bias_val; + write_imagef(output, coord, dst); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/group_normalization_i32.cl b/src/tim/vx/internal/src/libnnext/ops/cl/group_normalization_i32.cl new file mode 100644 index 0000000..72690c7 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/group_normalization_i32.cl @@ -0,0 +1,278 @@ +__kernel void group_norm_sumsqr_I32( + __read_only image2d_array_t input, + __write_only image2d_t output, + float eps, + int is2d, + float input_zp, + float input_scale, + int width, + int height + ) +{ + int gidx = get_global_id(0); + int gidz = get_global_id(1); + int lidx = get_local_id(0); + + int4 coord = (int4)(gidx, 0, gidz, 0); + float4 data; + float sum = 0, sqr = 0; + float tmpSum = 0; + float e2InScale = input_scale * input_scale; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + if(gidx < width) + { + for(coord.y = 0; coord.y < height;) + { + data = convert_float4(read_imagei(input, coord)); + coord.y++; + tmpSum += data.x; + sqr += (data.x * data.x * e2InScale); + } + sum = tmpSum * input_scale; + } + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 dst = (float4)(0); + dst.x = sum; + write_imagef(output, coord_out.xy, dst); + coord_out.x++; + dst.x = sqr; + write_imagef(output, coord_out.xy, dst); + } +} + +__kernel void group_norm_sumsqr_I32_2D( + __read_only image2d_t input, + __write_only image2d_t output, + float eps, + int is2d, + 
float input_zp, + float input_scale, + int width, + int height + ) +{ + int gidx = get_global_id(0); + int gidz = get_global_id(1); + int lidx = get_local_id(0); + + int2 coord = (int2)(gidx, gidz); + float4 data; + float sum = 0, sqr = 0; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + if(gidx < width) + { + data = convert_float4(read_imagei(input, coord)); + sum = data.x * input_scale; + sqr = sum * sum; + } + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 dst = (float4)(0); + dst.x = sum; + write_imagef(output, coord_out.xy, dst); + coord_out.x++; + dst.x = sqr; + write_imagef(output, coord_out.xy, dst); + } +} + +__kernel void group_norm_I32toI32( + __read_only image2d_array_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __read_only image2d_t meanVari, + __write_only image2d_array_t output, + float eps, + int is2d, + float input_zp, + float input_scale, + float output_zp, + float output_scale, + float rSpaceOrg, + int width, + int height, + int pStride + ) +{ + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 1); + + float4 gamma = read_imagef(scale, coord_para.xy); + float4 beta = read_imagef(bias, coord_para.xy); + float4 mean_vari = read_imagef(meanVari, coord_para.zy); + mean_vari.y = read_imagef(meanVari, coord_para.wy).x; + float4 data = convert_float4(read_imagei(input, coord)); + + float scale_vari, bias_val; + + scale_vari = gamma.s0 * mean_vari.s1; + float alpha = input_scale * output_scale * scale_vari; + bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale; + + int4 dst; + float4 norm; + norm.x = data.x * alpha + bias_val; + dst = convert_int4_rte(norm); + write_imagei(output, coord, dst); +} + +__kernel void group_norm_I32toI32_2D( + __read_only image2d_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __read_only image2d_t meanVari, + __write_only image2d_t output, + float eps, + int is2d, + float input_zp, + float input_scale, + float output_zp, + float output_scale, + float rSpaceOrg, + int width, + int height, + int pStride + ) +{ + int gidz = get_global_id(1); + int2 coord = (int2)(get_global_id(0), gidz); + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 1); + + float4 gamma = read_imagef(scale, coord_para.xy); + float4 beta = read_imagef(bias, coord_para.xy); + float4 mean_vari = read_imagef(meanVari, coord_para.zy); + mean_vari.y = read_imagef(meanVari, coord_para.wy).x; + float4 data = convert_float4(read_imagei(input, coord)); + + float scale_vari, bias_val; + + scale_vari = gamma.s0 * mean_vari.s1; + float alpha = input_scale * output_scale * scale_vari; + bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale; + + int4 dst; + float4 norm; + norm.x = data.x * alpha + bias_val; + dst = convert_int4_rte(norm); + write_imagei(output, coord, dst); +} + +__kernel void group_norm_I32toF32( + __read_only image2d_array_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __read_only image2d_t 
meanVari, + __write_only image2d_array_t output, + float eps, + int is2d, + float input_zp, + float input_scale, + float output_zp, + float output_scale, + float rSpaceOrg, + int width, + int height, + int pStride + ) +{ + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 1); + + float4 gamma = read_imagef(scale, coord_para.xy); + float4 beta = read_imagef(bias, coord_para.xy); + float4 mean_vari = read_imagef(meanVari, coord_para.zy); + mean_vari.y = read_imagef(meanVari, coord_para.wy).x; + float4 data = convert_float4(read_imagei(input, coord)); + + float scale_vari, bias_val; + + scale_vari = gamma.s0 * mean_vari.s1; + float alpha = input_scale * scale_vari; + bias_val = (beta.s0 - scale_vari * mean_vari.s0); + + float4 norm; + norm.x = data.x * alpha + bias_val; + write_imagef(output, coord, norm); +} + +__kernel void group_norm_I32toF32_2D( + __read_only image2d_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __read_only image2d_t meanVari, + __write_only image2d_t output, + float eps, + int is2d, + float input_zp, + float input_scale, + float output_zp, + float output_scale, + float rSpaceOrg, + int width, + int height, + int pStride + ) +{ + int gidz = get_global_id(1); + int2 coord = (int2)(get_global_id(0), gidz); + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 1); + + float4 gamma = read_imagef(scale, coord_para.xy); + float4 beta = read_imagef(bias, coord_para.xy); + float4 mean_vari = read_imagef(meanVari, coord_para.zy); + mean_vari.y = read_imagef(meanVari, coord_para.wy).x; + float4 data = convert_float4(read_imagei(input, coord)); + + float scale_vari, bias_val; + + scale_vari = gamma.s0 * mean_vari.s1; + float alpha = input_scale * scale_vari; + bias_val = beta.s0 - scale_vari * mean_vari.s0; + + float4 norm; + norm.x = data.x * alpha + bias_val; + write_imagef(output, coord, norm); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/group_normalization_u8.cl b/src/tim/vx/internal/src/libnnext/ops/cl/group_normalization_u8.cl new file mode 100644 index 0000000..a7ccd60 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/group_normalization_u8.cl @@ -0,0 +1,287 @@ +__kernel void group_norm_sumsqr_U8( + __read_only image2d_array_t input, + __write_only image2d_t output, + float eps, + int is2d, + float input_zp, + float input_scale, + int width, + int height + ) +{ + int gidx = get_global_id(0); + int gidz = get_global_id(1); + int lidx = get_local_id(0); + + int4 coord = (int4)(gidx, 0, gidz, 0); + float4 data; + float sum = 0, sqr = 0; + float tmpSum = 0, tmpSqr = 0; + float e2InScale = input_scale * input_scale; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + if(gidx < width) + { + for(coord.y = 0; coord.y < height;) + { + data = convert_float4(read_imageui(input, coord)); + coord.y++; + tmpSum += data.x; + tmpSqr += data.x * data.x; + } + sqr = (tmpSqr - 2 * input_zp * tmpSum + height * input_zp * input_zp) * e2InScale; + sum = (tmpSum - height * input_zp) * input_scale; + } + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + 
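+        /* Fold the 16 staged partials, viewed as four float4 vectors, into
+         * scalar sum / sum-of-squares; the U8 zero-point and input scale
+         * have already been applied before the values were staged. */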
{ + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 dst = (float4)(0); + dst.x = sum; + write_imagef(output, coord_out.xy, dst); + coord_out.x++; + dst.x = sqr; + write_imagef(output, coord_out.xy, dst); + } +} + +__kernel void group_norm_sumsqr_U8_2D( + __read_only image2d_t input, + __write_only image2d_t output, + float eps, + int is2d, + float input_zp, + float input_scale, + int width, + int height + ) +{ + int gidx = get_global_id(0); + int gidz = get_global_id(1); + int lidx = get_local_id(0); + + int2 coord = (int2)(gidx, gidz); + float4 data; + float sum = 0, sqr = 0; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + if(gidx < width) + { + data = convert_float4(read_imageui(input, coord)); + sum = (data.x - input_zp) * input_scale; + sqr = sum * sum; + } + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 dst = (float4)(0); + dst.x = sum; + write_imagef(output, coord_out.xy, dst); + coord_out.x++; + dst.x = sqr; + write_imagef(output, coord_out.xy, dst); + } +} + +__kernel void group_norm_U8toU8( + __read_only image2d_array_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __read_only image2d_t meanVari, + __write_only image2d_array_t output, + float eps, + int is2d, + float input_zp, + float input_scale, + float output_zp, + float output_scale, + float rSpaceOrg, + int width, + int height, + int pStride + ) +{ + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 1); + + float4 gamma = read_imagef(scale, coord_para.xy); + float4 beta = read_imagef(bias, coord_para.xy); + float4 mean_vari = read_imagef(meanVari, coord_para.zy); + mean_vari.y = read_imagef(meanVari, coord_para.wy).x; + float4 data = convert_float4(read_imageui(input, coord)); + + float scale_vari, bias_val; + float scale_inOut = input_scale * output_scale; + + scale_vari = gamma.s0 * mean_vari.s1; + float alpha = scale_inOut * scale_vari; + bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; + + uint4 dst; + data.x -= input_zp; + float4 norm; + norm.x = data.x * alpha + bias_val; + dst = convert_uint4_rte(norm); + write_imageui(output, coord, dst); +} + +__kernel void group_norm_U8toU8_2D( + __read_only image2d_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __read_only image2d_t meanVari, + __write_only image2d_t output, + float eps, + int is2d, + float input_zp, + float input_scale, + float output_zp, + float output_scale, + float rSpaceOrg, + int width, + int height, + int pStride + ) +{ + int gidz = get_global_id(1); + int2 coord = (int2)(get_global_id(0), gidz); + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 1); + + float4 gamma = read_imagef(scale, coord_para.xy); + float4 beta = read_imagef(bias, coord_para.xy); + float4 mean_vari = read_imagef(meanVari, coord_para.zy); + mean_vari.y = read_imagef(meanVari, coord_para.wy).x; + float4 data = convert_float4(read_imageui(input, coord)); + + float scale_vari, bias_val; + float 
scale_inOut = input_scale * output_scale; + + scale_vari = gamma.s0 * mean_vari.s1; + float alpha = scale_inOut * scale_vari; + bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; + + uint4 dst; + data.x -= input_zp; + float4 norm; + norm.x = data.x * alpha + bias_val; + dst = convert_uint4_rte(norm); + write_imageui(output, coord, dst); +} + +__kernel void group_norm_U8toF32( + __read_only image2d_array_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __read_only image2d_t meanVari, + __write_only image2d_array_t output, + float eps, + int is2d, + float input_zp, + float input_scale, + float output_zp, + float output_scale, + float rSpaceOrg, + int width, + int height, + int pStride + ) +{ + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 1); + + float4 gamma = read_imagef(scale, coord_para.xy); + float4 beta = read_imagef(bias, coord_para.xy); + float4 mean_vari = read_imagef(meanVari, coord_para.zy); + mean_vari.y = read_imagef(meanVari, coord_para.wy).x; + float4 data = convert_float4(read_imageui(input, coord)); + + float scale_vari, bias_val; + float scale_inOut = input_scale * output_scale; + + scale_vari = gamma.s0 * mean_vari.s1; + float alpha = scale_inOut * scale_vari; + bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; + + data.x -= input_zp; + float4 norm; + norm.x = data.x * alpha + bias_val; + write_imagef(output, coord, norm); +} + +__kernel void group_norm_U8toF32_2D( + __read_only image2d_t input, + __read_only image2d_t bias, + __read_only image2d_t scale, + __read_only image2d_t meanVari, + __write_only image2d_t output, + float eps, + int is2d, + float input_zp, + float input_scale, + float output_zp, + float output_scale, + float rSpaceOrg, + int width, + int height, + int pStride + ) +{ + int gidz = get_global_id(1); + int2 coord = (int2)(get_global_id(0), gidz); + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 1); + + float4 gamma = read_imagef(scale, coord_para.xy); + float4 beta = read_imagef(bias, coord_para.xy); + float4 mean_vari = read_imagef(meanVari, coord_para.zy); + mean_vari.y = read_imagef(meanVari, coord_para.wy).x; + float4 data = convert_float4(read_imageui(input, coord)); + + float scale_vari, bias_val; + float scale_inOut = input_scale * output_scale; + + scale_vari = gamma.s0 * mean_vari.s1; + float alpha = scale_inOut * scale_vari; + bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; + + data.x -= input_zp; + float4 norm; + norm.x = data.x * alpha + bias_val; + write_imagef(output, coord, norm); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul.cl b/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul.cl index 70a81da..9efcd9e 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul.cl @@ -143,7 +143,7 @@ __kernel void gemm_transb_F32F32toF32_3D( coord_a.x = get_global_id(0); coord_a.z = get_global_id(2); - write_imagef(output, coord_b, sum); + write_imagef(output, coord_a, sum); } __kernel void gemm_transb_F32I8toF32_2D( @@ -219,5 +219,5 @@ __kernel void gemm_transb_F32I8toF32_3D( coord_a.x = get_global_id(0); coord_a.z = get_global_id(2); - write_imagef(output, coord_b, sum); + write_imagef(output, coord_a, sum); } diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/one_hot.cl 
b/src/tim/vx/internal/src/libnnext/ops/cl/one_hot.cl new file mode 100644 index 0000000..d186c41 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/one_hot.cl @@ -0,0 +1,130 @@ +__kernel void one_hot_F32toF32 + ( + __read_only image2d_t input, + __write_only image2d_array_t output, + int depth, + float on_value, + float off_value, + float inputScale, + float inputTail + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + + float4 val = read_imagef(input, coord.xy); + + do + { + float4 dst; + dst.x = convert_int(val.x) == coord.z ? on_value : off_value; + + write_imagef(output, coord.xzyw, dst.xxxx); + + coord.z ++; + } while (coord.z < depth); +} + +__kernel void one_hot_I32toI32 + ( + __read_only image2d_t input, + __write_only image2d_array_t output, + int depth, + int on_value, + int off_value, + float inputScale, + float inputTail + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + + int4 val = read_imagei(input, coord.xy); + + do + { + int4 dst; + dst.x = val.x == coord.z ? on_value : off_value; + + write_imagei(output, coord.xzyw, dst.xxxx); + + coord.z ++; + } while (coord.z < depth); +} + +__kernel void one_hot_I32toU8 + ( + __read_only image2d_t input, + __write_only image2d_array_t output, + int depth, + uint on_value, + uint off_value, + float inputScale, + float inputTail + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + + int4 val = read_imagei(input, coord.xy); + do + { + uint4 dst; + dst.x = val.x == coord.z ? on_value : off_value; + + write_imageui(output, coord.xzyw, dst.xxxx); + + coord.z ++; + } while (coord.z < depth); +} + +__kernel void one_hot_I32toF32 + ( + __read_only image2d_t input, + __write_only image2d_array_t output, + int depth, + float on_value, + float off_value, + float inputScale, + float inputTail + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + + int4 val = read_imagei(input, coord.xy); + + do + { + float4 dst; + dst.x = val.x == coord.z ? on_value : off_value; + + write_imagef(output, coord.xzyw, dst.xxxx); + + coord.z ++; + } while (coord.z < depth); +} + +__kernel void one_hot_U8toU8 + ( + __read_only image2d_t input, + __write_only image2d_array_t output, + int depth, + uint on_value, + uint off_value, + float inputScale, + float inputTail + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + + uint4 src = read_imageui(input, coord.xy); + + int val = convert_int(convert_float(src.x) * inputScale - inputTail); + + do + { + uint4 dst; + dst.x = val == coord.z ? 
on_value : off_value; + + write_imageui(output, coord.xzyw, dst.xxxx); + + coord.z ++; + } while (coord.z < depth); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/repeat.cl b/src/tim/vx/internal/src/libnnext/ops/cl/repeat.cl new file mode 100644 index 0000000..2492a9c --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/repeat.cl @@ -0,0 +1,176 @@ +__kernel void repeat_I32_axis0( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int width, int height, int channel, int axis) +{ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0); + int4 coord_out = coord; + + for(coord.y = 0; coord.y < height;) + { + int4 data = read_imagei(input0, coord); + int4 len = read_imagei(input1, coord.yw); + coord.y++; + for(int i = 0; i < len.x; i++) + { + write_imagei(output, coord_out, data); + coord_out.y++; + } + } +} + +__kernel void repeat_I32_axis1( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int width, int height, int channel, int axis) +{ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0); + int4 coord_out = coord; + + for(coord.x = 0; coord.x < width;) + { + int4 data = read_imagei(input0, coord); + int4 len = read_imagei(input1, coord.xw); + coord.x++; + for(int i = 0; i < len.x; i++) + { + write_imagei(output, coord_out, data); + coord_out.x++; + } + } +} + +__kernel void repeat_I32_axis2( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int width, int height, int channel, int axis) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_out = coord; + + for(coord.z = 0; coord.z < channel;) + { + int4 data = read_imagei(input0, coord); + int4 len = read_imagei(input1, coord.zw); + coord.z++; + for(int i = 0; i < len.x; i++) + { + write_imagei(output, coord_out, data); + coord_out.z++; + } + } +} + +__kernel void repeat_I32_1D( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int width, int height, int channel, int axis) +{ + int2 coord = (int2)(0, 0); + int2 coord_out = coord; + + for(coord.x = 0; coord.x < width;) + { + int4 data = read_imagei(input0, coord); + int4 len = read_imagei(input1, coord.xy); + coord.x++; + for(int i = 0; i < len.x; i++) + { + write_imagei(output, coord_out, data); + coord_out.x++; + } + } +} + +__kernel void repeat_F32_axis0( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int width, int height, int channel, int axis) +{ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0); + int4 coord_out = coord; + + for(coord.y = 0; coord.y < height;) + { + float4 data = read_imagef(input0, coord); + int4 len = read_imagei(input1, coord.yw); + coord.y++; + for(int i = 0; i < len.x; i++) + { + write_imagef(output, coord_out, data); + coord_out.y++; + } + } +} + +__kernel void repeat_F32_axis1( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int width, int height, int channel, int axis) +{ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0); + int4 coord_out = coord; + + for(coord.x = 0; coord.x < width;) + { + float4 data = read_imagef(input0, coord); + int4 len = read_imagei(input1, coord.xw); + coord.x++; + for(int i = 0; i < len.x; i++) + { + write_imagef(output, coord_out, data); + coord_out.x++; + } + } +} + +__kernel void 
repeat_F32_axis2( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + int width, int height, int channel, int axis) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_out = coord; + + for(coord.z = 0; coord.z < channel;) + { + float4 data = read_imagef(input0, coord); + int4 len = read_imagei(input1, coord.zw); + coord.z++; + for(int i = 0; i < len.x; i++) + { + write_imagef(output, coord_out, data); + coord_out.z++; + } + } +} + +__kernel void repeat_F32_1D( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int width, int height, int channel, int axis) +{ + int2 coord = (int2)(0, 0); + int2 coord_out = coord; + + for(coord.x = 0; coord.x < width;) + { + float4 data = read_imagef(input0, coord); + int4 len = read_imagei(input1, coord.xy); + coord.x++; + for(int i = 0; i < len.x; i++) + { + write_imagef(output, coord_out, data); + coord_out.x++; + } + } +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/sequence_mask.cl b/src/tim/vx/internal/src/libnnext/ops/cl/sequence_mask.cl new file mode 100644 index 0000000..4813eb7 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/sequence_mask.cl @@ -0,0 +1,72 @@ + +__kernel void sequence_mask_I32toU8( + image2d_t input, image2d_array_t output, int maxLen, + float input_scale, float input_zpScale, float outputVal1, int output_ZP) +{ + int gidx = get_global_id(0); + int4 coord = (int4)(gidx, get_global_id(1), get_global_id(2), 0); + int4 index = read_imagei(input, coord.yz); + uint4 data; + data.x = gidx < index.x ? convert_uint_rte(outputVal1) : (uint)(output_ZP); + write_imageui(output, coord, data); +} + +__kernel void sequence_mask_I32toU8_2D( + image2d_t input, image2d_t output, int maxLen, + float input_scale, float input_zpScale, float outputVal1, int output_ZP) +{ + int gidx = get_global_id(0); + int2 coord = (int2)(gidx, get_global_id(1)); + int4 index = read_imagei(input, coord.yy); + uint4 data; + data.x = gidx < index.x ? convert_uint_rte(outputVal1) : (uint)(output_ZP); + write_imageui(output, coord, data); +} + +__kernel void sequence_mask_I32toI32( + image2d_t input, image2d_array_t output, int maxLen, + float input_scale, float input_zpScale, float outputVal1, int output_ZP) +{ + int gidx = get_global_id(0); + int4 coord = (int4)(gidx, get_global_id(1), get_global_id(2), 0); + int4 index = read_imagei(input, coord.yz); + int4 data; + data = gidx < index.x ? (int4)(1) : (int4)(0); + write_imagei(output, coord, data); +} + +__kernel void sequence_mask_I32toI32_2D( + image2d_t input, image2d_t output, int maxLen, + float input_scale, float input_zpScale, float outputVal1, int output_ZP) +{ + int gidx = get_global_id(0); + int2 coord = (int2)(gidx, get_global_id(1)); + int4 index = read_imagei(input, coord.yy); + int4 data; + data = gidx < index.x ? (int4)(1) : (int4)(0); + write_imagei(output, coord, data); +} + +__kernel void sequence_mask_I32toF32( + image2d_t input, image2d_array_t output, int maxLen, + float input_scale, float input_zpScale, float outputVal1, int output_ZP) +{ + int gidx = get_global_id(0); + int4 coord = (int4)(gidx, get_global_id(1), get_global_id(2), 0); + int4 index = read_imagei(input, coord.yz); + float4 data; + data = gidx < index.x ? 
(float4)(1.0f) : (float4)(0.0f); + write_imagef(output, coord, data); +} + +__kernel void sequence_mask_I32toF32_2D( + image2d_t input, image2d_t output, int maxLen, + float input_scale, float input_zpScale, float outputVal1, int output_ZP) +{ + int gidx = get_global_id(0); + int2 coord = (int2)(gidx, get_global_id(1)); + int4 index = read_imagei(input, coord.yy); + float4 data; + data = gidx < index.x ? (float4)(1.0f) : (float4)(0.0f); + write_imagef(output, coord, data); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/slice.cl b/src/tim/vx/internal/src/libnnext/ops/cl/slice.cl new file mode 100644 index 0000000..764aca2 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/slice.cl @@ -0,0 +1,144 @@ +__kernel void slice_F32_I32toF32 + ( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + float inputScale, + float inputTail, + float outputScale, + float outputZP + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_in; + Image begin_img = create_image_from_image2d(input1, 4); + uchar* begin_ptr = begin_img.ptr; + int4 begin = ((int4 *)begin_ptr)[0]; + + coord_in = coord + begin; + float4 src = read_imagef(input0, coord_in); + + write_imagef(output, coord, src); +} + +__kernel void slice_F32_I32toF32_2D + ( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + float inputScale, + float inputTail, + float outputScale, + float outputZP + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int2 coord_in; + Image begin_img = create_image_from_image2d(input1, 4); + uchar* begin_ptr = begin_img.ptr; + int2 begin = ((int2 *)begin_ptr)[0]; + + coord_in = coord + begin; + float4 src = read_imagef(input0, coord_in); + + write_imagef(output, coord, src); +} + +__kernel void slice_U8_I32toU8 + ( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + float inputScale, + float inputTail, + float outputScale, + float outputZP + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_in; + Image begin_img = create_image_from_image2d(input1, 4); + uchar* begin_ptr = begin_img.ptr; + int4 begin = ((int4 *)begin_ptr)[0]; + + coord_in = coord + begin; + uint4 src = read_imageui(input0, coord_in); + + float4 data = convert_float4(src) * inputScale - inputTail; + uint4 dst = convert_uint4(data * outputScale + outputZP); + + write_imageui(output, coord, dst); +} + +__kernel void slice_U8_I32toU8_2D + ( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + float inputScale, + float inputTail, + float outputScale, + float outputZP + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int2 coord_in; + Image begin_img = create_image_from_image2d(input1, 4); + uchar* begin_ptr = begin_img.ptr; + int2 begin = ((int2 *)begin_ptr)[0]; + + coord_in = coord + begin; + uint4 src = read_imageui(input0, coord_in); + + float4 data = convert_float4(src) * inputScale - inputTail; + uint4 dst = convert_uint4(data * outputScale + outputZP); + + write_imageui(output, coord, dst); +} + +__kernel void slice_I32_I32toI32 + ( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_array_t output, + float inputScale, + float inputTail, + float outputScale, + float outputZP + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 
get_global_id(2), 0); + int4 coord_in; + Image begin_img = create_image_from_image2d(input1, 4); + uchar* begin_ptr = begin_img.ptr; + int4 begin = ((int4 *)begin_ptr)[0]; + + coord_in = coord + begin; + int4 src = read_imagei(input0, coord_in); + + write_imagei(output, coord, src); +} + +__kernel void slice_I32_I32toI32_2D + ( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + float inputScale, + float inputTail, + float outputScale, + float outputZP + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + int2 coord_in; + Image begin_img = create_image_from_image2d(input1, 4); + uchar* begin_ptr = begin_img.ptr; + int2 begin = ((int2 *)begin_ptr)[0]; + + coord_in = coord + begin; + int4 src = read_imagei(input0, coord_in); + + write_imagei(output, coord, src); +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_axis_aligned_bbox_transform.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_axis_aligned_bbox_transform.c deleted file mode 100644 index 2755dc8..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_axis_aligned_bbox_transform.c +++ /dev/null @@ -1,275 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. 
-* -*****************************************************************************/ -#include -#include -#include -#include -#include -#include - -#include "vsi_nn_platform.h" - -#include "vsi_nn_prv.h" -#include "vsi_nn_log.h" -#include "vsi_nn_test.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "utils/vsi_nn_dtype_util.h" -#include "utils/vsi_nn_math.h" -#include "client/vsi_nn_vxkernel.h" -#include "libnnext/vx_lib_nnext.h" - -#define _VX_KERNEL_VAR (vx_kernel_AXIS_ALIGNED_BBOX_TRANSFORM) -#define _VX_KERNEL_ID (VX_KERNEL_ENUM_AXIS_ALIGNED_BBOX_TRANSFORM) -#define _VX_KERNEL_NAME (VX_KERNEL_NAME_AXIS_ALIGNED_BBOX_TRANSFORM) -#define _VX_KERNEL_FUNC_KERNEL (vxAxis_aligned_bbox_transformKernel) - -typedef struct -{ - float x1, y1, x2, y2; -}BoxEncodingCorner; -typedef struct -{ - float w, h, x, y; -}BoxEncodingCenter; - -void toBoxEncodingCorner - ( - BoxEncodingCenter* ctr, - BoxEncodingCorner* cnr - ) -{ - cnr->x1 = ctr->x - ctr->w / 2; - cnr->y1 = ctr->y - ctr->h / 2; - cnr->x2 = ctr->x + ctr->w / 2; - cnr->y2 = ctr->y + ctr->h / 2; -} - -void toBoxEncodingCenter - ( - BoxEncodingCorner* cnr, - BoxEncodingCenter* ctr - ) -{ - ctr->w = cnr->x2 - cnr->x1; - ctr->h = cnr->y2 - cnr->y1; - ctr->x = (cnr->x1 + cnr->x2) / 2; - ctr->y = (cnr->y1 + cnr->y2) / 2; -} - -static vsi_status VX_CALLBACK vxAxis_aligned_bbox_transformKernel - ( - vx_node node, - const vx_reference* paramObj, - uint32_t paramNum - ) -{ -#define ARG_NUM (0) -#define TENSOR_NUM_INPUT (4) -#define TENSOR_NUM_OUTPUT (1) -#define TENSOR_NUM (TENSOR_NUM_INPUT+TENSOR_NUM_OUTPUT) - - vsi_status status = VSI_FAILURE; - vx_context context = NULL; - vx_tensor input[TENSOR_NUM_INPUT] = {0}; - vx_tensor output[TENSOR_NUM_OUTPUT] = {0}; - float *f32_in_buffer[TENSOR_NUM_INPUT] = {0}; - int32_t* int32_in_buffer[TENSOR_NUM_INPUT] = {0}; - float *f32_out_buffer[TENSOR_NUM_OUTPUT] = {0}; - vsi_nn_tensor_attr_t in_attr[TENSOR_NUM_INPUT]; - vsi_nn_tensor_attr_t out_attr[TENSOR_NUM_OUTPUT]; - uint32_t in_elements[TENSOR_NUM_INPUT] = {0}; - uint32_t out_elements[TENSOR_NUM_OUTPUT]= {0}; - - int32_t i; - for(i = 0; i < TENSOR_NUM_INPUT; i++) - { - memset(&in_attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t)); - } - for(i = 0; i < TENSOR_NUM_OUTPUT; i++) - { - memset(&out_attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t)); - } - /* prepare data */ - context = vxGetContext((vx_reference)node); - - for(i = 0; i < TENSOR_NUM_INPUT; i ++) - { - input[i] = (vx_tensor)paramObj[i]; - status = vsi_nn_vxGetTensorAttr(input[i], &in_attr[i]); - TEST_CHECK_STATUS(status, final); - in_elements[i] = vsi_nn_vxGetTensorElementNum(&in_attr[i]); - if (i == 2) - { - int32_in_buffer[i] = (int32_t *)vsi_nn_vxCopyTensorToData(context, - input[i], &in_attr[i]); - } - else - { - f32_in_buffer[i] = (float *)malloc(in_elements[i] * sizeof(float)); - status = vsi_nn_vxConvertTensorToFloat32Data( - context, input[i], &in_attr[i], f32_in_buffer[i], - in_elements[i] * sizeof(float)); - TEST_CHECK_STATUS(status, final); - } - } - for(i = 0; i < TENSOR_NUM_OUTPUT; i ++) - { - output[i] = (vx_tensor)paramObj[i + TENSOR_NUM_INPUT]; - status = vsi_nn_vxGetTensorAttr(output[i], &out_attr[i]); - TEST_CHECK_STATUS(status, final); - out_elements[i] = vsi_nn_vxGetTensorElementNum(&out_attr[i]); - f32_out_buffer[i]= (float *)malloc(out_elements[i] * sizeof(float)); - memset(f32_out_buffer[i], 0, out_elements[i] * sizeof(float)); - } - - /* TODO: Add CPU kernel implement */ - { - const uint32_t roiLength = 4; - const uint32_t imageLength = 2; - - uint32_t numClasses = 
in_attr[1].size[0] / roiLength; - uint32_t numRois = in_attr[0].size[1]; - uint32_t j; - uint32_t roiIndex; - for(roiIndex = 0; roiIndex < numRois; roiIndex++) - { - uint32_t batchIndex = int32_in_buffer[2][roiIndex]; - float imageHeight = f32_in_buffer[3][batchIndex * imageLength]; - float imageWidth = f32_in_buffer[3][batchIndex * imageLength + 1]; - BoxEncodingCorner roi_cnr; - BoxEncodingCenter roiBefore; - roi_cnr.x1 = f32_in_buffer[0][roiIndex * roiLength]; - roi_cnr.y1 = f32_in_buffer[0][roiIndex * roiLength + 1]; - roi_cnr.x2 = f32_in_buffer[0][roiIndex * roiLength + 2]; - roi_cnr.y2 = f32_in_buffer[0][roiIndex * roiLength + 3]; - toBoxEncodingCenter(&roi_cnr, &roiBefore); - for (j = 0; j < numClasses; j++) - { - BoxEncodingCenter roi_ctr; - BoxEncodingCorner roiAfter; - BoxEncodingCorner cliped; - uint32_t index = (roiIndex * numClasses + j) * roiLength; - roi_ctr.w = (float)(exp(f32_in_buffer[1][index + 2]) * roiBefore.w); - roi_ctr.h = (float)(exp(f32_in_buffer[1][index + 3]) * roiBefore.h); - roi_ctr.x = roiBefore.x + f32_in_buffer[1][index] * roiBefore.w; - roi_ctr.y = roiBefore.y + f32_in_buffer[1][index + 1] * roiBefore.h; - toBoxEncodingCorner(&roi_ctr, &roiAfter); - cliped.x1 = vsi_nn_min(vsi_nn_max(roiAfter.x1, 0.0f), imageWidth); - cliped.y1 = vsi_nn_min(vsi_nn_max(roiAfter.y1, 0.0f), imageHeight); - cliped.x2 = vsi_nn_min(vsi_nn_max(roiAfter.x2, 0.0f), imageWidth); - cliped.y2 = vsi_nn_min(vsi_nn_max(roiAfter.y2, 0.0f), imageHeight); - f32_out_buffer[0][index] = cliped.x1; - f32_out_buffer[0][index + 1] = cliped.y1; - f32_out_buffer[0][index + 2] = cliped.x2; - f32_out_buffer[0][index + 3] = cliped.y2; - } - } - } - - /* save data */ - for(i = 0; i < TENSOR_NUM_OUTPUT; i++) - { - status = vsi_nn_vxConvertFloat32DataToTensor( - context, output[i], &out_attr[i], f32_out_buffer[i], - out_elements[i] * sizeof(float)); - TEST_CHECK_STATUS(status, final); - } - -final: - for (i = 0; i < TENSOR_NUM_INPUT; i++) - { - if (f32_in_buffer[i]) free(f32_in_buffer[i]); - if (int32_in_buffer[i]) free(int32_in_buffer[i]); - } - for(i = 0; i < TENSOR_NUM_OUTPUT; i++) - { - if (f32_out_buffer[i]) free(f32_out_buffer[i]); - } - return status; -} /* _VX_KERNEL_FUNC_KERNEL() */ - -static vx_param_description_t vxAxis_aligned_bbox_transformKernelParam[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, -}; - -vx_status VX_CALLBACK vxAxis_aligned_bbox_transformInitializer - ( - vx_node nodObj, - const vx_reference *paramObj, - vx_uint32 paraNum - ) -{ - vx_status status = VX_SUCCESS; - /*TODO: Add initial code for VX program*/ - - return status; -} - - -#ifdef __cplusplus -extern "C" { -#endif -vx_kernel_description_t vxAxis_aligned_bbox_transform_CPU = -{ - _VX_KERNEL_ID, - _VX_KERNEL_NAME, - _VX_KERNEL_FUNC_KERNEL, - vxAxis_aligned_bbox_transformKernelParam, - _cnt_of_array( vxAxis_aligned_bbox_transformKernelParam ), - vsi_nn_KernelValidator, - NULL, - NULL, - vsi_nn_KernelInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxAxis_aligned_bbox_transform_VX = -{ - _VX_KERNEL_ID, - _VX_KERNEL_NAME, - NULL, - vxAxis_aligned_bbox_transformKernelParam, - _cnt_of_array( vxAxis_aligned_bbox_transformKernelParam ), - vsi_nn_KernelValidator, - NULL, - NULL, - vxAxis_aligned_bbox_transformInitializer, - vsi_nn_KernelDeinitializer -}; 
- -vx_kernel_description_t * vx_kernel_AXIS_ALIGNED_BBOX_TRANSFORM_list[] = -{ - &vxAxis_aligned_bbox_transform_CPU, - &vxAxis_aligned_bbox_transform_VX, - NULL -}; -#ifdef __cplusplus -} -#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_box_with_nms_limit.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_box_with_nms_limit.c index 8114caf..f4b6949 100644 --- a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_box_with_nms_limit.c +++ b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_box_with_nms_limit.c @@ -38,7 +38,7 @@ #include "utils/vsi_nn_dtype_util.h" #include "utils/vsi_nn_math.h" #include "utils/vsi_nn_link_list.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "libnnext/vx_lib_nnext.h" #define _VX_KERNEL_VAR (vx_kernel_BOX_WITH_NMS_LIMIT) diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_extra_ending.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_extra_ending.c index 34de7f7..f14c2f6 100644 --- a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_extra_ending.c +++ b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_extra_ending.c @@ -35,7 +35,7 @@ #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_dtype_util.h" #include "utils/vsi_nn_math.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "libnnext/vx_lib_nnext.h" #define _VX_KERNEL_VAR (vx_kernel_EXTRA_ENDING) diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_generate_proposals.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_generate_proposals.c deleted file mode 100644 index 0c2b948..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_generate_proposals.c +++ /dev/null @@ -1,483 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. 
-* -*****************************************************************************/ -#include -#include -#include -#include -#include -#include - -#include "vsi_nn_platform.h" - -#include "vsi_nn_prv.h" -#include "vsi_nn_log.h" -#include "vsi_nn_test.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "utils/vsi_nn_dtype_util.h" -#include "utils/vsi_nn_math.h" -#include "client/vsi_nn_vxkernel.h" -#include "libnnext/vx_lib_nnext.h" - -#define _VX_KERNEL_VAR (vx_kernel_GENERATE_PROPOSALS) -#define _VX_KERNEL_ID (VX_KERNEL_ENUM_GENERATE_PROPOSALS) -#define _VX_KERNEL_NAME (VX_KERNEL_NAME_GENERATE_PROPOSALS) -#define _VX_KERNEL_FUNC_KERNEL (vxGenerate_proposalsKernel) - -typedef struct -{ - float x1, y1, x2, y2; -}BoxEncodingCorner; -typedef struct -{ - float w, h, x, y; -}BoxEncodingCenter; - -// toBoxEncodingCorner is implemented in vsi_nn_kernel_box_with_nms_limit.c -void toBoxEncodingCorner - ( - BoxEncodingCenter* ctr, - BoxEncodingCorner* cnr - ); - -// toBoxEncodingCenter is implemented in vsi_nn_kernel_box_with_nms_limit.c -void toBoxEncodingCenter - ( - BoxEncodingCorner* cnr, - BoxEncodingCenter* ctr - ); - -// iota is implemented in vsi_nn_kernel_detection_postprocess.c -static void _iota - ( - int32_t * data, - uint32_t len, - int32_t value - ) -{ - uint32_t i; - for (i = 0; i < len; i++) - { - data [i] = value; - value++; - } -} - -// swap_element is implemented in vsi_nn_kernel_box_with_nms_limit.c -void swap_element - ( - uint32_t* list, - uint32_t first, - uint32_t second - ); - -// max_element is implemented in vsi_nn_kernel_box_with_nms_limit.c -uint32_t max_element - ( - float* data, - uint32_t* index_list, - uint32_t len - ); - -// getIoUAxisAligned is implemented in vsi_nn_kernel_box_with_nms_limit.c -float getIoUAxisAligned - ( - const float* roi1, - const float* roi2 - ); - -// sort_element_by_score is implemented in vsi_nn_kernel_box_with_nms_limit.c -void sort_element_by_score - ( - float* data, - uint32_t* index_list, - uint32_t len - ); - -void filterBoxes - ( - const float* roiBase, - const float* imageInfoBase, - float minSize, - uint32_t* select, - uint32_t* len - ) -{ - const uint32_t kRoiDim = 4; - uint32_t i = 0; - uint32_t j; - for(j = 0; j < *len; j++) - { - const float* roiInfo = roiBase + select[j] * kRoiDim; - float roiWidth, roiHeight, xRoiCenter, yRoiCenter; - roiWidth = roiInfo[2] - roiInfo[0]; - roiHeight = roiInfo[3] - roiInfo[1]; - xRoiCenter = roiInfo[0] + roiWidth / 2.0f; - yRoiCenter = roiInfo[1] + roiHeight / 2.0f; - if(roiWidth > minSize && roiHeight > minSize && xRoiCenter < imageInfoBase[1] - && yRoiCenter < imageInfoBase[0]) - { - select[i] = select[j]; - i++; - } - } - *len = i; -} - -static vsi_status VX_CALLBACK vxGenerate_proposalsKernel - ( - vx_node node, - const vx_reference* paramObj, - uint32_t paramNum - ) -{ -#define ARG_NUM (6) -#define TENSOR_NUM_INPUT (4) -#define TENSOR_NUM_OUTPUT (3) -#define TENSOR_NUM (TENSOR_NUM_INPUT+TENSOR_NUM_OUTPUT) - - vsi_status status = VSI_FAILURE; - vx_context context = NULL; - vx_tensor input[TENSOR_NUM_INPUT] = {0}; - vx_tensor output[TENSOR_NUM_OUTPUT] = {0}; - float *f32_in_buffer[TENSOR_NUM_INPUT] = {0}; - float *f32_out_buffer[TENSOR_NUM_OUTPUT] = {0}; - int32_t* int32_out_buffer[TENSOR_NUM_OUTPUT] = {0}; - vsi_nn_tensor_attr_t in_attr[TENSOR_NUM_INPUT]; - vsi_nn_tensor_attr_t out_attr[TENSOR_NUM_OUTPUT]; - uint32_t in_elements[TENSOR_NUM_INPUT] = {0}; - uint32_t out_elements[TENSOR_NUM_OUTPUT]= {0}; - - float heightStride; - float widthStride; - int32_t preNmsTopN; 
- int32_t postNmsTopN; - float iouThreshold; - float minSize; - - uint32_t i = 0; - for(i = 0; i < TENSOR_NUM_INPUT; i++) - { - memset(&in_attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t)); - } - for(i = 0; i < TENSOR_NUM_OUTPUT; i++) - { - memset(&out_attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t)); - } - /* prepare data */ - context = vxGetContext((vx_reference)node); - - for(i = 0; i < TENSOR_NUM_INPUT; i ++) - { - input[i] = (vx_tensor)paramObj[i]; - status = vsi_nn_vxGetTensorAttr(input[i], &in_attr[i]); - TEST_CHECK_STATUS(status, final); - in_elements[i] = vsi_nn_vxGetTensorElementNum(&in_attr[i]); - f32_in_buffer[i] = (float *)malloc(in_elements[i] * sizeof(float)); - status = vsi_nn_vxConvertTensorToFloat32Data( - context, input[i], &in_attr[i], f32_in_buffer[i], - in_elements[i] * sizeof(float)); - TEST_CHECK_STATUS(status, final); - } - for(i = 0; i < TENSOR_NUM_OUTPUT; i ++) - { - output[i] = (vx_tensor)paramObj[i + TENSOR_NUM_INPUT]; - status = vsi_nn_vxGetTensorAttr(output[i], &out_attr[i]); - TEST_CHECK_STATUS(status, final); - out_elements[i] = vsi_nn_vxGetTensorElementNum(&out_attr[i]); - if(i < 2) - { - f32_out_buffer[i] = (float *)malloc(out_elements[i] * sizeof(float)); - memset(f32_out_buffer[i], 0, out_elements[i] * sizeof(float)); - } - else - { - int32_out_buffer[i] = (int32_t *)malloc(out_elements[i] * sizeof(int32_t)); - memset(int32_out_buffer[i], 0, out_elements[i] * sizeof(int32_t)); - } - } - vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM], &(heightStride), - VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 1], &(widthStride), - VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 2], &(preNmsTopN), - VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 3], &(postNmsTopN), - VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 4], &(iouThreshold), - VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM + 5], &(minSize), - VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - - /* TODO: Add CPU kernel implement */ - { - uint32_t h, w, a, b, j; - const uint32_t kRoiDim = 4; - uint32_t numBatches = in_attr[0].size[3]; - uint32_t height = in_attr[0].size[2]; - uint32_t width = in_attr[0].size[1]; - uint32_t numAnchors = in_attr[0].size[0]; - uint32_t imageInfoLength = in_attr[3].size[0]; - - uint32_t batchSize = height * width * numAnchors; - uint32_t roiBufferSize = batchSize * kRoiDim; - - float * roiBuffer = (float*)malloc(roiBufferSize * sizeof(float)); - float * roiTransformedBuffer = (float*)malloc(roiBufferSize * sizeof(float)); - uint32_t* select = (uint32_t*)malloc(batchSize * sizeof(uint32_t)); - uint32_t index = 0; - uint32_t scores_index = 0; - uint32_t bboxDeltas_index = 0; - uint32_t imageInfo_index = 0; - uint32_t scores_out_index = 0; - uint32_t roi_out_index = 0; - - // Compute the roi region for each anchor. 
-        for(h = 0; h < height; h++)
-        {
-            float hShift = h * heightStride;
-            for(w = 0; w < width; w++)
-            {
-                float wShift = w * widthStride;
-                uint32_t anchor_index = 0;
-                for(a = 0; a < numAnchors; a++)
-                {
-                    roiBuffer[index] = f32_in_buffer[2][anchor_index] + wShift;
-                    roiBuffer[index + 1] = f32_in_buffer[2][anchor_index + 1] + hShift;
-                    roiBuffer[index + 2] = f32_in_buffer[2][anchor_index + 2] + wShift;
-                    roiBuffer[index + 3] = f32_in_buffer[2][anchor_index + 3] + hShift;
-
-                    index += kRoiDim;
-                    anchor_index += kRoiDim;
-                }
-            }
-        }
-
-        for(b = 0; b < numBatches; b++)
-        {
-            const uint32_t roiLength = 4;
-
-            uint32_t numRois = batchSize;
-            uint32_t roiIndex;
-            uint32_t select_len;
-            int32_t numDetections = 0;
-            for(roiIndex = 0; roiIndex < numRois; roiIndex++)
-            {
-                float imageHeight = f32_in_buffer[3][imageInfo_index];
-                float imageWidth = f32_in_buffer[3][imageInfo_index + 1];
-                BoxEncodingCorner roi_cnr;
-                BoxEncodingCenter roiBefore;
-                roi_cnr.x1 = roiBuffer[roiIndex * roiLength];
-                roi_cnr.y1 = roiBuffer[roiIndex * roiLength + 1];
-                roi_cnr.x2 = roiBuffer[roiIndex * roiLength + 2];
-                roi_cnr.y2 = roiBuffer[roiIndex * roiLength + 3];
-                toBoxEncodingCenter(&roi_cnr, &roiBefore);
-                {
-                    BoxEncodingCenter roi_ctr;
-                    BoxEncodingCorner roiAfter;
-                    BoxEncodingCorner cliped;
-                    uint32_t idx = bboxDeltas_index + roiIndex * roiLength;
-                    roi_ctr.w = (float)(exp(f32_in_buffer[1][idx + 2]) * roiBefore.w);
-                    roi_ctr.h = (float)(exp(f32_in_buffer[1][idx + 3]) * roiBefore.h);
-                    roi_ctr.x = roiBefore.x + f32_in_buffer[1][idx] * roiBefore.w;
-                    roi_ctr.y = roiBefore.y + f32_in_buffer[1][idx + 1] * roiBefore.h;
-                    toBoxEncodingCorner(&roi_ctr, &roiAfter);
-                    cliped.x1 = vsi_nn_min(vsi_nn_max(roiAfter.x1, 0.0f), imageWidth);
-                    cliped.y1 = vsi_nn_min(vsi_nn_max(roiAfter.y1, 0.0f), imageHeight);
-                    cliped.x2 = vsi_nn_min(vsi_nn_max(roiAfter.x2, 0.0f), imageWidth);
-                    cliped.y2 = vsi_nn_min(vsi_nn_max(roiAfter.y2, 0.0f), imageHeight);
-                    roiTransformedBuffer[idx] = cliped.x1;
-                    roiTransformedBuffer[idx + 1] = cliped.y1;
-                    roiTransformedBuffer[idx + 2] = cliped.x2;
-                    roiTransformedBuffer[idx + 3] = cliped.y2;
-                }
-            }
-
-            // Find the top preNmsTopN scores.
-            _iota((int32_t*)select, batchSize, 0);
-            select_len = batchSize;
-            if(preNmsTopN > 0 && preNmsTopN < (int32_t)batchSize)
-            {
-                sort_element_by_score(&(f32_in_buffer[0][scores_index]),
-                    select, batchSize);
-                select_len = preNmsTopN;
-            }
-
-            // Filter boxes, discard regions with height or width < minSize.
-            filterBoxes(roiTransformedBuffer, &(f32_in_buffer[3][0]),
-                minSize, select, &select_len);
-
-            // Apply hard NMS.
-            if(postNmsTopN < 0)
-            {
-                postNmsTopN = select_len;
-            }
-
-            for(j = 0; (j < select_len && numDetections < postNmsTopN); j++)
-            {
-                // find max score and swap to the front.
-                int32_t max_index = max_element(&(f32_in_buffer[0][scores_index]),
-                    &(select[j]), select_len - j) + j;
-                swap_element(select, max_index, j);
-
-                // Calculate IoU of the rest, swap to the end (discard) if needed.
- for(i = j + 1; i < select_len; i++) - { - int32_t roiBase0 = select[i] * kRoiDim; - int32_t roiBase1 = select[j] * kRoiDim; - float iou = getIoUAxisAligned(&(roiTransformedBuffer[roiBase0]), - &(roiTransformedBuffer[roiBase1])); - - if(iou >= iouThreshold) - { - swap_element(select, i, select_len - 1); - i--; - select_len--; - } - } - numDetections++; - } - - for(i = 0; i < select_len; i++) - { - memcpy(&(f32_out_buffer[1][roi_out_index]), - &(roiTransformedBuffer[select[i] * kRoiDim]), kRoiDim * sizeof(float)); - f32_out_buffer[0][scores_out_index] = - f32_in_buffer[0][scores_index + select[i]]; - int32_out_buffer[2][scores_out_index] = b; - scores_out_index++; - roi_out_index += kRoiDim; - } - - scores_index += batchSize; - bboxDeltas_index += roiBufferSize; - imageInfo_index += imageInfoLength; - } - - vsi_nn_safe_free(roiBuffer); - vsi_nn_safe_free(roiTransformedBuffer); - vsi_nn_safe_free(select); - } - - /* save data */ - for(i = 0; i < TENSOR_NUM_OUTPUT; i++) - { - if(i < 2) - { - status = vsi_nn_vxConvertFloat32DataToTensor( - context, output[i], &out_attr[i], f32_out_buffer[i], - out_elements[i] * sizeof(float)); - TEST_CHECK_STATUS(status, final); - } - else - { - vsi_nn_vxCopyDataToTensor(context, output[i], &out_attr[i], - (uint8_t *)int32_out_buffer[i]); - } - } - -final: - for(i = 0; i < TENSOR_NUM_INPUT; i++) - { - if(f32_in_buffer[i]) free(f32_in_buffer[i]); - } - for(i = 0; i < TENSOR_NUM_OUTPUT; i++) - { - if(f32_out_buffer[i]) free(f32_out_buffer[i]); - if(int32_out_buffer[i]) free(int32_out_buffer[i]); - } - return status; -} /* _VX_KERNEL_FUNC_KERNEL() */ - -static vx_param_description_t vxGenerate_proposalsKernelParam[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; - -vx_status VX_CALLBACK vxGenerate_proposalsInitializer - ( - vx_node nodObj, - const vx_reference *paramObj, - vx_uint32 paraNum - ) -{ - vx_status status = VX_SUCCESS; - /*TODO: Add initial code for VX program*/ - - return status; -} - - -#ifdef __cplusplus -extern "C" { -#endif -vx_kernel_description_t vxGenerate_proposals_CPU = -{ - _VX_KERNEL_ID, - _VX_KERNEL_NAME, - _VX_KERNEL_FUNC_KERNEL, - vxGenerate_proposalsKernelParam, - _cnt_of_array( vxGenerate_proposalsKernelParam ), - vsi_nn_KernelValidator, - NULL, - NULL, - vsi_nn_KernelInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxGenerate_proposals_VX = -{ - _VX_KERNEL_ID, - _VX_KERNEL_NAME, - NULL, - vxGenerate_proposalsKernelParam, - _cnt_of_array( vxGenerate_proposalsKernelParam ), - vsi_nn_KernelValidator, - NULL, - NULL, - vxGenerate_proposalsInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t * vx_kernel_GENERATE_PROPOSALS_list[] = -{ - &vxGenerate_proposals_CPU, - &vxGenerate_proposals_VX, - NULL -}; -#ifdef __cplusplus -} -#endif diff --git 
a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_heatmap_max_keypoint.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_heatmap_max_keypoint.c index fa9537a..e464197 100644 --- a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_heatmap_max_keypoint.c +++ b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_heatmap_max_keypoint.c @@ -37,7 +37,7 @@ #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_dtype_util.h" #include "utils/vsi_nn_math.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "libnnext/vx_lib_nnext.h" #define _VX_KERNEL_VAR (vx_kernel_HEATMAP_MAX_KEYPOINT) diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_imageprocess.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_imageprocess.c index cc99b85..a63cb15 100644 --- a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_imageprocess.c +++ b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_imageprocess.c @@ -35,7 +35,7 @@ #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_dtype_util.h" #include "utils/vsi_nn_math.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "libnnext/vx_lib_nnext.h" #define _VX_KERNEL_VAR (vx_kernel_IMAGEPROCESS) diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_signalframe.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_signalframe.c index a473b6e..0cb39a1 100644 --- a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_signalframe.c +++ b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_signalframe.c @@ -36,7 +36,7 @@ #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_dtype_util.h" #include "utils/vsi_nn_math.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "libnnext/vx_lib_nnext.h" #define INPUT_FP16 0 diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_spatial_transformer.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_spatial_transformer.c index e302139..ffa26dd 100644 --- a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_spatial_transformer.c +++ b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_spatial_transformer.c @@ -35,7 +35,7 @@ #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_dtype_util.h" #include "utils/vsi_nn_math.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "libnnext/vx_lib_nnext.h" #define _VX_KERNEL_VAR (vx_kernel_SPATIAL_TRANSFORMER) diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_sync_host.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_sync_host.c index 9378674..0b4805d 100644 --- a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_sync_host.c +++ b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_sync_host.c @@ -29,7 +29,7 @@ #include "vsi_nn_pub.h" #include "utils/vsi_nn_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "libnnext/vx_lib_nnext.h" #define _VX_KERNEL_VAR_CPU (vx_client_kernel_cpu_SYNC_HOST) diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_tensorstackconcat.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_tensorstackconcat.c index 5d5f1ea..9d2c936 100644 --- a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_tensorstackconcat.c +++ b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_tensorstackconcat.c @@ -36,7 +36,7 @@ #include "utils/vsi_nn_util.h" #include 
"utils/vsi_nn_dtype_util.h" #include "utils/vsi_nn_math.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "libnnext/vx_lib_nnext.h" void tensorStackConcatFunc diff --git a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_topk.c b/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_topk.c deleted file mode 100644 index 2fdf3bd..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/kernel/vsi_nn_kernel_topk.c +++ /dev/null @@ -1,266 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ -#include -#include -#include -#include - -#include "vsi_nn_platform.h" - -#include "vsi_nn_prv.h" -#include "vsi_nn_log.h" -#include "vsi_nn_test.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "utils/vsi_nn_dtype_util.h" -#include "utils/vsi_nn_math.h" -#include "client/vsi_nn_vxkernel.h" -#include "libnnext/vx_lib_nnext.h" - -#define _VX_KERNEL_VAR (vx_kernel_TOPK) -#define _VX_KERNEL_ID (VX_KERNEL_ENUM_TOPK) -#define _VX_KERNEL_NAME (VX_KERNEL_NAME_TOPK) -#define _VX_KERNEL_FUNC_KERNEL (vxTopkKernel) - -static uint32_t max_comp_func(void* data, int32_t left, int32_t right) -{ - float* fdata = (float*)data; - if (fdata[left] >= fdata[right]) - { - return TRUE; - } - else - { - return FALSE; - } -} - -static void find_top_k_1d -( - float* input, - uint32_t input_len, - uint32_t k, - float* value, - uint32_t* indices -) -{ - int32_t low = 0; - int32_t high = input_len - 1; - int32_t j; - - for (j = 0; j < (int32_t)input_len; j++) - { - indices[j] = j; - } - - j = vsi_nn_partition(input, low, high, max_comp_func, FALSE, indices); - - //part_sort - while (j != (int32_t)k) - { - if ((int32_t)k > j) - { - low = j + 1; - } - else - { - high = j; - } - j = vsi_nn_partition(input, low, high, max_comp_func, FALSE, indices); - } - //all_sort - vsi_nn_partition(input, 0, k - 1, max_comp_func, TRUE, indices); - - for (j = 0; j < (int32_t)k; j++) - { - value[j] = input[indices[j]]; - } -} - -static vsi_status VX_CALLBACK vxTopkKernel - ( - vx_node node, - const vx_reference* paramObj, - uint32_t paramNum - ) -{ -#define ARG_NUM (1) -#define TENSOR_NUM_INPUT (1) -#define TENSOR_NUM_OUTPUT (2) -#define TENSOR_NUM (TENSOR_NUM_INPUT+TENSOR_NUM_OUTPUT) - - vsi_status status = VSI_FAILURE; - vx_context context = NULL; - vx_tensor 
input[TENSOR_NUM_INPUT] = {0}; - vx_tensor output[TENSOR_NUM_OUTPUT] = {0}; - float *f32_in_buffer[TENSOR_NUM_INPUT] = {0}; - float *f32_out_buffer = NULL; - uint32_t *u32_out_buffer = NULL; - vsi_nn_tensor_attr_t in_attr[TENSOR_NUM_INPUT]; - vsi_nn_tensor_attr_t out_attr[TENSOR_NUM_OUTPUT]; - uint32_t in_elements[TENSOR_NUM_INPUT] = {0}; - uint32_t out_elements[TENSOR_NUM_OUTPUT]= {0}; - - int32_t top_k; - - uint32_t i = 0; - for(i = 0; i < TENSOR_NUM_INPUT; i++) - { - memset(&in_attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t)); - } - for(i = 0; i < TENSOR_NUM_OUTPUT; i++) - { - memset(&out_attr[i], 0x0, sizeof(vsi_nn_tensor_attr_t)); - } - /* prepare data */ - context = vxGetContext((vx_reference)node); - - for(i = 0; i < TENSOR_NUM_INPUT; i ++) - { - input[i] = (vx_tensor)paramObj[i]; - status = vsi_nn_vxGetTensorAttr(input[i], &in_attr[i]); - TEST_CHECK_STATUS(status, final); - in_elements[i] = vsi_nn_vxGetTensorElementNum(&in_attr[i]); - f32_in_buffer[i] = (float *)malloc(in_elements[i] * sizeof(float)); - status = vsi_nn_vxConvertTensorToFloat32Data( - context, input[i], &in_attr[i], f32_in_buffer[i], - in_elements[i] * sizeof(float)); - TEST_CHECK_STATUS(status, final); - } - for(i = 0; i < TENSOR_NUM_OUTPUT; i ++) - { - output[i] = (vx_tensor)paramObj[i + TENSOR_NUM_INPUT]; - status = vsi_nn_vxGetTensorAttr(output[i], &out_attr[i]); - TEST_CHECK_STATUS(status, final); - out_elements[i] = vsi_nn_vxGetTensorElementNum(&out_attr[i]); - } - f32_out_buffer = (float *)malloc(out_elements[0] * sizeof(float)); - u32_out_buffer = (uint32_t *)malloc(out_elements[1] * sizeof(uint32_t)); - vxCopyScalar((vx_scalar)paramObj[TENSOR_NUM], &(top_k), - VX_READ_ONLY, VX_MEMORY_TYPE_HOST); - - /* TODO: Add CPU kernel implement */ - { - uint32_t block_num = in_attr[0].size[1]; - uint32_t block_size = in_attr[0].size[0]; - uint32_t * indices = (uint32_t*)malloc(block_size * sizeof(uint32_t)); - - for(i = 0; i < block_num; i++) - { - uint32_t in_index = i * block_size; - uint32_t out_index = i * top_k; - find_top_k_1d(&(f32_in_buffer[0][in_index]), - block_size, top_k, &(f32_out_buffer[out_index]), indices); - memcpy(&(u32_out_buffer[out_index]), - indices, top_k * sizeof(uint32_t)); - } - // Handle the 1D input - if (!block_num) { - find_top_k_1d(&(f32_in_buffer[0][0]), - block_size, top_k, &(f32_out_buffer[0]), indices); - memcpy(&(u32_out_buffer[0]), - indices, top_k * sizeof(uint32_t)); - } - if (indices) free(indices); - } - - /* save data */ - status = vsi_nn_vxConvertFloat32DataToTensor( - context, output[0], &out_attr[0], f32_out_buffer, - out_elements[0] * sizeof(float)); - TEST_CHECK_STATUS(status, final); - vsi_nn_vxCopyDataToTensor(context, output[1], &out_attr[1], (uint8_t *)u32_out_buffer); - -final: - for (i = 0; i < TENSOR_NUM_INPUT; i++) - { - if (f32_in_buffer[i]) free(f32_in_buffer[i]); - } - if (f32_out_buffer) free(f32_out_buffer); - if (u32_out_buffer) free(u32_out_buffer); - return status; -} /* _VX_KERNEL_FUNC_KERNEL() */ - -static vx_param_description_t vxTopkKernelParam[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; - -vx_status VX_CALLBACK vxTopkInitializer - ( - vx_node nodObj, - const vx_reference *paramObj, - vx_uint32 paraNum - ) -{ - vx_status status = VX_SUCCESS; - /*TODO: Add initial code for VX program*/ - - return status; -} - - -#ifdef __cplusplus -extern "C" { -#endif 
-vx_kernel_description_t vxTopk_CPU = -{ - _VX_KERNEL_ID, - _VX_KERNEL_NAME, - _VX_KERNEL_FUNC_KERNEL, - vxTopkKernelParam, - _cnt_of_array( vxTopkKernelParam ), - vsi_nn_KernelValidator, - NULL, - NULL, - vsi_nn_KernelInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t vxTopk_VX = -{ - _VX_KERNEL_ID, - _VX_KERNEL_NAME, - NULL, - vxTopkKernelParam, - _cnt_of_array( vxTopkKernelParam ), - vsi_nn_KernelValidator, - NULL, - NULL, - vxTopkInitializer, - vsi_nn_KernelDeinitializer -}; - -vx_kernel_description_t * vx_kernel_TOPK_list[] = -{ - &vxTopk_CPU, - &vxTopk_VX, - NULL -}; -#ifdef __cplusplus -} -#endif diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/batchnorm_single.vx b/src/tim/vx/internal/src/libnnext/ops/vx/batchnorm_single.vx index 9ac4945..292e86a 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/batchnorm_single.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/batchnorm_single.vx @@ -9,7 +9,7 @@ _viv_uniform float output_scale; _viv_uniform float output_zp; #define BATCH_NORM_SH_IMPL(name0, name1, src_type, read_type, conv_type, dst_type, save_type) \ -__kernel void batch_norm_##name0##to##name1##_brdcst1( \ +__kernel void batch_norm_##name0##_F16_F16_F16_F32to##name1##_brdcst1( \ __read_only image2d_array_t input, \ __read_only image2d_array_t Mean, \ __read_only image2d_array_t Variance, \ @@ -73,7 +73,7 @@ BATCH_NORM_SH_IMPL(I8, I8, vxc_char16, vxc_char16, int4, vxc_char16, vxc_c BATCH_NORM_SH_IMPL(I8, F16, vxc_char16, vxc_char16, half4, vxc_half8, vxc_ushort8) #define BATCH_NORM_SH_IMPL_2D(name0, name1, src_type, read_type, conv_type, dst_type, save_type) \ -__kernel void batch_norm_##name0##to##name1##_brdcst1_2D( \ +__kernel void batch_norm_##name0##_F16_F16_F16_F32to##name1##_brdcst1_2D( \ __read_only image2d_array_t input, \ __read_only image2d_t Mean, \ __read_only image2d_t Variance, \ @@ -138,7 +138,7 @@ BATCH_NORM_SH_IMPL_2D(I8, F16, vxc_char16, vxc_char16, half4, vxc_half8, vx #define BATCH_NORM_SH_IMPL_AXIS1(name0, name1, src_type, read_type, conv_type, dst_type, save_type) \ -__kernel void batch_norm_##name0##to##name1##_brdcst0( \ +__kernel void batch_norm_##name0##_F16_F16_F16_F32to##name1##_brdcst0( \ __read_only image2d_array_t input, \ __read_only image2d_array_t Mean, \ __read_only image2d_array_t Variance, \ @@ -205,7 +205,7 @@ BATCH_NORM_SH_IMPL_AXIS1(I8, I8, vxc_char16, vxc_char16, int4, vxc_char16, BATCH_NORM_SH_IMPL_AXIS1(I8, F16, vxc_char16, vxc_char16, half4, vxc_half8, vxc_ushort8) #define BATCH_NORM_SH_IMPL_AXIS1_2D(name0, name1, src_type, read_type, conv_type, dst_type, save_type) \ -__kernel void batch_norm_##name0##to##name1##_brdcst0_2D( \ +__kernel void batch_norm_##name0##_F16_F16_F16_F32to##name1##_brdcst0_2D( \ __read_only image2d_array_t input, \ __read_only image2d_t Mean, \ __read_only image2d_t Variance, \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/batchnorm_single_f32.vx b/src/tim/vx/internal/src/libnnext/ops/vx/batchnorm_single_f32.vx new file mode 100644 index 0000000..e419457 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/batchnorm_single_f32.vx @@ -0,0 +1,267 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniDatatoF32_0_4x4; +_viv_uniform VXC_512Bits uniDatatoF32_1_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform float input_scale; +_viv_uniform float input_tail; +_viv_uniform float output_scale; +_viv_uniform float output_zp; + +#define BATCH_NORM_SH_IMPL(name0, name1, src_type, read_type, conv_type, dst_type, save_type) \ +__kernel void 
batch_norm_##name0##_F16_F16_F32_F32to##name1##_brdcst1( \ + __read_only image2d_array_t input, \ + __read_only image2d_array_t Mean, \ + __read_only image2d_array_t Variance, \ + __read_only image2d_array_t Gamma, \ + __read_only image2d_array_t Beta, \ + __write_only image2d_array_t output, \ + float eps \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + read_type vec; \ + src_type src; \ + VXC_ReadImage2DArray(vec, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src, vec, 16); \ + vxc_ushort8 _mean, _var; \ + vxc_half8 mean, var; \ + VXC_ReadImage2DArray(_mean, Mean, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, mean, _mean, 16); \ + VXC_ReadImage2DArray(_var, Variance, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, var, _var, 16); \ + float4 gamma0 = read_imagef(Gamma, coord); \ + coord.x += 4; \ + float4 gamma1 = read_imagef(Gamma, coord); \ + coord.x -= 4; \ + float4 beta = read_imagef(Beta, coord); \ + \ + float4 src0, src1, m, v; \ + VXC_DP4x4(src0, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \ + VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \ + VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \ + gamma0 = gamma0 * rsqrt(v + eps); \ + src0 = src0 * input_scale + input_tail; \ + src0 = (src0 - m) * gamma0 + beta.xxxx; \ + src0 = src0 * output_scale + output_zp; \ + VXC_DP4x4(src1, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \ + VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \ + VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \ + gamma1 = gamma1 * rsqrt(v + eps); \ + src1 = src1 * input_scale + input_tail; \ + src1 = (src1 - m) * gamma1 + beta.xxxx; \ + src1 = src1 * output_scale + output_zp; \ + \ + conv_type dst0, dst1; \ + _viv_asm(CONV_RTE, dst0, src0); \ + _viv_asm(CONV_RTE, dst1, src1); \ + dst_type tmp; \ + save_type dst; \ + VXC_DP2x8(tmp, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, tmp, 16); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +BATCH_NORM_SH_IMPL(F16, F16, vxc_half8, vxc_ushort8, half4, vxc_half8, vxc_ushort8) +BATCH_NORM_SH_IMPL(F16, I16, vxc_half8, vxc_ushort8, int4, vxc_short8, vxc_short8) +BATCH_NORM_SH_IMPL(F16, U8, vxc_half8, vxc_ushort8, int4, vxc_uchar16, vxc_uchar16) +BATCH_NORM_SH_IMPL(F16, I8, vxc_half8, vxc_ushort8, int4, vxc_char16, vxc_char16) +BATCH_NORM_SH_IMPL(I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) +BATCH_NORM_SH_IMPL(I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_ushort8) +BATCH_NORM_SH_IMPL(U8, U8, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16, vxc_uchar16) +BATCH_NORM_SH_IMPL(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_half8, vxc_ushort8) +BATCH_NORM_SH_IMPL(I8, I8, vxc_char16, vxc_char16, int4, vxc_char16, vxc_char16) +BATCH_NORM_SH_IMPL(I8, F16, vxc_char16, vxc_char16, half4, vxc_half8, vxc_ushort8) + +#define BATCH_NORM_SH_IMPL_2D(name0, name1, src_type, read_type, conv_type, dst_type, save_type) \ +__kernel void batch_norm_##name0##_F16_F16_F32_F32to##name1##_brdcst1_2D( \ + __read_only image2d_array_t input, \ + __read_only image2d_t Mean, \ + __read_only image2d_t Variance, \ + __read_only image2d_t Gamma, \ + __read_only image2d_t 
Beta, \ + __write_only image2d_array_t output, \ + float eps \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + read_type vec; \ + src_type src; \ + VXC_ReadImage(vec, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src, vec, 16); \ + coord.z = coord.x + 4; \ + vxc_ushort8 _mean, _var; \ + vxc_half8 mean, var; \ + VXC_ReadImage(_mean, Mean, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, mean, _mean, 16); \ + VXC_ReadImage(_var, Variance, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, var, _var, 16); \ + float4 gamma0 = read_imagef(Gamma, coord.xy); \ + float4 gamma1 = read_imagef(Gamma, coord.zy); \ + float4 beta = read_imagef(Beta, coord.xy); \ + \ + float4 src0, src1, m, v; \ + VXC_DP4x4(src0, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \ + VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \ + VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \ + gamma0 = gamma0 * rsqrt(v + eps); \ + src0 = src0 * input_scale + input_tail; \ + src0 = (src0 - m) * gamma0 + beta.xxxx; \ + src0 = src0 * output_scale + output_zp; \ + VXC_DP4x4(src1, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \ + VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \ + VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \ + gamma1 = gamma1 * rsqrt(v + eps); \ + src1 = src1 * input_scale + input_tail; \ + src1 = (src1 - m) * gamma1 + beta.xxxx; \ + src1 = src1 * output_scale + output_zp; \ + \ + conv_type dst0, dst1; \ + _viv_asm(CONV_RTE, dst0, src0); \ + _viv_asm(CONV_RTE, dst1, src1); \ + dst_type tmp; \ + save_type dst; \ + VXC_DP2x8(tmp, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, tmp, 16); \ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +BATCH_NORM_SH_IMPL_2D(F16, F16, vxc_half8, vxc_ushort8, half4, vxc_half8, vxc_ushort8) +BATCH_NORM_SH_IMPL_2D(F16, I16, vxc_half8, vxc_ushort8, int4, vxc_short8, vxc_short8) +BATCH_NORM_SH_IMPL_2D(F16, U8, vxc_half8, vxc_ushort8, int4, vxc_uchar16, vxc_uchar16) +BATCH_NORM_SH_IMPL_2D(F16, I8, vxc_half8, vxc_ushort8, int4, vxc_char16, vxc_char16) +BATCH_NORM_SH_IMPL_2D(I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) +BATCH_NORM_SH_IMPL_2D(I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_ushort8) +BATCH_NORM_SH_IMPL_2D(U8, U8, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16, vxc_uchar16) +BATCH_NORM_SH_IMPL_2D(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_half8, vxc_ushort8) +BATCH_NORM_SH_IMPL_2D(I8, I8, vxc_char16, vxc_char16, int4, vxc_char16, vxc_char16) +BATCH_NORM_SH_IMPL_2D(I8, F16, vxc_char16, vxc_char16, half4, vxc_half8, vxc_ushort8) + + +#define BATCH_NORM_SH_IMPL_AXIS1(name0, name1, src_type, read_type, conv_type, dst_type, save_type) \ +__kernel void batch_norm_##name0##_F16_F16_F32_F32to##name1##_brdcst0( \ + __read_only image2d_array_t input, \ + __read_only image2d_array_t Mean, \ + __read_only image2d_array_t Variance, \ + __read_only image2d_array_t Gamma, \ + __read_only image2d_array_t Beta, \ + __write_only image2d_array_t output, \ + float eps \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + read_type vec; \ + src_type src; \ + 
VXC_ReadImage2DArray(vec, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src, vec, 16); \ + vxc_ushort8 _mean, _var; \ + vxc_half8 mean, var; \ + VXC_ReadImage2DArray(_mean, Mean, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, mean, _mean, 16); \ + VXC_ReadImage2DArray(_var, Variance, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, var, _var, 16); \ + float4 gamma0 = read_imagef(Gamma, coord); \ + float4 beta0 = read_imagef(Beta, coord); \ + coord.x += 4; \ + float4 gamma1 = read_imagef(Gamma, coord); \ + float4 beta1 = read_imagef(Beta, coord); \ + coord.x -= 4; \ + \ + float4 src0, src1, m, v; \ + VXC_DP4x4(src0, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \ + VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \ + VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \ + gamma0 = gamma0 * rsqrt(v + eps); \ + src0 = src0 * input_scale + input_tail; \ + src0 = (src0 - m) * gamma0 + beta0; \ + src0 = src0 * output_scale + output_zp; \ + VXC_DP4x4(src1, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \ + VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \ + VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \ + gamma1 = gamma1 * rsqrt(v + eps); \ + src1 = src1 * input_scale + input_tail; \ + src1 = (src1 - m) * gamma1 + beta1; \ + src1 = src1 * output_scale + output_zp; \ + \ + conv_type dst0, dst1; \ + _viv_asm(CONV_RTE, dst0, src0); \ + _viv_asm(CONV_RTE, dst1, src1); \ + dst_type tmp; \ + save_type dst; \ + VXC_DP2x8(tmp, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, tmp, 16); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +BATCH_NORM_SH_IMPL_AXIS1(F16, F16, vxc_half8, vxc_ushort8, half4, vxc_half8, vxc_ushort8) +BATCH_NORM_SH_IMPL_AXIS1(F16, I16, vxc_half8, vxc_ushort8, int4, vxc_short8, vxc_short8) +BATCH_NORM_SH_IMPL_AXIS1(F16, U8, vxc_half8, vxc_ushort8, int4, vxc_uchar16, vxc_uchar16) +BATCH_NORM_SH_IMPL_AXIS1(F16, I8, vxc_half8, vxc_ushort8, int4, vxc_char16, vxc_char16) +BATCH_NORM_SH_IMPL_AXIS1(I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) +BATCH_NORM_SH_IMPL_AXIS1(I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_ushort8) +BATCH_NORM_SH_IMPL_AXIS1(U8, U8, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16, vxc_uchar16) +BATCH_NORM_SH_IMPL_AXIS1(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_half8, vxc_ushort8) +BATCH_NORM_SH_IMPL_AXIS1(I8, I8, vxc_char16, vxc_char16, int4, vxc_char16, vxc_char16) +BATCH_NORM_SH_IMPL_AXIS1(I8, F16, vxc_char16, vxc_char16, half4, vxc_half8, vxc_ushort8) + +#define BATCH_NORM_SH_IMPL_AXIS1_2D(name0, name1, src_type, read_type, conv_type, dst_type, save_type) \ +__kernel void batch_norm_##name0##_F16_F16_F32_F32to##name1##_brdcst0_2D( \ + __read_only image2d_array_t input, \ + __read_only image2d_t Mean, \ + __read_only image2d_t Variance, \ + __read_only image2d_t Gamma, \ + __read_only image2d_t Beta, \ + __write_only image2d_array_t output, \ + float eps \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + read_type vec; \ + src_type src; \ + VXC_ReadImage(vec, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src, vec, 16); \ + coord.z += 4; \ + 
vxc_ushort8 _mean, _var; \ + vxc_half8 mean, var; \ + VXC_ReadImage(_mean, Mean, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, mean, _mean, 16); \ + VXC_ReadImage(_var, Variance, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, var, _var, 16); \ + float4 gamma0 = read_imagef(Gamma, coord.xy); \ + float4 gamma1 = read_imagef(Gamma, coord.zy); \ + float4 beta0 = read_imagef(Beta, coord.xy); \ + float4 beta1 = read_imagef(Beta, coord.zy); \ + \ + float4 src0, src1, m, v; \ + VXC_DP4x4(src0, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \ + VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \ + VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \ + gamma0 = gamma0 * rsqrt(v + eps); \ + src0 = src0 * input_scale + input_tail; \ + src0 = (src0 - m) * gamma0 + beta0; \ + src0 = src0 * output_scale + output_zp; \ + VXC_DP4x4(src1, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \ + VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \ + VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \ + gamma1 = gamma1 * rsqrt(v + eps); \ + src1 = src1 * input_scale + input_tail; \ + src1 = (src1 - m) * gamma1 + beta1; \ + src1 = src1 * output_scale + output_zp; \ + \ + conv_type dst0, dst1; \ + _viv_asm(CONV_RTE, dst0, src0); \ + _viv_asm(CONV_RTE, dst1, src1); \ + dst_type tmp; \ + save_type dst; \ + VXC_DP2x8(tmp, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, tmp, 16); \ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +BATCH_NORM_SH_IMPL_AXIS1_2D(F16, F16, vxc_half8, vxc_ushort8, half4, vxc_half8, vxc_ushort8) +BATCH_NORM_SH_IMPL_AXIS1_2D(F16, I16, vxc_half8, vxc_ushort8, int4, vxc_short8, vxc_short8) +BATCH_NORM_SH_IMPL_AXIS1_2D(F16, U8, vxc_half8, vxc_ushort8, int4, vxc_uchar16, vxc_uchar16) +BATCH_NORM_SH_IMPL_AXIS1_2D(F16, I8, vxc_half8, vxc_ushort8, int4, vxc_char16, vxc_char16) +BATCH_NORM_SH_IMPL_AXIS1_2D(I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) +BATCH_NORM_SH_IMPL_AXIS1_2D(I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_ushort8) +BATCH_NORM_SH_IMPL_AXIS1_2D(U8, U8, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16, vxc_uchar16) +BATCH_NORM_SH_IMPL_AXIS1_2D(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_half8, vxc_ushort8) +BATCH_NORM_SH_IMPL_AXIS1_2D(I8, I8, vxc_char16, vxc_char16, int4, vxc_char16, vxc_char16) +BATCH_NORM_SH_IMPL_AXIS1_2D(I8, F16, vxc_char16, vxc_char16, half4, vxc_half8, vxc_ushort8) + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/conv1d_ovxlib.vx b/src/tim/vx/internal/src/libnnext/ops/vx/conv1d_ovxlib.vx new file mode 100644 index 0000000..6c67421 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/conv1d_ovxlib.vx @@ -0,0 +1,151 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConv1DK3_Lo0_4x4; +_viv_uniform VXC_512Bits uniConv1DK3_Lo1_4x4; +_viv_uniform VXC_512Bits uniConv1DK3_Lo2_4x4; +_viv_uniform VXC_512Bits uniConv1DK3_Hi0_4x4; +_viv_uniform VXC_512Bits uniConv1DK3_Hi1_4x4; +_viv_uniform VXC_512Bits uniConv1DK3_Hi2_4x4; +_viv_uniform VXC_512Bits uniDataConvK3_2x8; +_viv_uniform VXC_512Bits uniSumOrderUchar_2x8; + +_viv_uniform int input_ZP; +_viv_uniform int weight_ZP; +_viv_uniform float output_ZP; +_viv_uniform float scaleOut; +_viv_uniform int 
input_height; + +__kernel void conv1d_U8U8I32toU8_K3_S1( + __read_only image2d_array_t input, + __read_only image2d_array_t weight, + __read_only image2d_t bias, + __write_only image2d_array_t output, + int stride, + int pad_front, + int pad_end, + int dilation, + int overflow_policy) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_w = (int4)(0, 0, get_global_id(1), 0); + float4 sum0, sum1, dst; + vxc_short8 weight_val_s =(short)input_ZP; + vxc_uchar16 input_val = 0, weight_val = 0; + int temp = 0, i; + + temp = read_imagei(bias, coord.yz).x; + sum0 = convert_float(temp); + sum1 = sum0; + weight_val_s.s5 = (short)weight_ZP; + + for (i = 0; i < input_height; i++) + { + VXC_ReadImage2DArray(weight_val, weight, coord_w, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(input_val, input, coord.xz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(weight_val_s, weight_val, weight_val_s, \ + VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0), uniDataConvK3_2x8); + + VXC_DP4x4(dst, input_val, weight_val_s, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConv1DK3_Lo0_4x4); + sum0 += dst; + VXC_DP4x4(dst, input_val, weight_val_s, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConv1DK3_Hi0_4x4); + sum1 += dst; + coord.x += dilation; + VXC_ReadImage(input_val, input, coord.xz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP4x4(dst, input_val, weight_val_s, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConv1DK3_Lo1_4x4); + sum0 += dst; + VXC_DP4x4(dst, input_val, weight_val_s, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConv1DK3_Hi1_4x4); + sum1 += dst; + coord.x += dilation; + VXC_ReadImage(input_val, input, coord.xz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP4x4(dst, input_val, weight_val_s, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConv1DK3_Lo2_4x4); + sum0 += dst; + VXC_DP4x4(dst, input_val, weight_val_s, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConv1DK3_Hi2_4x4); + sum1 += dst; + coord_w.y++; + coord.z++; + coord.x = get_global_id(0); + } + + sum0 = sum0 * scaleOut + output_ZP; + sum1 = sum1 * scaleOut + output_ZP; + uchar4 result0, result1; + _viv_asm(CONV_SAT_RTE, result0, sum0); + _viv_asm(CONV_SAT_RTE, result1, sum1); + vxc_uchar8 result; + VXC_DP2x8(result, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumOrderUchar_2x8); + VXC_WriteImage(output, coord.xy, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void conv1d_U8U8I32toU8_K3_S1_D2_D4( + __read_only image2d_array_t input, + __read_only image2d_array_t weight, + __read_only image2d_t bias, + __write_only image2d_array_t output, + int stride, + int pad_front, + int pad_end, + int dilation, + int overflow_policy) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + int4 coord_w = (int4)(0, 0, get_global_id(1), 0); + float4 sum0, sum1, dst; + vxc_short8 weight_val_s =(short)input_ZP; + vxc_uchar16 input_val = 0, weight_val = 0; + int temp = 0, i; + + temp = read_imagei(bias, coord.yz).x; + sum0 = convert_float(temp); + sum1 = sum0; + weight_val_s.s5 = (short)weight_ZP; + + for (i = 0; i < input_height; i++) + { + VXC_ReadImage2DArray(weight_val, weight, coord_w, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(input_val, input, coord.xz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + 
VXC_DP2x8(weight_val_s, weight_val, weight_val_s, \ + VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0), uniDataConvK3_2x8); + + VXC_DP4x4(dst, input_val, weight_val_s, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConv1DK3_Lo0_4x4); + sum0 += dst; + VXC_DP4x4(dst, input_val, weight_val_s, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConv1DK3_Hi0_4x4); + sum1 += dst; + VXC_DP4x4(dst, input_val, weight_val_s, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConv1DK3_Lo1_4x4); + sum0 += dst; + VXC_DP4x4(dst, input_val, weight_val_s, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConv1DK3_Hi1_4x4); + sum1 += dst; + VXC_DP4x4(dst, input_val, weight_val_s, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConv1DK3_Lo2_4x4); + sum0 += dst; + VXC_DP4x4(dst, input_val, weight_val_s, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConv1DK3_Hi2_4x4); + sum1 += dst; + coord_w.y++; + coord.z++; + } + + sum0 = sum0 * scaleOut + output_ZP; + sum1 = sum1 * scaleOut + output_ZP; + uchar4 result0, result1; + _viv_asm(CONV_SAT_RTE, result0, sum0); + _viv_asm(CONV_SAT_RTE, result1, sum1); + vxc_uchar8 result; + VXC_DP2x8(result, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumOrderUchar_2x8); + VXC_WriteImage(output, coord.xy, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/conv1d_ovxlib_k1024.vx b/src/tim/vx/internal/src/libnnext/ops/vx/conv1d_ovxlib_k1024.vx new file mode 100644 index 0000000..f6ac4ce --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/conv1d_ovxlib_k1024.vx @@ -0,0 +1,167 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniU8SubZp_lo_2x8; +_viv_uniform VXC_512Bits uniU8SubZp_hi_2x8; +_viv_uniform VXC_512Bits uniU8Conv1d_part0_8x2; +_viv_uniform VXC_512Bits uniU8Conv1d_part1_8x2; +_viv_uniform VXC_512Bits uniU8Conv1d_part2_8x2; +_viv_uniform VXC_512Bits uniU8Conv1d_part3_8x2; +_viv_uniform VXC_512Bits uniSumOrderUchar_2x8; + +_viv_uniform int kernel_cnt_x16; +_viv_uniform int weight_ZP; +_viv_uniform float output_ZP; +_viv_uniform float scaleOut; +_viv_uniform int input_height; +_viv_uniform int input_width; +_viv_uniform int output_width; + +__kernel void conv1d_U8U8I32toU8_K1024_SMALL( + __read_only image2d_array_t input, + __read_only image2d_array_t weight, + __read_only image2d_t bias, + __write_only image2d_array_t output, + int stride, + int pad_front, + int pad_end, + int dilation, + int overflow_policy) +{ + int start_x = get_global_id(0) - pad_front; + int4 coord = (int4)(start_x, get_global_id(1), 0, get_global_id(0)); + int4 coord_w = (int4)(0, 0, get_global_id(1), 0); + float4 sum0, sum1, dst; + vxc_short8 coef; + vxc_short8 w_zp = (short)weight_ZP; + vxc_uchar16 input_val = 0, weight_val = 0; + int temp = 0, i, j; + + temp = read_imagei(bias, coord.yz).x; + sum0 = convert_float(temp); + sum1 = sum0; + + for (i = 0; i < input_height; i++) + { + for (j = 0; j < kernel_cnt_x16; j++) + { + VXC_ReadImage2DArray(weight_val, weight, coord_w, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(input_val, input, coord.xz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(coef, weight_val, w_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8); + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part0_8x2); + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part1_8x2); + sum0 += dst; + 
VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part2_8x2); + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part3_8x2); + sum1 += dst; + VXC_ReadImage(input_val, input, coord.xz, VXC_5BITOFFSET_XY(8, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(coef, weight_val, w_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_hi_2x8); + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part0_8x2); + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part1_8x2); + sum0 += dst; + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part2_8x2); + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part3_8x2); + sum1 += dst; + coord_w.x += 16; + coord.x += 16; + } + coord_w.x = 0; + coord_w.y++; + coord.z++; + coord.x = start_x; + } + + sum0 = sum0 * scaleOut + output_ZP; + sum1 = sum1 * scaleOut + output_ZP; + uchar4 result0, result1; + _viv_asm(CONV_SAT_RTE, result0, sum0); + _viv_asm(CONV_SAT_RTE, result1, sum1); + vxc_uchar8 result; + VXC_DP2x8(result, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumOrderUchar_2x8); + VXC_WriteImage(output, coord.wy, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +inline uchar* get_image2D_array_ptr(image2d_array_t input) +{ + int8 desc; + _viv_asm(COPY, desc, input, sizeof(desc)); + uchar *src_ptr = (uchar*)desc.s0; + return src_ptr; +} + +__kernel void conv1d_U8U8I32toU8_K1024_LARGE( + __read_only image2d_array_t input, + __read_only image2d_array_t weight, + __read_only image2d_t bias, + __write_only image2d_array_t output, + int stride, + int pad_front, + int pad_end, + int dilation, + int overflow_policy) +{ + int start_x = get_global_id(0); + int w_left = output_width - start_x; + int out_x = w_left < 8 ? 
get_global_id(0) - w_left : get_global_id(0); + int4 coord = (int4)(start_x, get_global_id(1), 0, out_x); + int4 coord_w = (int4)(0, 0, get_global_id(1), 0); + float4 sum0, sum1, dst; + vxc_short8 coef; + vxc_short8 w_zp = (short)weight_ZP; + vxc_uchar16 input_val = 0, weight_val = 0; + int temp = 0, i, j; + uchar *src_ptr_base = (uchar *)get_image2D_array_ptr(input); + uchar *src_ptr; + uchar *dst_ptr = (uchar *)get_image2D_array_ptr(output); + + temp = read_imagei(bias, coord.yz).x; + sum0 = convert_float(temp); + sum1 = sum0; + + for (i = 0; i < input_height; i++) + { + src_ptr = src_ptr_base + (coord.x + coord.z * input_width); + for (j = 0; j < kernel_cnt_x16; j++) + { + VXC_ReadImage2DArray(weight_val, weight, coord_w, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_Vload16(input_val, src_ptr, 0); + VXC_DP2x8(coef, weight_val, w_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8); + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part0_8x2); + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part1_8x2); + sum0 += dst; + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part2_8x2); + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part3_8x2); + sum1 += dst; + src_ptr += 8; + VXC_Vload16(input_val, src_ptr, 0); + VXC_DP2x8(coef, weight_val, w_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_hi_2x8); + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part0_8x2); + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part1_8x2); + sum0 += dst; + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part2_8x2); + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part3_8x2); + sum1 += dst; + coord_w.x += 16; + coord.x += 16; + src_ptr += 8; + } + coord_w.x = 0; + coord_w.y++; + coord.z++; + coord.x = start_x; + } + + sum0 = sum0 * scaleOut + output_ZP; + sum1 = sum1 * scaleOut + output_ZP; + uchar4 result0, result1; + _viv_asm(CONV_SAT_RTE, result0, sum0); + _viv_asm(CONV_SAT_RTE, result1, sum1); + vxc_uchar8 result; + VXC_DP2x8(result, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumOrderUchar_2x8); + dst_ptr = dst_ptr + (coord.w + coord.y * output_width); + VXC_Vstore8(dst_ptr, 0, result); +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/depth2space_crd.vx b/src/tim/vx/internal/src/libnnext/ops/vx/depth2space_crd.vx index 0d4ac70..a5612b4 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/depth2space_crd.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/depth2space_crd.vx @@ -2,21 +2,24 @@ _viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp _viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8; +_viv_uniform VXC_512Bits uniU8MulAndPostShift_ExLo_2x8; +_viv_uniform VXC_512Bits uniU8MulAndPostShift_ExHi_2x8; +_viv_uniform VXC_512Bits uniDepth2SpaceF16Blk2_lo_2x8; +_viv_uniform VXC_512Bits uniDepth2SpaceF16Blk2_hi_2x8; + #define DEPTH2SPACE_CRD_QINT_TO_QINT(src0_type_name, src1_type_name, read_type, write_type) \ __kernel void depth2space_crd_##src0_type_name##to##src1_type_name( \ - image2d_array_t input, \ - image2d_array_t output, \ - int block_size \ - ) \ + image2d_array_t input, image2d_array_t output, int block_size) \ { \ int gidx = get_global_id(0); \ int gidy = 
get_global_id(1); \ int gidz = get_global_id(2); \ int4 coord_out = (int4)(gidx, gidy, gidz, 0); \ int block_e2 = block_size * block_size; \ - int inx = gidx / block_size; \ - int iny = gidy / block_size; \ + ushort blk = (ushort)block_size; \ + int inx = (int)((ushort)gidx / blk); \ + int iny = (int)((ushort)gidy / blk); \ int inz = (gidx % block_size) + (gidy % block_size) * block_size + gidz * block_e2; \ int4 coord_in = (int4)(inx, iny, inz, 0); \ read_type src; \ @@ -33,18 +36,16 @@ DEPTH2SPACE_CRD_QINT_TO_QINT(I8, I8, vxc_char16, vxc_char16) DEPTH2SPACE_CRD_QINT_TO_QINT(I16, I16, vxc_short8, vxc_short8) __kernel void depth2space_crd_F16toF16( - image2d_array_t input, - image2d_array_t output, - int block_size - ) + image2d_array_t input, image2d_array_t output, int block_size) { int gidx = get_global_id(0); int gidy = get_global_id(1); int gidz = get_global_id(2); int4 coord_out = (int4)(gidx, gidy, gidz, 0); int block_e2 = block_size * block_size; - int inx = gidx / block_size; - int iny = gidy / block_size; + ushort blk = (ushort)block_size; + int inx = (int)((ushort)gidx / blk); + int iny = (int)((ushort)gidy / blk); int inz = (gidx % block_size) + (gidy % block_size) * block_size + gidz * block_e2; int4 coord_in = (int4)(inx, iny, inz, 0); vxc_short8 data; @@ -54,18 +55,16 @@ __kernel void depth2space_crd_F16toF16( #define DEPTH2SPACE_CRD_QINT_TO_F16(src0_type_name, read_type) \ __kernel void depth2space_crd_##src0_type_name##toF16( \ - image2d_array_t input, \ - image2d_array_t output, \ - int block_size \ - ) \ + image2d_array_t input, image2d_array_t output, int block_size) \ { \ int gidx = get_global_id(0); \ int gidy = get_global_id(1); \ int gidz = get_global_id(2); \ int4 coord_out = (int4)(gidx, gidy, gidz, 0); \ int block_e2 = block_size * block_size; \ - int inx = gidx / block_size; \ - int iny = gidy / block_size; \ + ushort blk = (ushort)block_size; \ + int inx = (int)((ushort)gidx / blk); \ + int iny = (int)((ushort)gidy / blk); \ int inz = (gidx % block_size) + (gidy % block_size) * block_size + gidz * block_e2; \ int4 coord_in = (int4)(inx, iny, inz, 0); \ read_type src; \ @@ -85,18 +84,16 @@ DEPTH2SPACE_CRD_QINT_TO_F16(I16, vxc_short8) #define DEPTH2SPACE_CRD_F16_TO_QINT(src1_type_name, write_type) \ __kernel void depth2space_crd_F16to##src1_type_name( \ - image2d_array_t input, \ - image2d_array_t output, \ - int block_size \ - ) \ + image2d_array_t input, image2d_array_t output, int block_size) \ { \ int gidx = get_global_id(0); \ int gidy = get_global_id(1); \ int gidz = get_global_id(2); \ int4 coord_out = (int4)(gidx, gidy, gidz, 0); \ int block_e2 = block_size * block_size; \ - int inx = gidx / block_size; \ - int iny = gidy / block_size; \ + ushort blk = (ushort)block_size; \ + int inx = (int)((ushort)gidx / blk); \ + int iny = (int)((ushort)gidy / blk); \ int inz = (gidx % block_size) + (gidy % block_size) * block_size + gidz * block_e2; \ int4 coord_in = (int4)(inx, iny, inz, 0); \ vxc_short8 src; \ @@ -112,4 +109,199 @@ __kernel void depth2space_crd_F16to##src1_type_name( \ } DEPTH2SPACE_CRD_F16_TO_QINT(U8, vxc_uchar16) DEPTH2SPACE_CRD_F16_TO_QINT(I8, vxc_char16) -DEPTH2SPACE_CRD_F16_TO_QINT(I16, vxc_short8) \ No newline at end of file +DEPTH2SPACE_CRD_F16_TO_QINT(I16, vxc_short8) + +#define DEPTH2SPACE_CRD_QINT_TO_QINT_BLK2(src0_type_name, src1_type_name, read_type, write_type) \ +__kernel void depth2space_crd_##src0_type_name##to##src1_type_name##_blk2( \ + image2d_array_t input, image2d_array_t output, int block_size) \ +{ \ + int gidy = 
get_global_id(1); \ + int gidz = get_global_id(2); \ + int4 coord_out = (int4)(get_global_id(0), gidy, gidz, gidz); \ + int4 coord_in = coord_out >> 1; \ + coord_in.z = (gidy & 1) * 2 + gidz * 4; \ + coord_in.w = coord_in.z + 1; \ + read_type src; \ + VXC_ReadImage2DArray(src, input, coord_in.xyzz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(src, input, coord_in.xyww, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + write_type dst; \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + VXC_DP2x8(dst, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniU8MulAndPostShift_ExLo_2x8); \ + VXC_DP2x8(dst, src, ms0, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniU8MulAndPostShift_ExHi_2x8); \ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ +} +DEPTH2SPACE_CRD_QINT_TO_QINT_BLK2(U8, U8, vxc_uchar16, vxc_uchar16) +DEPTH2SPACE_CRD_QINT_TO_QINT_BLK2(I8, I8, vxc_char16, vxc_char16) + +__kernel void depth2space_crd_F16toF16_blk2( + image2d_array_t input, image2d_array_t output, int block_size) +{ + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord_out = (int4)(get_global_id(0), gidy, gidz, gidz); + int4 coord_in = coord_out >> 1; + coord_in.z = (gidy & 1) * 2 + gidz * 4; + coord_in.w = coord_in.z + 1; + vxc_short8 data0, data1, dst0, dst1; + VXC_ReadImage2DArray(data0, input, coord_in.xyzz, + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(data1, input, coord_in.xyww, + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(dst0, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniDepth2SpaceF16Blk2_lo_2x8); + VXC_DP2x8(dst1, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniDepth2SpaceF16Blk2_hi_2x8); + + VXC_WriteImage2DArray(output, coord_out, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_out.x += 8; + VXC_WriteImage2DArray(output, coord_out, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void depth2space_crd_I16toI16_blk2( + image2d_array_t input, image2d_array_t output, int block_size) +{ + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord_out = (int4)(get_global_id(0), gidy, gidz, gidz); + int4 coord_in = coord_out >> 1; + coord_in.z = (gidy & 1) * 2 + gidz * 4; + coord_in.w = coord_in.z + 1; + vxc_short8 src0, src1, data0, data1, dst0, dst1; + VXC_ReadImage2DArray(src0, input, coord_in.xyzz, + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input, coord_in.xyww, + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(data0, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniDepth2SpaceF16Blk2_lo_2x8); + VXC_DP2x8(data1, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniDepth2SpaceF16Blk2_hi_2x8); + + vxc_ushort8 ms0; + _viv_asm(COPY, ms0, multAndoutZP0, 16); + VXC_DP2x8(dst0, data0, ms0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 1), uniU8MulAndPostShift_0_Lo_2x8); + VXC_DP2x8(dst1, data1, ms0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 1), uniU8MulAndPostShift_0_Lo_2x8); + + VXC_WriteImage2DArray(output, coord_out, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_out.x += 8; + VXC_WriteImage2DArray(output, coord_out, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +#define DEPTH2SPACE_CRD_QINT_TO_F16_BLK2(src0_type_name, read_type) \ +__kernel void 
depth2space_crd_##src0_type_name##toF16_blk2( \ + image2d_array_t input, image2d_array_t output, int block_size) \ +{ \ + int gidy = get_global_id(1); \ + int gidz = get_global_id(2); \ + int4 coord_out = (int4)(get_global_id(0), gidy, gidz, gidz); \ + int4 coord_in = coord_out >> 1; \ + coord_in.z = (gidy & 1) * 2 + gidz * 4; \ + coord_in.w = coord_in.z + 1; \ + read_type src; \ + VXC_ReadImage2DArray(src, input, coord_in.xyzz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(src, input, coord_in.xyww, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + vxc_half8 tmpDst0, tmpDst1; \ + vxc_short8 dst0, dst1; \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + VXC_DP2x8(tmpDst0, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniU8MulAndPostShift_ExLo_2x8); \ + VXC_DP2x8(tmpDst1, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniU8MulAndPostShift_ExHi_2x8); \ + _viv_asm(COPY, dst0, tmpDst0, 16); \ + _viv_asm(COPY, dst1, tmpDst1, 16); \ + VXC_WriteImage2DArray(output, coord_out, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_out.x+=8; \ + VXC_WriteImage2DArray(output, coord_out, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +DEPTH2SPACE_CRD_QINT_TO_F16_BLK2(U8, vxc_uchar16) +DEPTH2SPACE_CRD_QINT_TO_F16_BLK2(I8, vxc_char16) + +__kernel void depth2space_crd_I16toF16_blk2( + image2d_array_t input, image2d_array_t output, int block_size) +{ + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord_out = (int4)(get_global_id(0), gidy, gidz, gidz); + int4 coord_in = coord_out >> 1; + coord_in.z = (gidy & 1) * 2 + gidz * 4; + coord_in.w = coord_in.z + 1; + vxc_short8 src0, src1, data0, data1, dst0, dst1; + vxc_half8 tmpDst0, tmpDst1; + VXC_ReadImage2DArray(src0, input, coord_in.xyzz, + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input, coord_in.xyww, + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(data0, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniDepth2SpaceF16Blk2_lo_2x8); + VXC_DP2x8(data1, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniDepth2SpaceF16Blk2_hi_2x8); + + vxc_ushort8 ms0; + _viv_asm(COPY, ms0, multAndoutZP0, 16); + VXC_DP2x8(tmpDst0, data0, ms0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 1), uniU8MulAndPostShift_0_Lo_2x8); + VXC_DP2x8(tmpDst1, data1, ms0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 1), uniU8MulAndPostShift_0_Lo_2x8); + _viv_asm(COPY, dst0, tmpDst0, 16); + _viv_asm(COPY, dst1, tmpDst1, 16); + VXC_WriteImage2DArray(output, coord_out, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_out.x += 8; + VXC_WriteImage2DArray(output, coord_out, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +#define DEPTH2SPACE_CRD_F16_TO_QINT_BLK2(src1_type_name, write_type) \ +__kernel void depth2space_crd_F16to##src1_type_name##_blk2( \ + image2d_array_t input, image2d_array_t output, int block_size) \ +{ \ + int gidy = get_global_id(1); \ + int gidz = get_global_id(2); \ + int4 coord_out = (int4)(get_global_id(0), gidy, gidz, gidz); \ + int4 coord_in = coord_out >> 1; \ + coord_in.z = (gidy & 1) * 2 + gidz * 4; \ + coord_in.w = coord_in.z + 1; \ + vxc_short8 src0, src1, data0, data1; \ + vxc_half8 tmpDst0, tmpDst1; \ + VXC_ReadImage2DArray(src0, input, coord_in.xyzz, \ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage2DArray(src1, input, coord_in.xyww, \ + 
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(data0, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniDepth2SpaceF16Blk2_lo_2x8); \ + VXC_DP2x8(data1, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniDepth2SpaceF16Blk2_hi_2x8); \ + \ + write_type dst; \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + _viv_asm(COPY, tmpDst0, data0, 16); \ + _viv_asm(COPY, tmpDst1, data1, 16); \ + VXC_DP2x8(dst, tmpDst0, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniU8MulAndPostShift_0_Lo_2x8); \ + VXC_DP2x8(dst, tmpDst1, ms0, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniU8MulAndPostShift_0_Lo_2x8); \ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ +} +DEPTH2SPACE_CRD_F16_TO_QINT_BLK2(U8, vxc_uchar16) +DEPTH2SPACE_CRD_F16_TO_QINT_BLK2(I8, vxc_char16) + +__kernel void depth2space_crd_F16toI16_blk2( + image2d_array_t input, image2d_array_t output, int block_size) +{ + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord_out = (int4)(get_global_id(0), gidy, gidz, gidz); + int4 coord_in = coord_out >> 1; + coord_in.z = (gidy & 1) * 2 + gidz * 4; + coord_in.w = coord_in.z + 1; + vxc_short8 src0, src1, data0, data1, dst0, dst1; + vxc_half8 tmpDst0, tmpDst1; + VXC_ReadImage2DArray(src0, input, coord_in.xyzz, + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage2DArray(src1, input, coord_in.xyww, + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + vxc_ushort8 ms0; + _viv_asm(COPY, ms0, multAndoutZP0, 16); + VXC_DP2x8(data0, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniDepth2SpaceF16Blk2_lo_2x8); + VXC_DP2x8(data1, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniDepth2SpaceF16Blk2_hi_2x8); + _viv_asm(COPY, tmpDst0, data0, 16); + _viv_asm(COPY, tmpDst1, data1, 16); + VXC_DP2x8(dst0, tmpDst0, ms0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 1), uniU8MulAndPostShift_0_Lo_2x8); + VXC_DP2x8(dst1, tmpDst1, ms0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 1), uniU8MulAndPostShift_0_Lo_2x8); + + VXC_WriteImage2DArray(output, coord_out, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_out.x += 8; + VXC_WriteImage2DArray(output, coord_out, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d.vx index 8b03b5c..a8c4583 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d.vx @@ -67,6 +67,11 @@ float4 eltwise_unary_mish(float4 x) return x; } +float4 eltwise_unary_round(float4 x) +{ + return convert_float4(convert_int4_rte(x)); +} + _viv_uniform float inputScale; _viv_uniform float inputTail; _viv_uniform float outputScale; @@ -187,7 +192,17 @@ ELTSISE_UNARY_2D(hard_sigmoid, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_ucha ELTSISE_UNARY_2D(hard_sigmoid, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) ELTSISE_UNARY_2D(hard_sigmoid, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) ELTSISE_UNARY_2D(hard_sigmoid, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) - +//ROUND +ELTSISE_UNARY_2D(round, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_2D(round, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_2D(round, F16, U8, vxc_short8, vxc_half8, int4, 
vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_2D(round, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_2D(round, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_2D(round, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_2D(round, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_2D(round, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_2D(round, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_2D(round, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) _viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; _viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; @@ -235,3 +250,5 @@ ELTSISE_UNARY_BF16_2D(neg) ELTSISE_UNARY_BF16_2D(mish) //HARD_SIGMOID ELTSISE_UNARY_BF16_2D(hard_sigmoid) +//ROUND +ELTSISE_UNARY_BF16_2D(round) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d.vx index f452849..393e4a0 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d.vx @@ -67,6 +67,11 @@ float4 eltwise_unary_mish(float4 x) return x; } +float4 eltwise_unary_round(float4 x) +{ + return convert_float4(convert_int4_rte(x)); +} + _viv_uniform float inputScale; _viv_uniform float inputTail; _viv_uniform float outputScale; @@ -187,6 +192,17 @@ ELTSISE_UNARY_3D(hard_sigmoid, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_ucha ELTSISE_UNARY_3D(hard_sigmoid, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) ELTSISE_UNARY_3D(hard_sigmoid, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) ELTSISE_UNARY_3D(hard_sigmoid, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) +//ROUND +ELTSISE_UNARY_3D(round, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_3D(round, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_3D(round, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_3D(round, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_3D(round, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_3D(round, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_3D(round, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_3D(round, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_3D(round, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_3D(round, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) _viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; _viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; @@ -232,4 +248,6 @@ ELTSISE_UNARY_BF16(neg) //MISH ELTSISE_UNARY_BF16(mish) //HARD_SIGMOID -ELTSISE_UNARY_BF16(hard_sigmoid) \ No newline at end of file +ELTSISE_UNARY_BF16(hard_sigmoid) +//ROUND +ELTSISE_UNARY_BF16(round) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/erf.vx b/src/tim/vx/internal/src/libnnext/ops/vx/erf.vx new file mode 100644 index 0000000..9247044 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/erf.vx @@ -0,0 +1,174 @@ +#include "cl_viv_vx_ext.h" + +#define MUL2_RSQRTPI (1.1283791670955126f) +float eltwise_unary_erf(float x) +{ + float res = 0; + float tmp = x; + float factorial = 1; + float x_pow = x; + float one = 1.0f; + float n = 1; + + while (fabs(tmp) 
> 1e-5) + { + res += tmp; + + factorial *= n; + one *= -1; + x_pow *= x * x; + tmp = one / factorial * x_pow / ( 2 * n + 1); + + n += 1.0f; + } + return res * MUL2_RSQRTPI; +} + +_viv_uniform float inputScale; +_viv_uniform float inputTail; +_viv_uniform float outputScale; +_viv_uniform float outputZP; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uniDatatoFp32Part0_4x4; + +#define ELTSISE_UNARY_2D(func_name, src_type_name, dst_type_name, src_type, \ + src_copy_type, convert_type, dst_type, dst_copy_type) \ + __kernel void func_name##_##src_type_name##to##dst_type_name##_2D( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + src_type src0; \ + src_copy_type src1; \ + VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src1, src0, 16); \ + \ + float4 vecA; \ + VXC_DP4x4(vecA, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFp32Part0_4x4); \ + vecA = vecA * inputScale + inputTail; \ + vecA.x = eltwise_unary_##func_name(vecA.x); \ + vecA.y = eltwise_unary_##func_name(vecA.y); \ + vecA.z = eltwise_unary_##func_name(vecA.z); \ + vecA.w = eltwise_unary_##func_name(vecA.w); \ + vecA = vecA * outputScale + outputZP; \ + \ + convert_type dst0; \ + _viv_asm(CONV_RTE, dst0, vecA); \ + dst_type dst2; \ + VXC_DP2x8(dst2, dst0, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + dst_copy_type dst; \ + _viv_asm(COPY, dst, dst2, 16); \ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +//ERF +ELTSISE_UNARY_2D(erf, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_2D(erf, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_2D(erf, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_2D(erf, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_2D(erf, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_2D(erf, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_2D(erf, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_2D(erf, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_2D(erf, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_2D(erf, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) + + +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; + +#define ELTSISE_UNARY_BF16_2D(func_name) \ + __kernel void func_name##_BF16toBF16_2D( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + vxc_ushort8 src0, src1, dst; \ + VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + float4 vecA; \ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, vecA, src1, 16); \ + vecA.x = eltwise_unary_##func_name(vecA.x); \ + vecA.y = eltwise_unary_##func_name(vecA.y); \ + vecA.z = eltwise_unary_##func_name(vecA.z); \ + vecA.w = eltwise_unary_##func_name(vecA.w); \ + \ + _viv_asm(COPY, src0, vecA, 16); \ + \ + VXC_DP2x8(dst, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \ + 
VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +//EXP +ELTSISE_UNARY_BF16_2D(erf) + +#define ELTSISE_UNARY_3D(func_name, src_type_name, dst_type_name, src_type, \ + src_copy_type, convert_type, dst_type, dst_copy_type) \ +__kernel void func_name##_##src_type_name##to##dst_type_name( \ +__read_only image2d_array_t input, \ +__write_only image2d_array_t output \ +) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + src_type src0; \ + src_copy_type src1; \ + VXC_ReadImage2DArray(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src1, src0, 16); \ + \ + float4 vecA; \ + VXC_DP4x4(vecA, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFp32Part0_4x4); \ + vecA = vecA * inputScale + inputTail; \ + vecA.x = eltwise_unary_##func_name(vecA.x); \ + vecA.y = eltwise_unary_##func_name(vecA.y); \ + vecA.z = eltwise_unary_##func_name(vecA.z); \ + vecA.w = eltwise_unary_##func_name(vecA.w); \ + vecA = vecA * outputScale + outputZP; \ + \ + convert_type dst0; \ + _viv_asm(CONV_RTE, dst0, vecA); \ + dst_type dst2; \ + VXC_DP2x8(dst2, dst0, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + dst_copy_type dst; \ + _viv_asm(COPY, dst, dst2, 16); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +//ERF +ELTSISE_UNARY_3D(erf, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_3D(erf, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_3D(erf, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_3D(erf, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_3D(erf, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8) +ELTSISE_UNARY_3D(erf, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_3D(erf, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8) +ELTSISE_UNARY_3D(erf, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8) +ELTSISE_UNARY_3D(erf, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8) +ELTSISE_UNARY_3D(erf, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8) + +#define ELTSISE_UNARY_BF16_3D(func_name) \ + __kernel void func_name##_BF16toBF16( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + vxc_ushort8 src0, src1, dst; \ + VXC_ReadImage2DArray(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + float4 vecA; \ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, vecA, src1, 16); \ + vecA.x = eltwise_unary_##func_name(vecA.x); \ + vecA.y = eltwise_unary_##func_name(vecA.y); \ + vecA.z = eltwise_unary_##func_name(vecA.z); \ + vecA.w = eltwise_unary_##func_name(vecA.w); \ + \ + _viv_asm(COPY, src0, vecA, 16); \ + \ + VXC_DP2x8(dst, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +//ERF +ELTSISE_UNARY_BF16_3D(erf) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_array.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_array.vx new file mode 100644 index 0000000..9ed2876 --- /dev/null +++ 
b/src/tim/vx/internal/src/libnnext/ops/vx/gather_array.vx @@ -0,0 +1,157 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int indices_num; +_viv_uniform VXC_512Bits uniExtraCopyDpKeepinEvis_2x8; + +__kernel void gather_I8toI8_array( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int block_num, + int axis_num + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + int gidz = get_global_id(2); // block_num + + int4 coord_in = (int4)(gidy, 0, gidx, 0); + int4 indice = read_imagei(input1, coord_in.xy); + coord_in.w = gidz * axis_num + indice.x; + + Image img1 = create_image_from_image2d(input0, 1); + Image img2 = create_image_from_image2d(output, 1); + uchar* input_ptr = get_image_ptr_from_coord(img1, coord_in.zw); + __global vxc_char16* data_ptr = (__global vxc_char16*)input_ptr; + vxc_char16 src = data_ptr[0]; + int2 coord = (int2)(gidx, gidz * indices_num + gidy); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord); + __global vxc_char16* dst_ptr = (__global vxc_char16*)output_ptr; + dst_ptr[0] = src; +} + +__kernel void gather_U8toU8_array( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int block_num, + int axis_num + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + int gidz = get_global_id(2); // block_num + + int4 coord_in = (int4)(gidy, 0, gidx, 0); + int4 indice = read_imagei(input1, coord_in.xy); + coord_in.w = gidz * axis_num + indice.x; + + Image img1 = create_image_from_image2d(input0, 1); + Image img2 = create_image_from_image2d(output, 1); + uchar* input_ptr = get_image_ptr_from_coord(img1, coord_in.zw); + __global vxc_uchar16* data_ptr = (__global vxc_uchar16*)input_ptr; + vxc_uchar16 src = data_ptr[0]; + int2 coord = (int2)(gidx, gidz * indices_num + gidy); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord); + __global vxc_uchar16* dst_ptr = (__global vxc_uchar16*)output_ptr; + dst_ptr[0] = src; +} + +__kernel void gather_I16toI16_array( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int block_num, + int axis_num + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + int gidz = get_global_id(2); // block_num + + int4 coord_in = (int4)(gidy, 0, gidx, 0); + + + int4 indice = read_imagei(input1, coord_in.xy); + coord_in.w = gidz * axis_num + indice.x; + + Image img1 = create_image_from_image2d(input0, 2); + Image img2 = create_image_from_image2d(output, 2); + uchar* input_ptr = get_image_ptr_from_coord(img1, coord_in.zw); + __global vxc_short8* data_ptr = (__global vxc_short8*)input_ptr; + vxc_short8 src = data_ptr[0]; + int2 coord = (int2)(gidx, gidz * indices_num + gidy); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord); + __global vxc_short8* dst_ptr = (__global vxc_short8*)output_ptr; + dst_ptr[0] = src; +} + +__kernel void gather_F16toF16_array( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int block_num, + int axis_num + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + int gidz = get_global_id(2); // block_num + + int4 coord_in = (int4)(gidy, 0, gidx, 0); + + int4 indice = read_imagei(input1, coord_in.xy); + coord_in.w = gidz * axis_num + indice.x; + + Image img1 = 
create_image_from_image2d(input0, 2); + Image img2 = create_image_from_image2d(output, 2); + uchar* input_ptr = get_image_ptr_from_coord(img1, coord_in.zw); + __global vxc_short8* data_ptr = (__global vxc_short8*)input_ptr; + vxc_short8 src = data_ptr[0]; + int2 coord = (int2)(gidx, gidz * indices_num + gidy); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord); + __global vxc_short8* dst_ptr = (__global vxc_short8*)output_ptr; + dst_ptr[0] = src; +} + +#define GATHER_AXIS0_ARRAY(src0_type_name, read_type, data_type, write_type) \ +__kernel void gather_##src0_type_name##to##src0_type_name##_axis0_array( \ + __read_only image2d_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_t output, \ + int block_size, \ + int block_num, \ + int axis_num \ + ) \ +{ \ + Image img0 = create_image_from_image2d(input0, 1); \ + Image img1 = create_image_from_image2d(input1, 4); \ + Image img2 = create_image_from_image2d(output, 1); \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \ + uchar* index_ptr = get_image_ptr_from_coord(img1, coord.xz); \ + __global int* index = (__global int*)index_ptr; \ + int4 indices = vload4(0, index); \ + \ + read_type src, dst; \ + \ + uchar* input_ptr = get_image_ptr_from_coord(img0, coord.zy); \ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.xy); \ + __global data_type* data_ptr = (__global data_type*)input_ptr; \ + __global write_type* out_ptr = (__global write_type*)output_ptr; \ + src.s0 = data_ptr[indices.x]; \ + src.s1 = data_ptr[indices.y]; \ + src.s2 = data_ptr[indices.z]; \ + src.s3 = data_ptr[indices.w]; \ + \ + VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniExtraCopyDpKeepinEvis_2x8); \ + out_ptr[0] = dst.s0123; \ +} +GATHER_AXIS0_ARRAY(U8, vxc_uchar16, uchar, vxc_uchar4) +GATHER_AXIS0_ARRAY(I8, vxc_char16, char, vxc_char4) +GATHER_AXIS0_ARRAY(I16, vxc_short8, short, vxc_short4) +GATHER_AXIS0_ARRAY(F16, vxc_short8, short, vxc_short4) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd.vx index a526d21..f6aa7c7 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd.vx @@ -12,7 +12,10 @@ __kernel void gather_nd_I8toI8_1D( int gidy = get_global_id(1); // indices_num int4 coord = (int4)(0, gidy, gidx, 0); - int4 indice = read_imagei(input1, coord.xy); + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); + int4 indice = ((int4 *)indice_ptr)[0]; + coord.w = indice.x; vxc_char16 src; @@ -33,7 +36,10 @@ __kernel void gather_nd_U8toU8_1D( int gidy = get_global_id(1); // indices_num int4 coord = (int4)(0, gidy, gidx, 0); - int4 indice = read_imagei(input1, coord.xy); + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); + int4 indice = ((int4 *)indice_ptr)[0]; + coord.w = indice.x; vxc_uchar16 src; @@ -53,7 +59,10 @@ __kernel void gather_nd_I16toI16_1D( int gidy = get_global_id(1); // indices_num int4 coord = (int4)(0, gidy, gidx, 0); - int4 indice = read_imagei(input1, coord.xy); + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); + int4 indice = ((int4 *)indice_ptr)[0]; + coord.w = indice.x; vxc_short8 src; @@ -73,7 +82,10 @@ __kernel void gather_nd_F16toF16_1D( int gidy = get_global_id(1); // indices_num int4 coord = (int4)(0, gidy, gidx, 0); - 
int4 indice = read_imagei(input1, coord.xy); + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); + int4 indice = ((int4 *)indice_ptr)[0]; + coord.w = indice.x; vxc_short8 src; diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_2d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_2d.vx index 6b3d90a..74c1a22 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_2d.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_2d.vx @@ -12,7 +12,10 @@ __kernel void gather_nd_I8toI8_2D( int gidy = get_global_id(1); // indices_num int4 coord = (int4)(0, gidy, gidx, 0); - int4 indice = read_imagei(input1, coord.xy); + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); + int4 indice = ((int4 *)indice_ptr)[0]; + indice.x = indice.x * block_size + gidx; vxc_char16 src; @@ -33,7 +36,10 @@ __kernel void gather_nd_U8toU8_2D( int gidy = get_global_id(1); // indices_num int4 coord = (int4)(0, gidy, gidx, 0); - int4 indice = read_imagei(input1, coord.xy); + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); + int4 indice = ((int4 *)indice_ptr)[0]; + indice.x = indice.x * block_size + gidx; vxc_uchar16 src; @@ -53,7 +59,10 @@ __kernel void gather_nd_I16toI16_2D( int gidy = get_global_id(1); // indices_num int4 coord = (int4)(0, gidy, gidx, 0); - int4 indice = read_imagei(input1, coord.xy); + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); + int4 indice = ((int4 *)indice_ptr)[0]; + indice.x = indice.x * block_size + gidx; vxc_short8 src; @@ -73,7 +82,10 @@ __kernel void gather_nd_F16toF16_2D( int gidy = get_global_id(1); // indices_num int4 coord = (int4)(0, gidy, gidx, 0); - int4 indice = read_imagei(input1, coord.xy); + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); + int4 indice = ((int4 *)indice_ptr)[0]; + indice.x = indice.x * block_size + gidx; vxc_short8 src; diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_2d_mix.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_2d_mix.vx index 6b0be59..e45482c 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_2d_mix.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_2d_mix.vx @@ -26,7 +26,10 @@ __kernel void gather_nd_##src0_type_name##toF16_2D( \ int gidy = get_global_id(1); \ \ int4 coord = (int4)(0, gidy, gidx, 0); \ - int4 indice = read_imagei(input1, coord.xy); \ + Image img = create_image_from_image2d(input1, 4); \ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \ + int4 indice = ((int4 *)indice_ptr)[0]; \ + \ indice.x = indice.x * block_size + gidx; \ \ read_type src; \ @@ -57,7 +60,10 @@ __kernel void gather_nd_F16to##src1_type_name##_2D( \ int gidy = get_global_id(1); \ \ int4 coord = (int4)(0, gidy, gidx, 0); \ - int4 indice = read_imagei(input1, coord.xy); \ + Image img = create_image_from_image2d(input1, 4); \ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \ + int4 indice = ((int4 *)indice_ptr)[0]; \ + \ indice.x = indice.x * block_size + gidx; \ \ vxc_short8 src; \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_3d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_3d.vx index 2aa9d4c..566aaa5 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_3d.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_3d.vx @@ 
-12,7 +12,10 @@ __kernel void gather_nd_I8toI8_3D( int gidy = get_global_id(1); // indices_num int4 coord = (int4)(0, gidy, gidx, 0); - int4 indice = read_imagei(input1, coord.xy); + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); + int4 indice = ((int4 *)indice_ptr)[0]; + indice.x = indice.x * block_size + gidx; indice.w = 0; @@ -34,7 +37,11 @@ __kernel void gather_nd_U8toU8_3D( int gidy = get_global_id(1); // indices_num int4 coord = (int4)(0, gidy, gidx, 0); - int4 indice = read_imagei(input1, coord.xy); + + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); + int4 indice = ((int4 *)indice_ptr)[0]; + indice.x = indice.x * block_size + gidx; indice.w = 0; @@ -55,7 +62,10 @@ __kernel void gather_nd_I16toI16_3D( int gidy = get_global_id(1); // indices_num int4 coord = (int4)(0, gidy, gidx, 0); - int4 indice = read_imagei(input1, coord.xy); + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); + int4 indice = ((int4 *)indice_ptr)[0]; + indice.x = indice.x * block_size + gidx; indice.w = 0; @@ -76,7 +86,10 @@ __kernel void gather_nd_F16toF16_3D( int gidy = get_global_id(1); // indices_num int4 coord = (int4)(0, gidy, gidx, 0); - int4 indice = read_imagei(input1, coord.xy); + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); + int4 indice = ((int4 *)indice_ptr)[0]; + indice.x = indice.x * block_size + gidx; indice.w = 0; @@ -84,3 +97,4 @@ __kernel void gather_nd_F16toF16_3D( VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); } + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_3d_mix.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_3d_mix.vx index 3d92bef..e9ca9ec 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_3d_mix.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_3d_mix.vx @@ -23,7 +23,10 @@ __kernel void gather_nd_##src0_type_name##toF16_3D( \ int gidy = get_global_id(1); \ \ int4 coord = (int4)(0, gidy, gidx, 0); \ - int4 indice = read_imagei(input1, coord.xy); \ + Image img = create_image_from_image2d(input1, 4); \ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \ + int4 indice = ((int4 *)indice_ptr)[0]; \ + \ indice.x = indice.x * block_size + gidx; \ indice.w = 0; \ \ @@ -55,7 +58,10 @@ __kernel void gather_nd_F16to##src1_type_name##_3D( \ int gidy = get_global_id(1); \ \ int4 coord = (int4)(0, gidy, gidx, 0); \ - int4 indice = read_imagei(input1, coord.xy); \ + Image img = create_image_from_image2d(input1, 4); \ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \ + int4 indice = ((int4 *)indice_ptr)[0]; \ + \ indice.x = indice.x * block_size + gidx; \ indice.w = 0; \ \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_mix.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_mix.vx index 770498b..8288ab0 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_mix.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_mix.vx @@ -26,7 +26,10 @@ __kernel void gather_nd_##src0_type_name##toF16_1D( \ int gidy = get_global_id(1); \ \ int4 coord = (int4)(0, gidy, gidx, 0); \ - int4 indice = read_imagei(input1, coord.xy); \ + Image img = create_image_from_image2d(input1, 4); \ + uchar* indice_ptr = get_image_ptr_from_coord(img, 
coord.xy); \ + int4 indice = ((int4 *)indice_ptr)[0]; \ + \ coord.w = indice.x; \ \ read_type src; \ @@ -57,7 +60,10 @@ __kernel void gather_nd_F16to##src1_type_name##_1D( \ int gidy = get_global_id(1); \ \ int4 coord = (int4)(0, gidy, gidx, 0); \ - int4 indice = read_imagei(input1, coord.xy); \ + Image img = create_image_from_image2d(input1, 4); \ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \ + int4 indice = ((int4 *)indice_ptr)[0]; \ + \ coord.w = indice.x; \ \ vxc_short8 src; \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_f16.vx new file mode 100644 index 0000000..161383d --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_f16.vx @@ -0,0 +1,306 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int width; +_viv_uniform int height; +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2; +_viv_uniform VXC_512Bits uniConvertEndInt16Fp32_4x4; + +_viv_uniform float outputScale; +_viv_uniform int output_ZP; + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_F16( + image2d_array_t input, image2d_array_t output, float eps, int is2D) +{ + int gidx = get_global_id(0) << 3; + int lidx = get_local_id(0); + int gidz = get_global_id(1); + int4 coord = (int4)(gidx, 0, gidz, 0); + vxc_short8 src0; + vxc_half8 in_h; + vxc_float4 sumsqr; + vxc_float4 tmpSumSqr = (vxc_float4)(0); + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + + if(gidx < width) + { + for(coord.y = 0; coord.y < height;) + { + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.y++; + _viv_asm(COPY, in_h, src0, 16); + VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniFp16SumSqr_dp8x2); + tmpSumSqr += sumsqr; + } + } + + lcl_sum[lidx] = tmpSumSqr.x; + lcl_sqr[lidx] = tmpSumSqr.y; + barrier(CLK_LOCAL_MEM_FENCE); + + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + float sum = 0; + float sqr = 0; + for(int i = 0; i < 4; i++) + { + //sum += lcl_sum[i]; + //sqr += lcl_sqr[i]; + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 data = (float4)(sum, sqr, 0, 0); + write_imagef(output, coord_out, data); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_F16_2D( + image2d_array_t input, image2d_array_t output, float eps, int is2D) +{ + int gidx = get_global_id(0) << 3; + int lidx = get_local_id(0); + + int2 coord = (int2)(gidx, get_global_id(1)); + vxc_short8 src0; + vxc_half8 in_h; + vxc_float4 sumsqr = (vxc_float4)(0); + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + if(gidx < width) + { + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, in_h, src0, 16); + VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniFp16SumSqr_dp8x2); + } + + lcl_sum[lidx] = sumsqr.x; + lcl_sqr[lidx] = 
sumsqr.y; + barrier(CLK_LOCAL_MEM_FENCE); + + int4 coord_out = (int4)(get_group_id(0) << 2, get_global_id(1), 0, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + float sum = 0; + float sqr = 0; + for(int i = 0; i < 4; i++) + { + //sum += lcl_sum[i]; + //sqr += lcl_sqr[i]; + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 data = (float4)(sum, sqr, 0, 0); + write_imagef(output, coord_out, data); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16toF16( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output, + float eps, int is2D, float rSpaceOrg, int pStride) +{ + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); + vxc_short8 src0; + vxc_short8 src1; + vxc_half8 scale_h, in_h; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + bias_f = read_imagef(bias, coord_para.xy); + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_float4 tmpData0, tmpData1; + vxc_short8 outval; + half4 tmpVal0, tmpVal1; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); + vxc_half8 dst; + + _viv_asm(COPY, in_h, src0, 16); + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertEndInt16Fp32_4x4); + + vxc_float4 norm; + norm = scale_vari * tmpData0 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = scale_vari * tmpData1 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16toF16_2D( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output, + float eps, int is2D, float rSpaceOrg, int pStride) +{ + int gidz = get_global_id(1); + int2 coord = (int2)(get_global_id(0), gidz); + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); + vxc_short8 src0; + vxc_short8 src1; + vxc_half8 scale_h, in_h; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + bias_f = read_imagef(bias, coord_para.xy); + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); + + scale_vari = scale_f.s0 * 
mean_vari.s1; + vxc_float4 tmpData0, tmpData1; + vxc_short8 outval; + half4 tmpVal0, tmpVal1; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); + vxc_half8 dst; + + _viv_asm(COPY, in_h, src0, 16); + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertEndInt16Fp32_4x4); + vxc_float4 norm; + norm = scale_vari * tmpData0 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = scale_vari * tmpData1 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16toU8( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output, + float eps, int is2D, float rSpaceOrg, int pStride) +{ + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); + vxc_short8 src0; + vxc_short8 src1; + vxc_half8 scale_h, in_h; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + bias_f = read_imagef(bias, coord_para.xy); + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_float4 tmpData0, tmpData1; + vxc_uchar16 outval; + vxc_int4 tmpVal0, tmpVal1; + float alpha = outputScale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP; + + _viv_asm(COPY, in_h, src0, 16); + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertEndInt16Fp32_4x4); + + vxc_float4 norm; + norm = alpha * tmpData0 + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = alpha * tmpData1 + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16toU8_2D( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output, + float eps, int is2D, float rSpaceOrg, int pStride) +{ + int gidz = get_global_id(1); + int2 coord = (int2)(get_global_id(0), gidz); + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); + vxc_short8 src0; + vxc_short8 src1; + vxc_half8 scale_h, in_h; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + bias_f = read_imagef(bias, coord_para.xy); + 
VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_float4 tmpData0, tmpData1; + vxc_uchar16 outval; + vxc_int4 tmpVal0, tmpVal1; + float alpha = outputScale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP; + + _viv_asm(COPY, in_h, src0, 16); + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertEndInt16Fp32_4x4); + vxc_float4 norm; + norm = alpha * tmpData0 + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = alpha * tmpData1 + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_i16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_i16.vx new file mode 100644 index 0000000..1282e00 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_i16.vx @@ -0,0 +1,339 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int width; +_viv_uniform int height; +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; + +_viv_uniform float inFlScale_s2; +_viv_uniform float input_fl_scale; +_viv_uniform float inOut_fl_scale; +_viv_uniform float output_fl_scale; + +_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2; +_viv_uniform VXC_512Bits uniConvertInt16Fp32Fst_4x4; +_viv_uniform VXC_512Bits uniConvertInt16Fp32Secd_4x4; +_viv_uniform VXC_512Bits uniConvertInt32toInt16_2x8; + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_I16( + image2d_array_t input, + image2d_array_t output, + float eps, + int is2D) +{ + int gidx = get_global_id(0) << 3; + int lidx = get_local_id(0); + int gidz = get_global_id(1); + int4 coord = (int4)(gidx, 0, gidz, 0); + vxc_short8 src0; + float sum = 0, sqr = 0; + vxc_float4 sumsqr = (vxc_float4)(0); + vxc_float4 tmpSumSqr = (vxc_float4)(0); + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + if(gidx < width) + { + for(coord.y = 0; coord.y < height;) + { + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.y++; + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniInt16SumSqr_dp8x2); + //tmpSumSqr += sumsqr; + tmpSumSqr.x += sumsqr.x; + sqr += (sumsqr.y * inFlScale_s2); + } + sum = tmpSumSqr.x * input_fl_scale; + //sqr = tmpSumSqr.y * inFlScale_s2; + } + + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + { + //sum += lcl_sum[i]; + //sqr += 
lcl_sqr[i]; + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 data = (float4)(sum, sqr, 0, 0); + write_imagef(output, coord_out, data); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_I16_2D( + image2d_array_t input, + image2d_array_t output, + float eps, + int is2D) +{ + int gidx = get_global_id(0) << 3; + int lidx = get_local_id(0); + int gidz = get_global_id(1); + + int2 coord = (int2)(gidx, gidz); + vxc_short8 src0; + float sum = 0, sqr = 0; + vxc_float4 sumsqr = (vxc_float4)(0); + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + if(gidx < width) + { + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniInt16SumSqr_dp8x2); + sqr = sumsqr.y * inFlScale_s2; + sum = sumsqr.x * input_fl_scale; + //sqr = tmpSumSqr.y * inFlScale_s2; + } + + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + { + //sum += lcl_sum[i]; + //sqr += lcl_sqr[i]; + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 data = (float4)(sum, sqr, 0, 0); + write_imagef(output, coord_out, data); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16toF16( + image2d_array_t input, + image2d_t bias, + image2d_t scale, + image2d_t meanVari, + image2d_array_t output, + float eps, + int is2D, + float rSpaceOrg, int pStride) +{ + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); + vxc_short8 src0; + vxc_short8 src1; + vxc_half8 scale_h; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + bias_f = read_imagef(bias, coord_para.xy); + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_float4 tmpData0, tmpData1; + vxc_short8 outval; + half4 tmpVal0, tmpVal1; + float alpha = input_fl_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); + vxc_half8 dst; + + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16Fp32Fst_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16Fp32Secd_4x4); + + vxc_float4 norm; + norm = alpha * tmpData0 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData1 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) 
void group_norm_I16toF16_2D( + image2d_array_t input, + image2d_t bias, + image2d_t scale, + image2d_t meanVari, + image2d_array_t output, + float eps, + int is2D, + float rSpaceOrg, int pStride) +{ + int gidz = get_global_id(1); + int2 coord = (int2)(get_global_id(0), gidz); + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); + vxc_short8 src0; + vxc_short8 src1; + vxc_half8 scale_h; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + bias_f = read_imagef(bias, coord_para.xy); + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_float4 tmpData0, tmpData1; + vxc_short8 outval; + half4 tmpVal0, tmpVal1; + float alpha = input_fl_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); + vxc_half8 dst; + + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16Fp32Fst_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16Fp32Secd_4x4); + vxc_float4 norm; + norm = alpha * tmpData0 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData1 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16toI16( + image2d_array_t input, + image2d_t bias, + image2d_t scale, + image2d_t meanVari, + image2d_array_t output, + float eps, + int is2D, + float rSpaceOrg, int pStride) +{ + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); + vxc_short8 src0, src2; + vxc_short8 src1; + vxc_half8 scale_h; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + bias_f = read_imagef(bias, coord_para.xy); + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_int4 tmpVal0, tmpVal1; + vxc_float4 tmpData0, tmpData1; + float alpha = inOut_fl_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale; + + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16Fp32Fst_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16Fp32Secd_4x4); + vxc_float4 norm; + norm = tmpData0 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData1 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + 
VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toInt16_2x8); + VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16toI16_2D( + image2d_array_t input, + image2d_t bias, + image2d_t scale, + image2d_t meanVari, + image2d_array_t output, + float eps, + int is2D, + float rSpaceOrg, int pStride) +{ + int gidz = get_global_id(1); + int2 coord = (int2)(get_global_id(0), gidz); + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); + vxc_short8 src0, src2; + vxc_short8 src1; + vxc_half8 scale_h; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + bias_f = read_imagef(bias, coord_para.xy); + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_int4 tmpVal0, tmpVal1; + vxc_float4 tmpData0, tmpData1; + float alpha = inOut_fl_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale; + + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16Fp32Fst_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16Fp32Secd_4x4); + vxc_float4 norm; + norm = tmpData0 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData1 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toInt16_2x8); + VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_i8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_i8.vx new file mode 100644 index 0000000..6a407a3 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_i8.vx @@ -0,0 +1,317 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int width; +_viv_uniform int height; +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform VXC_512Bits uniSumInt8_16x1; +_viv_uniform VXC_512Bits uniSqrSumInt8_16x1; +_viv_uniform float inFlScale_s2; +_viv_uniform float input_fl_scale; + +_viv_uniform VXC_512Bits uniConvertDirInt8Fp32_4x4; +_viv_uniform VXC_512Bits uniConvertEndInt8Fp32_4x4; +_viv_uniform VXC_512Bits uniConvertTrdInt8Fp32_4x4; +_viv_uniform VXC_512Bits uniConvertFthInt8Fp32_4x4; + +_viv_uniform float inOut_fl_scale; +_viv_uniform float output_fl_scale; + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_I8( + image2d_array_t input, image2d_array_t output, float eps, int is2D) +{ + int gidx = get_global_id(0) << 4; + int lidx = get_local_id(0); + int gidz = get_global_id(1); + int4 coord = (int4)(gidx, 0, gidz, 0); + vxc_char16 src0; + float sum = 0, sqr = 0; + int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + int8 input_desc; + _viv_asm(COPY, 
input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + + if(gidx < width) + { + for(coord.y = 0; coord.y < height;) + { + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord.y++; + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumInt8_16x1); + tmpSum += (tmpSum1); + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSumInt8_16x1); + tmpSqr += (tmpSqr1); + } + sqr = tmpSqr * inFlScale_s2; + sum = tmpSum * input_fl_scale; + } + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 data = (float4)(sum, sqr, 0, 0); + write_imagef(output, coord_out, data); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_I8_2D( + image2d_array_t input, image2d_array_t output, float eps, int is2D) +{ + int gidx = get_global_id(0) << 4; + int lidx = get_local_id(0); + int gidz = get_global_id(1); + + int2 coord = (int2)(gidx, gidz); + vxc_char16 src0; + float sum = 0, sqr = 0; + int tmpSum1, tmpSqr1; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + if(gidx < width) + { + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumInt8_16x1); + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSumInt8_16x1); + sqr = tmpSqr1 * inFlScale_s2; + sum = tmpSum1 * input_fl_scale; + } + + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 data = (float4)(sum, sqr, 0, 0); + write_imagef(output, coord_out, data); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8toF16( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) +{ + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); + vxc_char16 src0; + vxc_short8 src1, outval; + vxc_half8 scale_h, dst; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + bias_f = read_imagef(bias, coord_para.xy); + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f, scale_h, scale_h, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; + half4 tmpVal0, tmpVal1; + float alpha = input_fl_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); + + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4); + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4); + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4); + + norm = alpha * tmpData0 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData1 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord.x += 8; + norm = alpha * tmpData2 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData3 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8toF16_2D( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) +{ + int gidz = get_global_id(1); + int2 coord = (int2)(get_global_id(0), gidz); + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); + vxc_char16 src0; + vxc_short8 src1, outval; + vxc_half8 scale_h, dst; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + bias_f = read_imagef(bias, coord_para.xy); + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; + half4 tmpVal0, tmpVal1; + float alpha = input_fl_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); + + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4); + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4); + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4); + norm = alpha * tmpData0 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData1 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 0)); + coord.x += 8; + norm = alpha * tmpData2 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData3 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8toI8( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) +{ + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); + vxc_char16 src0, src2; + vxc_short8 src1; + vxc_half8 scale_h; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + bias_f = read_imagef(bias, coord_para.xy); + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_int4 tmpVal0, tmpVal1; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; + float alpha = inOut_fl_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale; + + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4); + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4); + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4); + norm = tmpData0 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData1 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + norm = tmpData2 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData3 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8toI8_2D( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) +{ + int gidz = get_global_id(1); + int2 coord = (int2)(get_global_id(0), gidz); + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); + vxc_char16 src0, src2; + vxc_short8 src1; + vxc_half8 scale_h; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + bias_f = read_imagef(bias, coord_para.xy); + 
VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_int4 tmpVal0, tmpVal1; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; + float alpha = inOut_fl_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale; + + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4); + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4); + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4); + norm = tmpData0 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData1 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + norm = tmpData2 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData3 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_u8.vx new file mode 100644 index 0000000..af20584 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_u8.vx @@ -0,0 +1,261 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int width; +_viv_uniform int height; +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniSumU8_16x1; +_viv_uniform VXC_512Bits uniSqrSum_16x1; +_viv_uniform float input_scale; +_viv_uniform int inputZP; +_viv_uniform int sumInZp; +_viv_uniform int tmpZp1; +_viv_uniform float e2InScale; +_viv_uniform float rowSumScale; +_viv_uniform float scale_inOut; +_viv_uniform float outputScale; +_viv_uniform int output_ZP; + +_viv_uniform VXC_512Bits uniResetFp32_4x4; +_viv_uniform int group_stride; + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_U8( + image2d_array_t input, image2d_array_t output, float eps, int is2D) +{ + int gidx = get_global_id(0) << 4; + int lidx = get_local_id(0); + int gidz = get_global_id(1); + int4 coord = (int4)(gidx, 0, gidz, 0); + vxc_uchar16 src0; + float sum = 0, sqr = 0; + int tmpSum = 0, tmpSqr = 0, tmpSum1 = 0, tmpSqr1 = 0; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + if(gidx < width) + { + for(coord.y = 0; coord.y < height;) + { + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord.y++; + 
VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); + tmpSum += (tmpSum1); + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); + tmpSqr += (tmpSqr1 + tmpZp1 * tmpSum1); + } + sqr += (tmpSqr * e2InScale + rowSumScale); + sum = (tmpSum + sumInZp) * input_scale; + } + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + float4 data = (float4)(sum, sqr, 0, 0); + write_imagef(output, coord_out, data); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_U8_2D( + image2d_array_t input, image2d_array_t output, float eps, int is2D) +{ + int gidx = get_global_id(0) << 4; + int lidx = get_local_id(0); + + int2 coord = (int2)(gidx, get_global_id(1)); + vxc_uchar16 src0; + float sum = 0, sqr = 0; + int tmpSqr, tmpSum1, tmpSqr1; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + if(gidx < width) + { + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); + tmpSqr = tmpSqr1 + tmpZp1 * tmpSum1; + sqr = (tmpSqr * e2InScale + rowSumScale); + sum = (tmpSum1 + sumInZp) * input_scale; + } + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int4 coord_out = (int4)(get_group_id(0) << 2, get_global_id(1), 0, 0); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + float4 data = (float4)(sum, sqr, 0, 0); + write_imagef(output, coord_out, data); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_meanvari( + image2d_t input, image2d_t output, float eps, float group_ratio) +{ + int gidx = get_global_id(0); + int lidx = get_local_id(0); + + int2 coord = (int2)(gidx, get_global_id(1)); + vxc_uchar16 src0; + float2 sum_sqr = (float2)(0); + vxc_float4 mean_vari; + VXC_DP4x4(mean_vari, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniResetFp32_4x4); + + __local float2 lcl_data[16]; + __local float2 lcl_sum[4]; + + for(; coord.x < group_stride; coord.x += 64) + { + mean_vari += read_imagef(input, coord); + } + lcl_data[lidx] = mean_vari.xy; + barrier(CLK_LOCAL_MEM_FENCE); + if(lidx < 4) + { + float2 tmpSum = (float2)(0); + for(int i = lidx; i < 16; i+=4) + { + tmpSum += lcl_data[i]; + } + lcl_sum[lidx] = tmpSum; + } + barrier(CLK_LOCAL_MEM_FENCE); + if(lidx == 0) + { + for(int i = 0; i < 4; i++) + { + sum_sqr += lcl_sum[i]; + } + mean_vari.xy = sum_sqr * group_ratio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + coord.x = 0; + write_imagef(output, coord, mean_vari); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8toU8( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t 
meanVari, + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) +{ + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); + vxc_uchar16 src0, src2; + vxc_short8 src1; + vxc_half8 scale_h; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + bias_f = read_imagef(bias, coord_para.xy); + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); + + scale_vari = scale_f.s0 * mean_vari.s1; + short zp = inputZP; + vxc_int4 tmpVal0, tmpVal1; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; + float alpha = scale_inOut * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP; + + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4); + norm = tmpData0 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData1 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + norm = tmpData2 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData3 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8toU8_2D( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) +{ + int gidz = get_global_id(1); + int2 coord = (int2)(get_global_id(0), gidz); + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); + vxc_uchar16 src0, src2; + vxc_short8 src1; + vxc_half8 scale_h; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + bias_f = read_imagef(bias, coord_para.xy); + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); + + scale_vari = scale_f.s0 * mean_vari.s1; + short zp = inputZP; + vxc_int4 tmpVal0, tmpVal1; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; + float alpha = scale_inOut * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + 
output_ZP; + + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4); + norm = tmpData0 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData1 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + norm = tmpData2 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData3 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_u8_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_u8_f16.vx new file mode 100644 index 0000000..3c1b892 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_u8_f16.vx @@ -0,0 +1,114 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int width; +_viv_uniform int height; +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; + +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4; +_viv_uniform float input_scale; +_viv_uniform int inputZP; + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8toF16( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) +{ + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); + vxc_uchar16 src0; + vxc_short8 src1, outval; + vxc_half8 scale_h, dst; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + bias_f = read_imagef(bias, coord_para.xy); + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); + + scale_vari = scale_f.s0 * mean_vari.s1; + short zp = inputZP; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; + half4 tmpVal0, tmpVal1; + float alpha = input_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); + + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniConvert3rdUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4); + norm = alpha * tmpData0 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData1 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord.x += 8; + norm = alpha * tmpData2 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData3 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8toF16_2D( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) +{ + int gidz = get_global_id(1); + int2 coord = (int2)(get_global_id(0), gidz); + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); + vxc_uchar16 src0; + vxc_short8 src1, outval; + vxc_half8 scale_h, dst; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + bias_f = read_imagef(bias, coord_para.xy); + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); + + scale_vari = scale_f.s0 * mean_vari.s1; + short zp = inputZP; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; + half4 tmpVal0, tmpVal1; + float alpha = input_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); + + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4); + norm = alpha * tmpData0 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData1 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.x += 8; + norm = alpha * tmpData2 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData3 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_f16.vx 
b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_f16.vx index c942079..ed18f67 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_f16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_f16.vx @@ -11,10 +11,7 @@ _viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2; _viv_uniform VXC_512Bits uniConvertEndInt16Fp32_4x4; __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_F16( - image2d_array_t input, - image2d_array_t output, - float eps, - int rsFlg) + image2d_array_t input, image2d_array_t output, float eps, int rsFlg) { int gidx = get_global_id(0) << 3; int lidx = get_local_id(0); @@ -28,12 +25,17 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean __local float lcl_sum[16]; __local float lcl_sqr[16]; + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + if(gidx < width) { for(coord.y = 0; coord.y < height;) { - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); coord.y++; _viv_asm(COPY, in_h, src0, 16); VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ @@ -69,10 +71,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean } __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_F16_2D( - image2d_array_t input, - image2d_array_t output, - float eps, - int rsFlg) + image2d_array_t input, image2d_array_t output, float eps, int rsFlg) { int gidx = get_global_id(0) << 3; int lidx = get_local_id(0); @@ -130,13 +129,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean } __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16toF16( - image2d_array_t input, - image2d_array_t bias, - image2d_array_t scale, - image2d_array_t meanVari, - image2d_array_t output, - float eps, - int rsFlg) + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int rsFlg) { int gidz = get_global_id(1); int4 coord = (int4)(get_global_id(0), 0, gidz, 0); @@ -153,12 +147,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16t VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); bias_f = read_imagef(bias, coord_para); - coord_para.x = 0; - coord_para.y = gidz; for(int i = 0; i < group_num; i++) { - mean_vari += read_imagef(meanVari, coord_para); - coord_para.x += 4; + mean_vari += read_imagef(meanVari, coord_para.yx); + coord_para.y += 4; } mean_vari *= dimRatio; mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; @@ -171,10 +163,19 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16t bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); vxc_half8 dst; + int8 input_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord.w, baseAddr); + for(coord.y = 0; coord.y < height; 
coord.y++) { - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); _viv_asm(COPY, in_h, src0, 16); VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ @@ -190,18 +191,14 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16t VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ uniConvertHalfToFp16_2x8); _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, outval, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); } } __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16toF16_2D( - image2d_array_t input, - image2d_array_t bias, - image2d_array_t scale, - image2d_array_t meanVari, - image2d_array_t output, - float eps, - int rsFlg) + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int rsFlg) { int gidz = get_global_id(1); int gidy = gidz * height; @@ -220,12 +217,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16t VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); bias_f = read_imagef(bias, coord_para); - coord_para.x = 0; - coord_para.y = gidz; for(int i = 0; i < group_num; i++) { - mean_vari += read_imagef(meanVari, coord_para); - coord_para.x += 4; + mean_vari += read_imagef(meanVari, coord_para.yx); + coord_para.y += 4; } mean_vari *= dimRatio; mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_i16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_i16.vx index cedc0a2..523bb38 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_i16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_i16.vx @@ -35,12 +35,16 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean __local float lcl_sum[16]; __local float lcl_sqr[16]; + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); if(gidx < width) { for(coord.y = 0; coord.y < height;) { - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); coord.y++; VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ uniInt16SumSqr_dp8x2); @@ -144,7 +148,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t image2d_array_t input, image2d_array_t bias, image2d_array_t scale, - image2d_array_t meanVari, + image2d_t meanVari, image2d_array_t output, float eps, int rsFlg) @@ -166,12 +170,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t bias_f = read_imagef(bias, coord_para); - coord_para.x = 0; - coord_para.y = gidz; for(int i = 0; i < group_num; i++) { - mean_vari += read_imagef(meanVari, coord_para); - coord_para.x += 4; + mean_vari += read_imagef(meanVari, coord_para.yx); + coord_para.y += 4; } mean_vari *= dimRatio; 
mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; @@ -185,11 +187,19 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); vxc_half8 dst; + int8 input_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord.w, baseAddr); + for(coord.y = 0; coord.y < height; coord.y++) { - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ uniConvertInt16Fp32Fst_4x4); VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ @@ -203,7 +213,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ uniConvertHalfToFp16_2x8); _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, outval, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); } } @@ -211,7 +222,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t image2d_array_t input, image2d_array_t bias, image2d_array_t scale, - image2d_array_t meanVari, + image2d_t meanVari, image2d_array_t output, float eps, int rsFlg) @@ -235,12 +246,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t bias_f = read_imagef(bias, coord_para); - coord_para.x = 0; - coord_para.y = gidz; for(int i = 0; i < group_num; i++) { - mean_vari += read_imagef(meanVari, coord_para); - coord_para.x += 4; + mean_vari += read_imagef(meanVari, coord_para.yx); + coord_para.y += 4; } mean_vari *= dimRatio; mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; @@ -279,7 +288,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t image2d_array_t input, image2d_array_t bias, image2d_array_t scale, - image2d_array_t meanVari, + image2d_t meanVari, image2d_array_t output, float eps, int rsFlg) @@ -299,12 +308,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); bias_f = read_imagef(bias, coord_para); - coord_para.x = 0; - coord_para.y = gidz; for(int i = 0; i < group_num; i++) { - mean_vari += read_imagef(meanVari, coord_para); - coord_para.x += 4; + mean_vari += read_imagef(meanVari, coord_para.yx); + coord_para.y += 4; } mean_vari *= dimRatio; mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; @@ -316,10 +323,18 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t float alpha = inOut_fl_scale * scale_vari; bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale; + int8 input_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + + _viv_asm(COPY, output_desc, output, 
sizeof(output_desc)); + int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord.w, baseAddr); for(coord.y = 0; coord.y < height; coord.y++) { - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ uniConvertInt16Fp32Fst_4x4); VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ @@ -331,7 +346,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t tmpVal1 = convert_int4_rte(norm); VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ uniConvertInt32toInt16_2x8); - VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, src2, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); } } @@ -339,7 +355,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t image2d_array_t input, image2d_array_t bias, image2d_array_t scale, - image2d_array_t meanVari, + image2d_t meanVari, image2d_array_t output, float eps, int rsFlg) @@ -361,12 +377,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); bias_f = read_imagef(bias, coord_para); - coord_para.x = 0; - coord_para.y = gidz; for(int i = 0; i < group_num; i++) { - mean_vari += read_imagef(meanVari, coord_para); - coord_para.x += 4; + mean_vari += read_imagef(meanVari, coord_para.yx); + coord_para.y += 4; } mean_vari *= dimRatio; mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_i8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_i8.vx index 489da14..dc19b5e 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_i8.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_i8.vx @@ -22,10 +22,7 @@ _viv_uniform float inOut_fl_scale; _viv_uniform float output_fl_scale; __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_I8( - image2d_array_t input, - image2d_array_t output, - float eps, - int rsFlg) + image2d_array_t input, image2d_array_t output, float eps, int rsFlg) { int gidx = get_global_id(0) << 4; int lidx = get_local_id(0); @@ -33,18 +30,22 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean int4 coord = (int4)(gidx, 0, gidz, 0); vxc_char16 src0; float sum = 0, sqr = 0; - int tmpSum = 0, tmpSqr = 0; - int tmpSum1, tmpSqr1; + int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1; __local float lcl_sum[16]; __local float lcl_sqr[16]; + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + if(gidx < width) { for(coord.y = 0; coord.y < height;) { - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); coord.y++; VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumInt8_16x1); tmpSum += (tmpSum1); @@ 
-54,7 +55,6 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean sqr = tmpSqr * inFlScale_s2; sum = tmpSum * input_fl_scale; } - lcl_sum[lidx] = sum; lcl_sqr[lidx] = sqr; barrier(CLK_LOCAL_MEM_FENCE); @@ -69,8 +69,6 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean sum = 0; sqr = 0; for(int i = 0; i < 4; i++) { - //sum += lcl_sum[i]; - //sqr += lcl_sqr[i]; sum += dot(tmp_sum[i], one); sqr += dot(tmp_sqr[i], one); } @@ -81,10 +79,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean } __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_I8_2D( - image2d_array_t input, - image2d_array_t output, - float eps, - int rsFlg) + image2d_array_t input, image2d_array_t output, float eps, int rsFlg) { int gidx = get_global_id(0) << 4; int lidx = get_local_id(0); @@ -94,8 +89,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean int2 coord = (int2)(gidx, gidy); vxc_char16 src0; float sum = 0, sqr = 0; - int tmpSum = 0, tmpSqr = 0; - int tmpSum1, tmpSqr1; + int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1; __local float lcl_sum[16]; __local float lcl_sqr[16]; @@ -103,7 +97,6 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean int endH = gidy + height; if(gidx < width) { - tmpSqr = 0; for(; coord.y < endH;) { VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), @@ -132,8 +125,6 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean sum = 0; sqr = 0; for(int i = 0; i < 4; i++) { - //sum += lcl_sum[i]; - //sqr += lcl_sqr[i]; sum += dot(tmp_sum[i], one); sqr += dot(tmp_sqr[i], one); } @@ -144,94 +135,81 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean } __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8toF16( - image2d_array_t input, - image2d_array_t bias, - image2d_array_t scale, - image2d_array_t meanVari, - image2d_array_t output, - float eps, - int rsFlg) + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int rsFlg) { int gidz = get_global_id(1); int4 coord = (int4)(get_global_id(0), 0, gidz, 0); int4 coord_para = (int4)(gidz, 0, 0, 0); vxc_char16 src0; - vxc_short8 src1; - vxc_half8 scale_h; + vxc_short8 src1, outval; + vxc_half8 scale_h, dst; float scale_vari, bias_val; vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); bias_f = read_imagef(bias, coord_para); - coord_para.x = 0; - coord_para.y = gidz; for(int i = 0; i < group_num; i++) { - mean_vari += read_imagef(meanVari, coord_para); - coord_para.x += 4; + mean_vari += read_imagef(meanVari, coord_para.yx); + coord_para.y += 4; } mean_vari *= dimRatio; mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; mean_vari.s1 = rsqrt(mean_vari.s1); scale_vari = scale_f.s0 * mean_vari.s1; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; - vxc_short8 outval; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; half4 tmpVal0, tmpVal1; float alpha = input_fl_scale * scale_vari; bias_val = (bias_f.s0 - scale_vari 
* mean_vari.s0); - vxc_half8 dst; + + coord_para = coord; + int8 input_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_para.z, baseAddr); for(coord.y = 0; coord.y < height;) { - coord_para = coord; - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_para.xy = coord.xy; coord.y++; - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertDirInt8Fp32_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertEndInt8Fp32_4x4); - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertTrdInt8Fp32_4x4); - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertFthInt8Fp32_4x4); + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4); + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4); + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4); - vxc_float4 norm; norm = alpha * tmpData0 + bias_val; _viv_asm(CONV, tmpVal0, norm); norm = alpha * tmpData1 + bias_val; _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniConvertHalfToFp16_2x8); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage2DArray(output, coord_para, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4_NoDest(img_store_3d, output, coord_para, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); coord_para.x += 8; norm = alpha * tmpData2 + bias_val; _viv_asm(CONV, tmpVal0, norm); norm = alpha * tmpData3 + bias_val; _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniConvertHalfToFp16_2x8); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage2DArray(output, coord_para, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4_NoDest(img_store_3d, output, coord_para, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); } } __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8toF16_2D( - image2d_array_t input, - image2d_array_t bias, - image2d_array_t scale, - image2d_array_t meanVari, - image2d_array_t output, - float eps, - int rsFlg) + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int rsFlg) { int gidz = get_global_id(1); int gidy = gidz * height; @@ -239,59 +217,48 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8to int4 coord_para = (int4)(gidz, 0, 0, 0); int endH = gidy + height; vxc_char16 src0; - vxc_short8 src1; - vxc_half8 scale_h; + vxc_short8 src1, outval; + 
vxc_half8 scale_h, dst; float scale_vari, bias_val; vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); bias_f = read_imagef(bias, coord_para); - coord_para.x = 0; - coord_para.y = gidz; for(int i = 0; i < group_num; i++) { - mean_vari += read_imagef(meanVari, coord_para); - coord_para.x += 4; + mean_vari += read_imagef(meanVari, coord_para.yx); + coord_para.y += 4; } mean_vari *= dimRatio; mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; mean_vari.s1 = rsqrt(mean_vari.s1); scale_vari = scale_f.s0 * mean_vari.s1; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; - vxc_short8 outval; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; half4 tmpVal0, tmpVal1; float alpha = input_fl_scale * scale_vari; bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - vxc_half8 dst; for(; coord.y < endH;) { - coord_para = coord; VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_para = coord; coord.y++; - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertDirInt8Fp32_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertEndInt8Fp32_4x4); - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertTrdInt8Fp32_4x4); - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertFthInt8Fp32_4x4); - vxc_float4 norm; + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4); + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4); + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4); norm = alpha * tmpData0 + bias_val; _viv_asm(CONV, tmpVal0, norm); norm = alpha * tmpData1 + bias_val; _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniConvertHalfToFp16_2x8); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); _viv_asm(COPY, outval, dst, 16); VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); coord_para.x += 8; @@ -299,21 +266,15 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8to _viv_asm(CONV, tmpVal0, norm); norm = alpha * tmpData3 + bias_val; _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniConvertHalfToFp16_2x8); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); _viv_asm(COPY, outval, dst, 16); VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); } } __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8toI8( - image2d_array_t input, - image2d_array_t bias, - image2d_array_t scale, - image2d_array_t meanVari, - image2d_array_t output, - float eps, - int rsFlg) + image2d_array_t 
input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int rsFlg) { int gidz = get_global_id(1); int4 coord = (int4)(get_global_id(0), 0, gidz, 0); @@ -330,12 +291,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8to VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); bias_f = read_imagef(bias, coord_para); - coord_para.x = 0; - coord_para.y = gidz; for(int i = 0; i < group_num; i++) { - mean_vari += read_imagef(meanVari, coord_para); - coord_para.x += 4; + mean_vari += read_imagef(meanVari, coord_para.yx); + coord_para.y += 4; } mean_vari *= dimRatio; mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; @@ -343,47 +302,44 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8to scale_vari = scale_f.s0 * mean_vari.s1; vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; float alpha = inOut_fl_scale * scale_vari; bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale; + int8 input_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord.w, baseAddr); + for(coord.y = 0; coord.y < height; coord.y++) { - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertDirInt8Fp32_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertEndInt8Fp32_4x4); - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertTrdInt8Fp32_4x4); - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertFthInt8Fp32_4x4); - vxc_float4 norm; + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4); + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4); + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4); norm = tmpData0 * alpha + bias_val; tmpVal0 = convert_int4_rte(norm); norm = tmpData1 * alpha + bias_val; tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); norm = tmpData2 * alpha + bias_val; tmpVal0 = convert_int4_rte(norm); norm = tmpData3 * alpha + bias_val; tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + 
VXC_OP4_NoDest(img_store_3d, output, coord.xyww, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); } } __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8toI8_2D( - image2d_array_t input, - image2d_array_t bias, - image2d_array_t scale, - image2d_array_t meanVari, - image2d_array_t output, - float eps, - int rsFlg) + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int rsFlg) { int gidz = get_global_id(1); int gidy = gidz * height; @@ -402,12 +358,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8to VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); bias_f = read_imagef(bias, coord_para); - coord_para.x = 0; - coord_para.y = gidz; for(int i = 0; i < group_num; i++) { - mean_vari += read_imagef(meanVari, coord_para); - coord_para.x += 4; + mean_vari += read_imagef(meanVari, coord_para.yx); + coord_para.y += 4; } mean_vari *= dimRatio; mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; @@ -415,35 +369,27 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8to scale_vari = scale_f.s0 * mean_vari.s1; vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; float alpha = inOut_fl_scale * scale_vari; bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale; for(; coord.y < endH; coord.y++) { - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertDirInt8Fp32_4x4); - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertEndInt8Fp32_4x4); - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertTrdInt8Fp32_4x4); - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertFthInt8Fp32_4x4); - vxc_float4 norm; + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4); + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4); + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4); norm = tmpData0 * alpha + bias_val; tmpVal0 = convert_int4_rte(norm); norm = tmpData1 * alpha + bias_val; tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); norm = tmpData2 * alpha + bias_val; tmpVal0 = convert_int4_rte(norm); norm = tmpData3 * alpha + bias_val; tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); VXC_WriteImage(output, coord, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); } } diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32.vx 
b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32.vx new file mode 100644 index 0000000..845945c --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32.vx @@ -0,0 +1,285 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int height; +_viv_uniform float dimRatio; +_viv_uniform int group_num; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4; +_viv_uniform int inputZP; +_viv_uniform float scale_inOut; +_viv_uniform float outputScale; +_viv_uniform int output_ZP; +_viv_uniform float inOut_fl_scale; +_viv_uniform float output_fl_scale; +_viv_uniform VXC_512Bits uniConvertInt16Fp32Fst_4x4; +_viv_uniform VXC_512Bits uniConvertInt16Fp32Secd_4x4; +_viv_uniform VXC_512Bits uniConvertInt32toInt16_2x8; + +#define INSTANCENORM_8BITS_F32(src1_type_name, read_type) \ +__kernel void instance_norm_##src1_type_name##F32to##src1_type_name( \ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, \ + image2d_array_t output, float eps, int rsFlg) \ +{ \ + int gidz = get_global_id(1); \ + int4 coord = (int4)(get_global_id(0), 0, gidz, 0); \ + int2 coord_para = (int2)(gidz, 0); \ + read_type src0, src2; \ + float scale_vari, bias_val; \ + vxc_float4 mean_vari = (vxc_float4)(0); \ + \ + Image img1 = create_image_from_image2d(bias, 4); \ + Image img2 = create_image_from_image2d(scale, 4); \ + Image img3 = create_image_from_image2d(meanVari, 4); \ + __global float* bias_ptr = (__global float*)img1.ptr; \ + __global float* scal_ptr = (__global float*)img2.ptr; \ + __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx); \ + __global float4* vari_ptr = (__global float4*)sumVari_ptr; \ + \ + float bval = bias_ptr[gidz]; \ + float sval = scal_ptr[gidz]; \ + \ + for(int i = 0; i < group_num; i++) \ + { \ + mean_vari += vari_ptr[i]; \ + } \ + mean_vari *= dimRatio; \ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \ + mean_vari.s1 = rsqrt(mean_vari.s1); \ + \ + scale_vari = sval * mean_vari.s1; \ + short zp = inputZP; \ + vxc_int4 tmpVal0, tmpVal1; \ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \ + float alpha = scale_inOut * scale_vari; \ + bias_val = (bval - scale_vari * mean_vari.s0) * outputScale + output_ZP; \ + \ + int8 input_desc, output_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord.z, baseAddr_a); \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord.w, baseAddr); \ + \ + for(coord.y = 0; coord.y < height; coord.y++) \ + { \ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvert1stUint8SubZpToFp32_4x4); \ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvert2ndUint8SubZpToFp32_4x4); \ + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvert3rdUint8SubZpToFp32_4x4); \ + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + 
uniConvert4thUint8SubZpToFp32_4x4); \ + norm = tmpData0 * alpha + bias_val; \ + tmpVal0 = convert_int4_rte(norm); \ + norm = tmpData1 * alpha + bias_val; \ + tmpVal1 = convert_int4_rte(norm); \ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \ + norm = tmpData2 * alpha + bias_val; \ + tmpVal0 = convert_int4_rte(norm); \ + norm = tmpData3 * alpha + bias_val; \ + tmpVal1 = convert_int4_rte(norm); \ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \ + } \ +} +INSTANCENORM_8BITS_F32(U8, vxc_uchar16) +INSTANCENORM_8BITS_F32(I8, vxc_char16) + +#define INSTANCENORM_8BITS_F32_2D(src1_type_name, read_type) \ +__kernel void instance_norm_##src1_type_name##F32to##src1_type_name##_2D( \ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, \ + image2d_array_t output, float eps, int rsFlg) \ +{ \ + int gidz = get_global_id(1); \ + int gidy = gidz * height; \ + int2 coord = (int2)(get_global_id(0), gidy); \ + int2 coord_para = (int2)(gidz, 0); \ + int endH = gidy + height; \ + read_type src0, src2; \ + float scale_vari, bias_val; \ + vxc_float4 mean_vari = (vxc_float4)(0); \ + \ + Image img1 = create_image_from_image2d(bias, 4); \ + Image img2 = create_image_from_image2d(scale, 4); \ + Image img3 = create_image_from_image2d(meanVari, 4); \ + __global float* bias_ptr = (__global float*)img1.ptr; \ + __global float* scal_ptr = (__global float*)img2.ptr; \ + __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx); \ + __global float4* vari_ptr = (__global float4*)sumVari_ptr; \ + \ + float bval = bias_ptr[gidz]; \ + float sval = scal_ptr[gidz]; \ + \ + for(int i = 0; i < group_num; i++) \ + { \ + mean_vari += vari_ptr[i]; \ + } \ + \ + mean_vari *= dimRatio; \ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \ + mean_vari.s1 = rsqrt(mean_vari.s1); \ + \ + scale_vari = sval * mean_vari.s1; \ + short zp = inputZP; \ + vxc_int4 tmpVal0, tmpVal1; \ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \ + float alpha = scale_inOut * scale_vari; \ + bias_val = (bval - scale_vari * mean_vari.s0) * outputScale + output_ZP; \ + \ + for(; coord.y < endH; coord.y++) \ + { \ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvert1stUint8SubZpToFp32_4x4); \ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvert2ndUint8SubZpToFp32_4x4); \ + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvert3rdUint8SubZpToFp32_4x4); \ + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvert4thUint8SubZpToFp32_4x4); \ + norm = tmpData0 * alpha + bias_val; \ + tmpVal0 = convert_int4_rte(norm); \ + norm = tmpData1 * alpha + bias_val; \ + tmpVal1 = convert_int4_rte(norm); \ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \ + norm = tmpData2 * alpha + bias_val; \ + tmpVal0 = convert_int4_rte(norm); \ + norm = tmpData3 * alpha + bias_val; \ + tmpVal1 = convert_int4_rte(norm); \ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \ + VXC_WriteImage(output, coord, 
src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +INSTANCENORM_8BITS_F32_2D(U8, vxc_uchar16) +INSTANCENORM_8BITS_F32_2D(I8, vxc_char16) + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16F32toI16( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int rsFlg) +{ + int gidz = get_global_id(1); + int4 coord = (int4)(get_global_id(0), 0, gidz, 0); + int2 coord_para = (int2)(gidz, 0); + vxc_short8 src0, src2; + float scale_vari, bias_val; + vxc_float4 mean_vari = (vxc_float4)(0); + + Image img1 = create_image_from_image2d(bias, 4); + Image img2 = create_image_from_image2d(scale, 4); + Image img3 = create_image_from_image2d(meanVari, 4); + __global float* bias_ptr = (__global float*)img1.ptr; + __global float* scal_ptr = (__global float*)img2.ptr; + __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx); + __global float4* vari_ptr = (__global float4*)sumVari_ptr; + + float bval = bias_ptr[gidz]; + float sval = scal_ptr[gidz]; + + for(int i = 0; i < group_num; i++) + { + mean_vari += vari_ptr[i]; + } + + mean_vari *= dimRatio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + scale_vari = sval * mean_vari.s1; + vxc_int4 tmpVal0, tmpVal1; + vxc_float4 tmpData0, tmpData1; + float alpha = inOut_fl_scale * scale_vari; + bias_val = (bval - scale_vari * mean_vari.s0) * output_fl_scale; + + int8 input_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord.w, baseAddr); + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16Fp32Fst_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16Fp32Secd_4x4); + vxc_float4 norm; + norm = tmpData0 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData1 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toInt16_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, src2, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16F32toI16_2D( + image2d_t input, image2d_t bias, image2d_t scale, + image2d_t meanVari, image2d_array_t output, float eps, int rsFlg) +{ + int gidz = get_global_id(1); + int gidy = gidz * height; + int2 coord = (int2)(get_global_id(0), gidy); + int2 coord_para = (int2)(gidz, 0); + int endH = gidy + height; + vxc_short8 src0, src2; + float scale_vari, bias_val; + vxc_float4 mean_vari = (vxc_float4)(0); + + Image img1 = create_image_from_image2d(bias, 4); + Image img2 = create_image_from_image2d(scale, 4); + Image img3 = create_image_from_image2d(meanVari, 4); + __global float* bias_ptr = (__global float*)img1.ptr; + __global float* scal_ptr = (__global float*)img2.ptr; + __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx); + __global float4* vari_ptr = 
(__global float4*)sumVari_ptr; + + float bval = bias_ptr[gidz]; + float sval = scal_ptr[gidz]; + + for(int i = 0; i < group_num; i++) + { + mean_vari += vari_ptr[i]; + } + + mean_vari *= dimRatio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + scale_vari = sval * mean_vari.s1; + vxc_int4 tmpVal0, tmpVal1; + vxc_float4 tmpData0, tmpData1; + float alpha = inOut_fl_scale * scale_vari; + bias_val = (bval - scale_vari * mean_vari.s0) * output_fl_scale; + + for(; coord.y < endH; coord.y++) + { + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16Fp32Fst_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16Fp32Secd_4x4); + vxc_float4 norm; + norm = tmpData0 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData1 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toInt16_2x8); + VXC_WriteImage(output, coord, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32_bf16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32_bf16.vx new file mode 100644 index 0000000..771b319 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32_bf16.vx @@ -0,0 +1,253 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int width; +_viv_uniform int height; +_viv_uniform float dimRatio; +_viv_uniform int group_num; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; + +constant vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); +constant float4 one = (float4)(1.0, 1.0, 1.0, 1.0); + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_BF16( + image2d_array_t input, image2d_array_t output, float eps, int rsFlg) +{ + int gidx = get_global_id(0) << 3; + int lidx = get_local_id(0); + int gidz = get_global_id(1); + int4 coord = (int4)(gidx, 0, gidz, 0); + vxc_short8 src0, src1, src2; + float4 srcA, srcB; + vxc_float sum = 0, sqr = 0; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + + if(gidx < width) + { + for(coord.y = 0; coord.y < height;) + { + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.y++; + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + VXC_DP2x8(src2, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, srcA, src1, 16); + _viv_asm(COPY, srcB, src2, 16); + sum += dot(srcA, one) + dot(srcB, one); + sqr += dot(srcA * srcA, one) + dot(srcB * srcB, one); + } + } + + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); + if(lidx == 0) + { + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + sum = 0; + 
sqr = 0; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 data = (float4)(sum, sqr, 0, 0); + write_imagef(output, coord_out, data); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_BF16_2D( + image2d_array_t input, image2d_array_t output, float eps, int rsFlg) +{ + int gidx = get_global_id(0) << 3; + int lidx = get_local_id(0); + int gidz = get_global_id(1); + int gidy = gidz * height; + + int2 coord = (int2)(gidx, gidy); + vxc_short8 src0, src1, src2; + float4 srcA, srcB; + vxc_float sum = 0, sqr = 0; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + int endH = gidy + height; + if(gidx < width) + { + for(; coord.y < endH;) + { + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.y++; + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + VXC_DP2x8(src2, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, srcA, src1, 16); + _viv_asm(COPY, srcB, src2, 16); + sum += dot(srcA, one) + dot(srcB, one); + sqr += dot(srcA * srcA, one) + dot(srcB * srcB, one); + } + } + + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0); + if(lidx == 0) + { + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + sum = 0; + sqr = 0; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 data = (float4)(sum, sqr, 0, 0); + write_imagef(output, coord_out, data); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16F32toBF16( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int rsFlg) +{ + int gidz = get_global_id(1); + int4 coord = (int4)(get_global_id(0), 0, gidz, 0); + vxc_short8 src0, src1, src2; + float scale_vari, bias_val; + vxc_float4 mean_vari = (vxc_float4)(0); + + Image img1 = create_image_from_image2d(bias, 4); + Image img2 = create_image_from_image2d(scale, 4); + Image img3 = create_image_from_image2d(meanVari, 4); + __global float* bias_ptr = (__global float*)img1.ptr; + __global float* scal_ptr = (__global float*)img2.ptr; + __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord.wz); + __global float4* vari_ptr = (__global float4*)sumVari_ptr; + + float bval = bias_ptr[gidz]; + float sval = scal_ptr[gidz]; + + for(int i = 0; i < group_num; i++) + { + mean_vari += vari_ptr[i]; + } + + mean_vari *= dimRatio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + scale_vari = sval * mean_vari.s1; + vxc_float4 tmpData0, tmpData1; + bias_val = (bval - scale_vari * mean_vari.s0); + + int8 input_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord.w, baseAddr); + + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(src1, src0, 
zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + VXC_DP2x8(src2, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, tmpData0, src1, 16); + _viv_asm(COPY, tmpData1, src2, 16); + + vxc_float4 norm; + norm = scale_vari * tmpData0 + bias_val; + _viv_asm(COPY, src0, norm, 16); + norm = scale_vari * tmpData1 + bias_val; + _viv_asm(COPY, src1, norm, 16); + VXC_DP2x8(src2, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, src2, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16F32toBF16_2D( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int rsFlg) +{ + int gidz = get_global_id(1); + int gidy = gidz * height; + int2 coord = (int2)(get_global_id(0), gidy); + int2 coord_para = (int2)(gidz, 0); + int endH = gidy + height; + vxc_short8 src0, src1, src2; + float scale_vari, bias_val; + vxc_float4 mean_vari = (vxc_float4)(0); + + Image img1 = create_image_from_image2d(bias, 4); + Image img2 = create_image_from_image2d(scale, 4); + Image img3 = create_image_from_image2d(meanVari, 4); + __global float* bias_ptr = (__global float*)img1.ptr; + __global float* scal_ptr = (__global float*)img2.ptr; + __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx); + __global float4* vari_ptr = (__global float4*)sumVari_ptr; + + float bval = bias_ptr[gidz]; + float sval = scal_ptr[gidz]; + + for(int i = 0; i < group_num; i++) + { + mean_vari += vari_ptr[i]; + } + + mean_vari *= dimRatio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + scale_vari = sval * mean_vari.s1; + vxc_float4 tmpData0, tmpData1; + bias_val = (bval - scale_vari * mean_vari.s0); + + for(; coord.y < endH; coord.y++) + { + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + VXC_DP2x8(src2, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, tmpData0, src1, 16); + _viv_asm(COPY, tmpData1, src2, 16); + + vxc_float4 norm; + norm = scale_vari * tmpData0 + bias_val; + _viv_asm(COPY, src0, norm, 16); + norm = scale_vari * tmpData1 + bias_val; + _viv_asm(COPY, src1, norm, 16); + VXC_DP2x8(src2, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32_f16.vx new file mode 100644 index 0000000..81e5ec5 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_scale_f32_f16.vx @@ -0,0 +1,143 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int height; +_viv_uniform float dimRatio; +_viv_uniform int group_num; +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; + +_viv_uniform VXC_512Bits uniConvertEndInt16Fp32_4x4; + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16F32toF16( + image2d_array_t input, image2d_t bias, 
image2d_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int rsFlg) +{ + int gidz = get_global_id(1); + int4 coord = (int4)(get_global_id(0), 0, gidz, 0); + vxc_short8 src0; + vxc_half8 in_h; + float scale_vari, bias_val; + vxc_float4 mean_vari = (vxc_float4)(0); + + Image img1 = create_image_from_image2d(bias, 4); + Image img2 = create_image_from_image2d(scale, 4); + Image img3 = create_image_from_image2d(meanVari, 4); + __global float* bias_ptr = (__global float*)img1.ptr; + __global float* scal_ptr = (__global float*)img2.ptr; + __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord.wz); + __global float4* vari_ptr = (__global float4*)sumVari_ptr; + + float bval = bias_ptr[gidz]; + float sval = scal_ptr[gidz]; + + for(int i = 0; i < group_num; i++) + { + mean_vari += vari_ptr[i]; + } + + mean_vari *= dimRatio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + scale_vari = sval * mean_vari.s1; + vxc_float4 tmpData0, tmpData1; + vxc_short8 outval; + half4 tmpVal0, tmpVal1; + bias_val = (bval - scale_vari * mean_vari.s0); + vxc_half8 dst; + + int8 input_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord.w, baseAddr); + + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, in_h, src0, 16); + + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertEndInt16Fp32_4x4); + + vxc_float4 norm; + norm = scale_vari * tmpData0 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = scale_vari * tmpData1 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, outval, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16F32toF16_2D( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int rsFlg) +{ + int gidz = get_global_id(1); + int gidy = gidz * height; + int2 coord = (int2)(get_global_id(0), gidy); + int2 coord_para = (int2)(gidz, 0); + int endH = gidy + height; + vxc_short8 src0; + vxc_half8 in_h; + float scale_vari, bias_val; + vxc_float4 mean_vari = (vxc_float4)(0); + + Image img1 = create_image_from_image2d(bias, 4); + Image img2 = create_image_from_image2d(scale, 4); + Image img3 = create_image_from_image2d(meanVari, 4); + __global float* bias_ptr = (__global float*)img1.ptr; + __global float* scal_ptr = (__global float*)img2.ptr; + __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx); + __global float4* vari_ptr = (__global float4*)sumVari_ptr; + + float bval = bias_ptr[gidz]; + float sval = scal_ptr[gidz]; + + for(int i = 0; i < group_num; i++) + { + mean_vari += vari_ptr[i]; + } + + mean_vari *= dimRatio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * 
mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + scale_vari = sval * mean_vari.s1; + vxc_float4 tmpData0, tmpData1; + vxc_short8 outval; + half4 tmpVal0, tmpVal1; + bias_val = (bval - scale_vari * mean_vari.s0); + vxc_half8 dst; + + for(; coord.y < endH; coord.y++) + { + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, in_h, src0, 16); + + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertEndInt16Fp32_4x4); + vxc_float4 norm; + norm = scale_vari * tmpData0 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = scale_vari * tmpData1 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_u8.vx index 68f8f8a..4becc2b 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_u8.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_u8.vx @@ -5,7 +5,6 @@ _viv_uniform int height; _viv_uniform float dimRatio; _viv_uniform int group_num; _viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; _viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; _viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; @@ -25,9 +24,7 @@ _viv_uniform float outputScale; _viv_uniform int output_ZP; __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_U8( - image2d_array_t input, - image2d_array_t output, - float eps, int rsFlg) + image2d_array_t input, image2d_array_t output, float eps, int rsFlg) { int gidx = get_global_id(0) << 4; int lidx = get_local_id(0); @@ -35,17 +32,20 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean int4 coord = (int4)(gidx, 0, gidz, 0); vxc_uchar16 src0; float sum = 0, sqr = 0; - int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1; + int tmpSum = 0, tmpSqr = 0, tmpSum1 = 0, tmpSqr1 = 0; __local float lcl_sum[16]; __local float lcl_sqr[16]; - + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); if(gidx < width) { for(coord.y = 0; coord.y < height;) { - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); coord.y++; VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); tmpSum += (tmpSum1); @@ -55,7 +55,6 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean sqr += (tmpSqr * e2InScale + rowSumScale); sum = (tmpSum + sumInZp) * input_scale; } - lcl_sum[lidx] = sum; lcl_sqr[lidx] = sqr; barrier(CLK_LOCAL_MEM_FENCE); @@ -66,23 +65,19 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean float4 one = (float4)(1, 1, 1, 1); __local float4* tmp_sum = (__local float4*)lcl_sum; __local float4* tmp_sqr = (__local 
float4*)lcl_sqr; - sum = 0; sqr = 0; for(int i = 0; i < 4; i++) { sum += dot(tmp_sum[i], one); sqr += dot(tmp_sqr[i], one); } - float4 data = (float4)(sum, sqr, 0, 0); write_imagef(output, coord_out, data); } } __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_U8_2D( - image2d_array_t input, - image2d_array_t output, - float eps, int rsFlg) + image2d_array_t input, image2d_array_t output, float eps, int rsFlg) { int gidx = get_global_id(0) << 4; int lidx = get_local_id(0); @@ -93,17 +88,16 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean vxc_uchar16 src0; float sum = 0, sqr = 0; int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1; + int endH = gidy + height; __local float lcl_sum[16]; __local float lcl_sqr[16]; - - int endH = gidy + height; if(gidx < width) { for(; coord.y < endH;) { VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); coord.y++; VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); tmpSum += (tmpSum1); @@ -113,7 +107,6 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean sqr += (tmpSqr * e2InScale + rowSumScale); sum = (tmpSum + sumInZp) * input_scale; } - lcl_sum[lidx] = sum; lcl_sqr[lidx] = sqr; barrier(CLK_LOCAL_MEM_FENCE); @@ -124,192 +117,20 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean float4 one = (float4)(1, 1, 1, 1); __local float4* tmp_sum = (__local float4*)lcl_sum; __local float4* tmp_sqr = (__local float4*)lcl_sqr; - sum = 0; sqr = 0; for(int i = 0; i < 4; i++) { sum += dot(tmp_sum[i], one); sqr += dot(tmp_sqr[i], one); } - float4 data = (float4)(sum, sqr, 0, 0); write_imagef(output, coord_out, data); } } -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8toF16( - image2d_array_t input, - image2d_array_t bias, - image2d_array_t scale, - image2d_array_t meanVari, - image2d_array_t output, - float eps, - int rsFlg) -{ - int gidz = get_global_id(1); - int4 coord = (int4)(get_global_id(0), 0, gidz, 0); - int4 coord_para = (int4)(gidz, 0, 0, 0); - vxc_uchar16 src0; - vxc_short8 src1; - vxc_half8 scale_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); - - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - - bias_f = read_imagef(bias, coord_para); - - coord_para.x = 0; - coord_para.y = gidz; - for(int i = 0; i < group_num; i++) - { - mean_vari += read_imagef(meanVari, coord_para); - coord_para.x += 4; - } - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - scale_vari = scale_f.s0 * mean_vari.s1; - short zp = inputZP; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; - vxc_short8 outval; - half4 tmpVal0, tmpVal1; - float alpha = input_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - vxc_half8 dst; - - for(coord.y = 0; coord.y < height;) - { - coord_para = coord; - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - coord.y++; - - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert1stUint8SubZpToFp32_4x4); 
- VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert2ndUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert3rdUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert4thUint8SubZpToFp32_4x4); - vxc_float4 norm; - norm = alpha * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage2DArray(output, coord_para, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_para.x += 8; - norm = alpha * tmpData2 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData3 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage2DArray(output, coord_para, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - } -} - -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8toF16_2D( - image2d_array_t input, - image2d_array_t bias, - image2d_array_t scale, - image2d_array_t meanVari, - image2d_array_t output, - float eps, - int rsFlg) -{ - int gidz = get_global_id(1); - int gidy = gidz * height; - int4 coord = (int4)(get_global_id(0), gidy, 0, 0); - int4 coord_para = (int4)(gidz, 0, 0, 0); - int endH = gidy + height; - vxc_uchar16 src0; - vxc_short8 src1; - vxc_half8 scale_h; - float scale_vari, bias_val; - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); - - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - - bias_f = read_imagef(bias, coord_para); - - coord_para.x = 0; - coord_para.y = gidz; - for(int i = 0; i < group_num; i++) - { - mean_vari += read_imagef(meanVari, coord_para); - coord_para.x += 4; - } - mean_vari *= dimRatio; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - scale_vari = scale_f.s0 * mean_vari.s1; - short zp = inputZP; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; - vxc_short8 outval; - half4 tmpVal0, tmpVal1; - float alpha = input_scale * scale_vari; - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); - vxc_half8 dst; - - for(; coord.y < endH;) - { - coord_para = coord; - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - coord.y++; - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert2ndUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert3rdUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert4thUint8SubZpToFp32_4x4); - vxc_float4 norm; - norm = alpha * tmpData0 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData1 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniConvertHalfToFp16_2x8); - 
_viv_asm(COPY, outval, dst, 16); - VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - coord_para.x += 8; - norm = alpha * tmpData2 + bias_val; - _viv_asm(CONV, tmpVal0, norm); - norm = alpha * tmpData3 + bias_val; - _viv_asm(CONV, tmpVal1, norm); - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ - uniConvertHalfToFp16_2x8); - _viv_asm(COPY, outval, dst, 16); - VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - } -} - __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8toU8( - image2d_array_t input, - image2d_array_t bias, - image2d_array_t scale, - image2d_array_t meanVari, - image2d_array_t output, - float eps, - int rsFlg) + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int rsFlg) { int gidz = get_global_id(1); int4 coord = (int4)(get_global_id(0), 0, gidz, 0); @@ -326,12 +147,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8to VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); bias_f = read_imagef(bias, coord_para); - coord_para.x = 0; - coord_para.y = gidz; for(int i = 0; i < group_num; i++) { - mean_vari += read_imagef(meanVari, coord_para); - coord_para.x += 4; + mean_vari += read_imagef(meanVari, coord_para.yx); + coord_para.y += 4; } mean_vari *= dimRatio; mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; @@ -340,47 +159,43 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8to scale_vari = scale_f.s0 * mean_vari.s1; short zp = inputZP; vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; float alpha = scale_inOut * scale_vari; bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP; - for(coord.y = 0; coord.y < height;coord.y++) + int8 input_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord.w, baseAddr); + + for(coord.y = 0; coord.y < height; coord.y++) { - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert2ndUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert3rdUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert4thUint8SubZpToFp32_4x4); - vxc_float4 norm; + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData3, src0, zp, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4); norm = tmpData0 * alpha + bias_val; tmpVal0 = convert_int4_rte(norm); norm = tmpData1 * alpha + bias_val; tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); norm = tmpData2 * alpha + bias_val; tmpVal0 = convert_int4_rte(norm); norm = tmpData3 * alpha + bias_val; tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); } } __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8toU8_2D( - image2d_array_t input, - image2d_array_t bias, - image2d_array_t scale, - image2d_array_t meanVari, - image2d_array_t output, - float eps, - int rsFlg) + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int rsFlg) { int gidz = get_global_id(1); int gidy = gidz * height; @@ -399,12 +214,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8to VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); bias_f = read_imagef(bias, coord_para); - coord_para.x = 0; - coord_para.y = gidz; for(int i = 0; i < group_num; i++) { - mean_vari += read_imagef(meanVari, coord_para); - coord_para.x += 4; + mean_vari += read_imagef(meanVari, coord_para.yx); + coord_para.y += 4; } mean_vari *= dimRatio; mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; @@ -413,35 +226,27 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8to scale_vari = scale_f.s0 * mean_vari.s1; short zp = inputZP; vxc_int4 tmpVal0, tmpVal1; - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; float alpha = scale_inOut * scale_vari; bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP; for(; coord.y < endH; coord.y++) { - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert1stUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert2ndUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert3rdUint8SubZpToFp32_4x4); - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvert4thUint8SubZpToFp32_4x4); - vxc_float4 norm; + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 
3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4); norm = tmpData0 * alpha + bias_val; tmpVal0 = convert_int4_rte(norm); norm = tmpData1 * alpha + bias_val; tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); norm = tmpData2 * alpha + bias_val; tmpVal0 = convert_int4_rte(norm); norm = tmpData3 * alpha + bias_val; tmpVal1 = convert_int4_rte(norm); - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\ - uniConvertInt32toUint8_2x8); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); VXC_WriteImage(output, coord, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); } } \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_u8_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_u8_f16.vx new file mode 100644 index 0000000..9602d13 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_u8_f16.vx @@ -0,0 +1,147 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int width; +_viv_uniform int height; +_viv_uniform float dimRatio; +_viv_uniform int group_num; +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; + +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4; +_viv_uniform float input_scale; +_viv_uniform int inputZP; + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8toF16( + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int rsFlg) +{ + int gidz = get_global_id(1); + int4 coord = (int4)(get_global_id(0), 0, gidz, 0); + int4 coord_para = (int4)(gidz, 0, 0, 0); + vxc_uchar16 src0; + vxc_short8 src1, outval; + vxc_half8 scale_h, dst; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); + + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); + bias_f = read_imagef(bias, coord_para); + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_para.yx); + coord_para.y += 4; + } + mean_vari *= dimRatio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + scale_vari = scale_f.s0 * mean_vari.s1; + short zp = inputZP; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; + half4 tmpVal0, tmpVal1; + float alpha = input_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); + + coord_para = coord; + int8 input_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_para.z, baseAddr); + for(coord.y = 0; coord.y < height;) + { + 
VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_para.xy = coord.xy; + coord.y++; + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4); + norm = alpha * tmpData0 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData1 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_OP4_NoDest(img_store_3d, output, coord_para, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord_para.x += 8; + norm = alpha * tmpData2 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData3 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_OP4_NoDest(img_store_3d, output, coord_para, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8toF16_2D( + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int rsFlg) +{ + int gidz = get_global_id(1); + int gidy = gidz * height; + int4 coord = (int4)(get_global_id(0), gidy, 0, 0); + int4 coord_para = (int4)(gidz, 0, 0, 0); + int endH = gidy + height; + vxc_uchar16 src0; + vxc_short8 src1, outval; + vxc_half8 scale_h, dst; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0); + + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4); + bias_f = read_imagef(bias, coord_para); + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_para.yx); + coord_para.y += 4; + } + mean_vari *= dimRatio; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + + scale_vari = scale_f.s0 * mean_vari.s1; + short zp = inputZP; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; + half4 tmpVal0, tmpVal1; + float alpha = input_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); + for(; coord.y < endH;) + { + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_para = coord; + coord.y++; + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4); + norm = alpha * tmpData0 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData1 + bias_val; + _viv_asm(CONV, 
tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_para.x += 8; + norm = alpha * tmpData2 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData3 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_scale_f32.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_scale_f32.vx new file mode 100644 index 0000000..e39ef71 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_scale_f32.vx @@ -0,0 +1,275 @@ +#include "cl_viv_vx_ext.h" + +/**************************layernorm float16***********************************/ +_viv_uniform int width; +_viv_uniform float dimRatio; +_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2; +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; +_viv_uniform VXC_512Bits uniExtractHalf4_dp4x4; + +__kernel void layer_norm_F16F32toF16( + image2d_array_t input, image2d_t bias, image2d_t scale, + image2d_array_t output, float eps) +{ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0); + int4 coord_out = coord; + + int8 input_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + + vxc_short8 src0; + vxc_float sum = 0, sqr = 0; + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + Image img1 = create_image_from_image2d(bias, 4); + Image img2 = create_image_from_image2d(scale, 4); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + for(coord.x = 8; coord.x < (width+8); coord.x += 8) + { + vxc_half8 val0_h; + _viv_asm(COPY, val0_h, src0, 16); + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + vxc_float4 sumsqr; + VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniFp16SumSqr_dp8x2); + sum += sumsqr.x; + sqr += sumsqr.y; + } + vxc_float mean; + mean = sum * dimRatio; + vxc_float vari; + vari = sqr*dimRatio - mean*mean; + vari += eps; + vari = rsqrt(vari); + vxc_float4 bias_f, scale_f, in_f; + __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord.ww); + __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord.ww); + for(coord.x = 0; coord.x < width; coord.x += 4) + { + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + bias_f = vload4(0, bias_ptr + coord.x); + scale_f = vload4(0, scale_ptr + coord.x); + vxc_half8 in_h; + _viv_asm(COPY, in_h, src0, 16); + VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + vxc_float4 sub, norm; + sub = in_f - mean; + norm = scale_f * vari * sub + bias_f; + half4 norm_h; + _viv_asm(CONV, norm_h, norm); + vxc_half8 dst; + VXC_DP4x4(dst, norm_h, norm_h, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0),\ + uniExtractHalf4_dp4x4); + vxc_short8 dstval; + _viv_asm(COPY, dstval, dst, 16); + coord_out.x = coord.x; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dstval, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + } +} +/*****************************layernorm uint8 to uint8****************************/ +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniSumU8_16x1; +_viv_uniform VXC_512Bits uniSqrSum_16x1; +_viv_uniform float input_scale; +_viv_uniform int inputZP; +_viv_uniform float outputScale; +_viv_uniform float output_zp; +_viv_uniform int sumInZp; +_viv_uniform int tmpZp1; +_viv_uniform int tmpZp2; +_viv_uniform float e2InScale; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2; +_viv_uniform float dimRatio_scale; + +__kernel void layer_norm_U8F32toU8( + image2d_array_t input, image2d_t bias, image2d_t scale, + image2d_array_t output, float eps) +{ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0); + int4 coord_out = coord; + + vxc_uchar16 src0, src2; + float sum = 0, sqr = 0; + vxc_float4 bias_f0, bias_f1, bias_f2, bias_f3, scale_f0, scale_f1, scale_f2, scale_f3; + int tmpSum = 0, tmpSqr = 0; + vxc_int4 tmpSum1; + vxc_int4 tmpSqr1; + short zp = inputZP; + + int8 input_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + for(coord.x = 0; coord.x < width; coord.x += 16) + { + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); + tmpSum += (tmpSum1.x); + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); + tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x); + } + sum = (tmpSum + sumInZp) * input_scale; + sqr = (tmpSqr + tmpZp2) * e2InScale; + + float mean, vari; + mean = sum * dimRatio; + vari = sqr*dimRatio - mean*mean; + vari += eps; + vari = rsqrt(vari); + vxc_int4 tmpVal0, tmpVal1; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; + + Image img1 = create_image_from_image2d(bias, 4); + Image img2 = create_image_from_image2d(scale, 4); + __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord.ww); + __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord.ww); + for(coord.x = 0; coord.x < width; coord.x += 16) + { + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + bias_f0 = vload4(0, bias_ptr); + bias_f1 = vload4(1, bias_ptr); + bias_f2 = vload4(2, bias_ptr); + bias_f3 = vload4(3, bias_ptr); + scale_f0 = vload4(0, scale_ptr); + scale_f1 = vload4(1, scale_ptr); + scale_f2 = vload4(2, scale_ptr); + scale_f3 = vload4(3, scale_ptr); + bias_ptr += 16; + scale_ptr += 16; + + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0),\ + uniConvert2ndUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert3rdUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert4thUint8SubZpToFp32_4x4); + tmpData0 *= input_scale; + tmpData1 *= input_scale; + tmpData2 *= input_scale; + tmpData3 *= input_scale; + + vxc_float4 norm; + tmpData0 -= mean; + norm = scale_f0 * vari * tmpData0 + bias_f0; + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); + + tmpData1 -= mean; + norm = scale_f1 * vari * tmpData1 + bias_f1; + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + + tmpData2 -= mean; + norm = scale_f2 * vari * tmpData2 + bias_f2; + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); + + tmpData3 -= mean; + norm = scale_f3 * vari * tmpData3 + bias_f3; + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + coord_out.x = coord.x; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src2, \ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + } +} + +__kernel void layer_norm_I16F32toI16( + image2d_array_t input, image2d_t bias, image2d_t scale, + image2d_array_t output, float eps) +{ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0); + + int8 input_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord.w, baseAddr); + + vxc_short8 src0, dst; + vxc_float sum = 0, sqr = 0; + for(; coord.x < width;) + { + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.x += 8; + vxc_float4 sumsqr; + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniInt16SumSqr_dp8x2); + sum += sumsqr.x; + sqr = sqr + sumsqr.y * e2InScale; + } + vxc_float mean; + mean = sum * dimRatio_scale; + vxc_float vari; + vari = sqr*dimRatio - mean*mean; + vari += eps; + vari = rsqrt(vari); + + short zp = inputZP; + vxc_float4 tmpData0, tmpData1; + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + vxc_int4 tmpVal0, tmpVal1; + + int2 coord_bias = (int2)(0, 0); + Image img1 = create_image_from_image2d(bias, 4); + Image img2 = create_image_from_image2d(scale, 4); + __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord_bias); + __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord_bias); + for(coord.x = 0; coord.x < width; coord.x += 8) + { + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + bias_f0 = vload4(0, bias_ptr); + bias_f1 = vload4(1, bias_ptr); + scale_f0 = vload4(0, scale_ptr); + scale_f1 = vload4(1, scale_ptr); + bias_ptr += 8; + scale_ptr += 8; + + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert2ndUint8SubZpToFp32_4x4); + + vxc_float4 sub, norm; + sub = tmpData0 * input_scale - mean; + norm = 
scale_f0 * vari * sub + bias_f0; + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); + sub = tmpData1 * input_scale - mean; + norm = scale_f1 * vari * sub + bias_f1; + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); + + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, dst, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + } +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_scale_f32_2d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_scale_f32_2d.vx new file mode 100644 index 0000000..8010726 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_scale_f32_2d.vx @@ -0,0 +1,237 @@ +#include "cl_viv_vx_ext.h" + +/**************************layernorm float16***********************************/ +_viv_uniform int width; +_viv_uniform float dimRatio; +_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2; +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; +_viv_uniform VXC_512Bits uniExtractHalf4_dp4x4; + +__kernel void layer_norm_F16F32toF16_2D( + image2d_t input, image2d_t bias, image2d_t scale, + image2d_t output, float eps) +{ + int4 coord = (int4)(0, get_global_id(1), 0, 0); + vxc_short8 src0, src1; + vxc_float sum = 0, sqr = 0; + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + Image img1 = create_image_from_image2d(bias, 4); + Image img2 = create_image_from_image2d(scale, 4); + + for(coord.x = 8; coord.x < (width+8); coord.x += 8) + { + vxc_half8 val0_h; + _viv_asm(COPY, val0_h, src0, 16); + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + vxc_float4 sumsqr; + VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniFp16SumSqr_dp8x2); + sum += sumsqr.x; + sqr += sumsqr.y; + } + vxc_float mean; + mean = sum * dimRatio; + vxc_float vari; + vari = sqr*dimRatio - mean*mean; + vari += eps; + vari = rsqrt(vari); + vxc_float4 bias_f, scale_f, in_f; + __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord.zw); + __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord.zw); + for(coord.x = 0; coord.x < width; coord.x += 4) + { + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + bias_f = vload4(0, bias_ptr + coord.x); + scale_f = vload4(0, scale_ptr + coord.x); + + vxc_half8 in_h; + _viv_asm(COPY, in_h, src0, 16); + VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + vxc_float4 sub, norm; + sub = in_f - mean; + norm = scale_f * vari * sub + bias_f; + half4 norm_h; + _viv_asm(CONV, norm_h, norm); + vxc_half8 dst; + VXC_DP4x4(dst, norm_h, norm_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniExtractHalf4_dp4x4); + vxc_short8 dstval; + _viv_asm(COPY, dstval, dst, 16); + VXC_WriteImage(output, coord.xy, dstval, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + } +} +/*****************************layernorm uint8 to uint8****************************/ +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniSumU8_16x1; +_viv_uniform VXC_512Bits 
uniSqrSum_16x1; +_viv_uniform float input_scale; +_viv_uniform int inputZP; +_viv_uniform float outputScale; +_viv_uniform float output_zp; +_viv_uniform int sumInZp; +_viv_uniform int tmpZp1; +_viv_uniform int tmpZp2; +_viv_uniform float e2InScale; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2; +_viv_uniform float dimRatio_scale; + +__kernel void layer_norm_U8F32toU8_2D( + image2d_t input, image2d_t bias, image2d_t scale, + image2d_t output, float eps) +{ + int4 coord = (int4)(0, get_global_id(1), 0, 0); + vxc_uchar16 src0, src2; + float sum = 0, sqr = 0; + vxc_float4 bias_f0, bias_f1, bias_f2, bias_f3, scale_f0, scale_f1, scale_f2, scale_f3; + int tmpSum = 0, tmpSqr = 0; + vxc_int4 tmpSum1; + vxc_int4 tmpSqr1; + short zp = inputZP; + + for(coord.x = 0; coord.x < width; coord.x += 16) + { + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1); + tmpSum += (tmpSum1.x); + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1); + tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x); + } + sum = (tmpSum + sumInZp) * input_scale; + sqr = (tmpSqr + tmpZp2) * e2InScale; + + float mean, vari; + mean = sum * dimRatio; + vari = sqr*dimRatio - mean*mean; + vari += eps; + vari = rsqrt(vari); + vxc_int4 tmpVal0, tmpVal1; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3; + + Image img1 = create_image_from_image2d(bias, 4); + Image img2 = create_image_from_image2d(scale, 4); + __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord.zw); + __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord.zw); + for(coord.x = 0; coord.x < width; coord.x += 16) + { + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + bias_f0 = vload4(0, bias_ptr); + bias_f1 = vload4(1, bias_ptr); + bias_f2 = vload4(2, bias_ptr); + bias_f3 = vload4(3, bias_ptr); + scale_f0 = vload4(0, scale_ptr); + scale_f1 = vload4(1, scale_ptr); + scale_f2 = vload4(2, scale_ptr); + scale_f3 = vload4(3, scale_ptr); + bias_ptr += 16; + scale_ptr += 16; + + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert2ndUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert3rdUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert4thUint8SubZpToFp32_4x4); + tmpData0 = tmpData0 * input_scale - mean; + tmpData1 = tmpData1 * input_scale - mean; + tmpData2 = tmpData2 * input_scale - mean; + tmpData3 = tmpData3 * input_scale - mean; + + vxc_float4 norm; + norm = scale_f0 * vari * tmpData0 + bias_f0; + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); + + norm = scale_f1 * vari * tmpData1 + bias_f1; + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + + norm = scale_f2 * vari * tmpData2 + bias_f2; + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); + + norm = scale_f3 * vari * tmpData3 + bias_f3; + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 
0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + } +} + +__kernel void layer_norm_I16F32toI16_2D( + image2d_t input, image2d_t bias, image2d_t scale, + image2d_t output, float eps) +{ + int4 coord = (int4)(0, get_global_id(1), 0, 0); + + vxc_short8 src0, src1, dst; + vxc_float sum = 0, sqr = 0; + for(; coord.x < width;) + { + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.x += 8; + vxc_float4 sumsqr; + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniInt16SumSqr_dp8x2); + sum += sumsqr.x; + sqr = sqr + sumsqr.y * e2InScale; + } + vxc_float mean, vari; + mean = sum * dimRatio_scale; + vari = sqr * dimRatio - mean * mean; + vari += eps; + vari = rsqrt(vari); + + short zp = inputZP; + vxc_float4 tmpData0, tmpData1; + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + vxc_half8 scale_h; + vxc_int4 tmpVal0, tmpVal1; + + Image img1 = create_image_from_image2d(bias, 4); + Image img2 = create_image_from_image2d(scale, 4); + + __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord.zw); + __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord.zw); + for(coord.x = 0; coord.x < width; coord.x += 8) + { + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + bias_f0 = vload4(0, bias_ptr); + bias_f1 = vload4(1, bias_ptr); + scale_f0 = vload4(0, scale_ptr); + scale_f1 = vload4(1, scale_ptr); + bias_ptr += 8; + scale_ptr += 8; + + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvert2ndUint8SubZpToFp32_4x4); + + vxc_float4 sub, norm; + sub = tmpData0 * input_scale - mean; + norm = scale_f0 * vari * sub + bias_f0; + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp); + sub = tmpData1 * input_scale - mean; + norm = scale_f1 * vari * sub + bias_f1; + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp); + + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_scale_f32_bf16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_scale_f32_bf16.vx new file mode 100644 index 0000000..76e3ed9 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_scale_f32_bf16.vx @@ -0,0 +1,159 @@ +#include "cl_viv_vx_ext.h" + +/**************************layernorm float16***********************************/ +_viv_uniform int width; +_viv_uniform float dimRatio; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; + +__kernel void layer_norm_BF16F32toBF16( + image2d_array_t input, image2d_t bias, image2d_t scale, + image2d_array_t output, float eps) +{ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0); + int4 coord_out = coord; + + int8 input_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + + vxc_short8 zero = 
(vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + float4 ones = (float4)(1.0, 1.0, 1.0, 1.0); + vxc_ushort8 src0, src1, src2; + vxc_float sum = 0, sqr = 0; + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + Image img1 = create_image_from_image2d(bias, 4); + Image img2 = create_image_from_image2d(scale, 4); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + float4 srcA, srcB; + for(coord.x = 8; coord.x < (width+8); coord.x += 8) + { + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + VXC_DP2x8(src2, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, srcA, src1, 16); + _viv_asm(COPY, srcB, src2, 16); + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + sum += dot(srcA, ones) + dot(srcB, ones); + sqr += dot(srcA * srcA, ones) + dot(srcB * srcB, ones); + } + vxc_float mean; + mean = sum * dimRatio; + vxc_float vari; + vari = sqr*dimRatio - mean*mean; + vari += eps; + vari = rsqrt(vari); + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord.ww); + __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord.ww); + + for(coord.x = 0; coord.x < width; coord.x += 8) + { + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + bias_f0 = vload4(0, bias_ptr); + bias_f1 = vload4(1, bias_ptr); + scale_f0 = vload4(0, scale_ptr); + scale_f1 = vload4(1, scale_ptr); + bias_ptr += 8; + scale_ptr += 8; + + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + VXC_DP2x8(src2, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, srcA, src1, 16); + _viv_asm(COPY, srcB, src2, 16); + + + vxc_float4 sub0, sub1, norm0, norm1; + sub0 = srcA - mean; + sub1 = srcB - mean; + norm0 = scale_f0 * vari * sub0 + bias_f0; + norm1 = scale_f1 * vari * sub1 + bias_f1; + + _viv_asm(COPY, src0, norm0, 16); + _viv_asm(COPY, src1, norm1, 16); + VXC_DP2x8(src2, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + + coord_out.x = coord.x; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src2, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + } +} + +__kernel void layer_norm_BF16F32toBF16_2D( + image2d_t input, image2d_t bias, image2d_t scale, + image2d_t output, float eps) +{ + int4 coord = (int4)(0, get_global_id(1), 0, 0); + vxc_ushort8 src0, src1, src2; + vxc_float sum = 0, sqr = 0; + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + float4 ones = (float4)(1.0, 1.0, 1.0, 1.0); + Image img1 = create_image_from_image2d(bias, 4); + Image img2 = create_image_from_image2d(scale, 4); + float4 srcA, srcB; + for(coord.x = 8; coord.x < (width+8); coord.x += 8) + { + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + VXC_DP2x8(src2, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, srcA, src1, 16); + _viv_asm(COPY, srcB, src2, 16); + 
VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + sum += dot(srcA, ones) + dot(srcB, ones); + sqr += dot(srcA * srcA, ones) + dot(srcB * srcB, ones); + } + vxc_float mean; + mean = sum * dimRatio; + vxc_float vari; + vari = sqr*dimRatio - mean*mean; + vari += eps; + vari = rsqrt(vari); + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord.zw); + __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord.zw); + for(coord.x = 0; coord.x < width; coord.x += 8) + { + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + bias_f0 = vload4(0, bias_ptr); + bias_f1 = vload4(1, bias_ptr); + scale_f0 = vload4(0, scale_ptr); + scale_f1 = vload4(1, scale_ptr); + bias_ptr += 8; + scale_ptr += 8; + + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + VXC_DP2x8(src2, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, srcA, src1, 16); + _viv_asm(COPY, srcB, src2, 16); + + vxc_float4 sub0, sub1, norm0, norm1; + sub0 = srcA - mean; + sub1 = srcB - mean; + norm0 = scale_f0 * vari * sub0 + bias_f0; + norm1 = scale_f1 * vari * sub1 + bias_f1; + + _viv_asm(COPY, src0, norm0, 16); + _viv_asm(COPY, src1, norm1, 16); + VXC_DP2x8(src2, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/one_hot.vx b/src/tim/vx/internal/src/libnnext/ops/vx/one_hot.vx new file mode 100644 index 0000000..6d3cd52 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/one_hot.vx @@ -0,0 +1,205 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniDataConvert_0_4x4; +_viv_uniform VXC_512Bits uniDataConvert_1_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform int depth; +#define ONE_HOT_SH_IMPL(name0, name1, src_type, copy_type, dst_type) \ +__kernel void one_hot_##name0##to##name1 \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int suffix_sz, \ + int on_val, \ + int off_val \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \ + \ + copy_type src; \ + src_type val; \ + \ + VXC_ReadImage(val, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src, val, 16); \ + \ + int4 data0, data1; \ + VXC_DP4x4(data0, src, src, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataConvert_0_4x4); \ + VXC_DP4x4(data1, src, src, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataConvert_1_4x4); \ + \ + do \ + { \ + int4 d0 = data0 == coord.zzzz ? on_val : off_val; \ + int4 d1 = data1 == coord.zzzz ? 
on_val : off_val; \ + \ + dst_type dst; \ + VXC_DP2x8(dst, d0, d1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ + \ + VXC_WriteImage2DArray(output, coord.xzyw, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + \ + coord.z ++; \ + } while (coord.z < depth); \ +} +ONE_HOT_SH_IMPL(F16, F16, vxc_ushort8, vxc_half8, vxc_ushort8) +ONE_HOT_SH_IMPL(F16, I16, vxc_ushort8, vxc_half8, vxc_ushort8) +ONE_HOT_SH_IMPL(F16, I8, vxc_ushort8, vxc_half8, vxc_uchar8) +ONE_HOT_SH_IMPL(F16, U8, vxc_ushort8, vxc_half8, vxc_uchar8) +ONE_HOT_SH_IMPL(I16, F16, vxc_short8, vxc_short8, vxc_ushort8) +ONE_HOT_SH_IMPL(I16, I16, vxc_short8, vxc_short8, vxc_ushort8) +ONE_HOT_SH_IMPL(I8, F16, vxc_char8, vxc_char8, vxc_ushort8) +ONE_HOT_SH_IMPL(I8, I8, vxc_char8, vxc_char8, vxc_uchar8) + +#define ONE_HOT_SH_IMPL_2D(name0, name1, src_type, copy_type, dst_type) \ +__kernel void one_hot_##name0##to##name1##_2D \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int suffix_sz, \ + int on_val, \ + int off_val \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(0), get_global_id(0)); \ + \ + copy_type src; \ + src_type val; \ + \ + VXC_ReadImage(val, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src, val, 16); \ + \ + int4 data, data0, data1; \ + VXC_DP4x4(data, src, src, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataConvert_0_4x4); \ + int4 d4 = (int4)(0, 1, 2, 3); \ + \ + do \ + { \ + coord.zw = coord.xx + (int2)(0, 1); \ + dst_type dst; \ + data0 = data.xxxx == d4 ? on_val : off_val; \ + data1 = data.yyyy == d4 ? on_val : off_val; \ + \ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ + \ + VXC_WriteImage(output, coord.yz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord.yw, dst, VXC_MODIFIER(4, 7, 0,VXC_RM_TowardZero, 0)); \ + coord.zw = coord.zw + (int2)(2, 2); \ + \ + data0 = data.zzzz == d4 ? on_val : off_val; \ + data1 = data.wwww == d4 ? 
on_val : off_val; \ + \ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ + \ + VXC_WriteImage(output, coord.yz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord.yw, dst, VXC_MODIFIER(4, 7, 0,VXC_RM_TowardZero, 0)); \ + d4 += 4; \ + coord.y += 4; \ + } while (coord.y < depth); \ +} +ONE_HOT_SH_IMPL_2D(F16, F16, vxc_ushort8, vxc_half8, vxc_ushort8) +ONE_HOT_SH_IMPL_2D(F16, I16, vxc_ushort8, vxc_half8, vxc_ushort8) +ONE_HOT_SH_IMPL_2D(F16, I8, vxc_ushort8, vxc_half8, vxc_uchar8) +ONE_HOT_SH_IMPL_2D(F16, U8, vxc_ushort8, vxc_half8, vxc_uchar8) +ONE_HOT_SH_IMPL_2D(I16, F16, vxc_short8, vxc_short8, vxc_ushort8) +ONE_HOT_SH_IMPL_2D(I16, I16, vxc_short8, vxc_short8, vxc_ushort8) +ONE_HOT_SH_IMPL_2D(I8, F16, vxc_char8, vxc_char8, vxc_ushort8) +ONE_HOT_SH_IMPL_2D(I8, I8, vxc_char8, vxc_char8, vxc_uchar8) + +_viv_uniform float input_scale; +_viv_uniform float input_tail; +#define ONE_HOT_ASYM_SH_IMPL(name0, name1, src_type, copy_type, dst_type) \ +__kernel void one_hot_##name0##to##name1 \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int suffix_sz, \ + int on_val, \ + int off_val \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \ + \ + copy_type src; \ + src_type val; \ + \ + VXC_ReadImage(val, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src, val, 16); \ + \ + int4 data0, data1; \ + float4 v0, v1; \ + VXC_DP4x4(v0, src, src, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataConvert_0_4x4); \ + VXC_DP4x4(v1, src, src, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataConvert_1_4x4); \ + \ + data0 = convert_int4(v0 * input_scale + input_tail); \ + data1 = convert_int4(v1 * input_scale + input_tail); \ + do \ + { \ + int4 d0 = data0 == coord.zzzz ? on_val : off_val; \ + int4 d1 = data1 == coord.zzzz ? on_val : off_val; \ + \ + dst_type dst; \ + VXC_DP2x8(dst, d0, d1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ + \ + VXC_WriteImage2DArray(output, coord.xzyw, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + \ + coord.z ++; \ + } while (coord.z < depth); \ +} +ONE_HOT_ASYM_SH_IMPL(U8, F16, vxc_uchar8, vxc_uchar8, vxc_ushort8) +ONE_HOT_ASYM_SH_IMPL(U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8) + +#define ONE_HOT_ASYM_SH_IMPL_2D(name0, name1, src_type, copy_type, dst_type) \ +__kernel void one_hot_##name0##to##name1##_2D \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int suffix_sz, \ + int on_val, \ + int off_val \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(0), get_global_id(0)); \ + \ + copy_type src; \ + src_type val; \ + \ + VXC_ReadImage(val, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src, val, 16); \ + \ + int4 data, data0, data1; \ + float4 v0; \ + VXC_DP4x4(v0, src, src, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataConvert_0_4x4); \ + int4 d4 = (int4)(0, 1, 2, 3); \ + data = convert_int4(v0 * input_scale + input_tail); \ + \ + do \ + { \ + coord.zw = coord.xx + (int2)(0, 1); \ + dst_type dst; \ + data0 = data.xxxx == d4 ? on_val : off_val; \ + data1 = data.yyyy == d4 ? 
on_val : off_val; \ + \ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ + \ + VXC_WriteImage(output, coord.yz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord.yw, dst, VXC_MODIFIER(4, 7, 0,VXC_RM_TowardZero, 0)); \ + coord.zw = coord.zw + (int2)(2, 2); \ + \ + data0 = data.zzzz == d4 ? on_val : off_val; \ + data1 = data.wwww == d4 ? on_val : off_val; \ + \ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ + \ + VXC_WriteImage(output, coord.yz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord.yw, dst, VXC_MODIFIER(4, 7, 0,VXC_RM_TowardZero, 0)); \ + d4 += 4; \ + coord.y += 4; \ + } while (coord.y < depth); \ +} +ONE_HOT_ASYM_SH_IMPL_2D(U8, F16, vxc_uchar8, vxc_uchar8, vxc_ushort8) +ONE_HOT_ASYM_SH_IMPL_2D(U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8) + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb_copy.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb_copy.vx index 95a43ed..c200019 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb_copy.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_rgb_copy.vx @@ -6,10 +6,16 @@ _viv_uniform int r_order; _viv_uniform int b_order; _viv_uniform VXC_512Bits uniExtractRtoF32_part0_4x4; _viv_uniform VXC_512Bits uniExtractRtoF32_part1_4x4; +_viv_uniform VXC_512Bits uniExtractRtoF32_part2_4x4; +_viv_uniform VXC_512Bits uniExtractRtoF32_part3_4x4; _viv_uniform VXC_512Bits uniExtractGtoF32_part0_4x4; _viv_uniform VXC_512Bits uniExtractGtoF32_part1_4x4; +_viv_uniform VXC_512Bits uniExtractGtoF32_part2_4x4; +_viv_uniform VXC_512Bits uniExtractGtoF32_part3_4x4; _viv_uniform VXC_512Bits uniExtractBtoF32_part0_4x4; _viv_uniform VXC_512Bits uniExtractBtoF32_part1_4x4; +_viv_uniform VXC_512Bits uniExtractBtoF32_part2_4x4; +_viv_uniform VXC_512Bits uniExtractBtoF32_part3_4x4; _viv_uniform VXC_512Bits uniExtract8Data_2x8; #define IMAGE_PRE_PROCESS_COPY_16BITS(dst_name, dst_type, copy_type, convert_type) \ @@ -31,13 +37,14 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \ { \ int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1)); \ \ - coord.xy += (int2) (*xOffset, *yOffset); \ + coord.xy = coord.xy + (int2) (*xOffset * 3 + 16, *yOffset); \ vxc_uchar16 src0, src1; \ dst_type dst0; \ copy_type dst; \ \ - VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(src1, input, coord.xy, VXC_5BITOFFSET_XY(15, 0), \ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(-16, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src1, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ \ f32Var *= outputScale; \ @@ -48,7 +55,7 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \ float4 tmp0, tmp1; \ convert_type result0, result1; \ \ - VXC_DP4x4(tmp0, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part0_4x4); \ + VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part0_4x4); \ VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part1_4x4); \ tmp0 = tmp0 * paramData.w - paramData.x; \ tmp1 = tmp1 * paramData.w - paramData.x; \ @@ -59,7 +66,7 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \ VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ \ coord_out.z = 1; \ - VXC_DP4x4(tmp0, src0, 
src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part0_4x4); \ + VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part0_4x4); \ VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part1_4x4); \ tmp0 = tmp0 * paramData.w - paramData.y; \ tmp1 = tmp1 * paramData.w - paramData.y; \ @@ -70,7 +77,7 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \ VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ \ coord_out.z = b_order; \ - VXC_DP4x4(tmp0, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part0_4x4); \ + VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part0_4x4); \ VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part1_4x4); \ tmp0 = tmp0 * paramData.w - paramData.z; \ tmp1 = tmp1 * paramData.w - paramData.z; \ @@ -101,12 +108,16 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \ ) \ { \ int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1)); \ - coord.xy += (int2) (*xOffset, *yOffset); \ - vxc_uchar16 src0, src1; \ + coord.xy = coord.xy + (int2) (*xOffset * 3 + 16, *yOffset); \ + vxc_uchar16 src0, src1, src2; \ dst_type dst; \ \ - VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ - VXC_ReadImage(src1, input, coord.xy, VXC_5BITOFFSET_XY(15, 0), \ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(-16, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src1, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord.x += 16; \ + VXC_ReadImage(src2, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ \ f32Var *= outputScale; \ @@ -117,35 +128,55 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \ float4 tmp0, tmp1; \ int4 result0, result1; \ \ - VXC_DP4x4(tmp0, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part0_4x4); \ + VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part0_4x4); \ VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part1_4x4); \ tmp0 = tmp0 * paramData.w - paramData.x; \ tmp1 = tmp1 * paramData.w - paramData.x; \ result0 = convert_int4_rte(tmp0); \ result1 = convert_int4_rte(tmp1); \ VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ - \ + VXC_DP4x4(tmp0, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part2_4x4); \ + VXC_DP4x4(tmp1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part3_4x4); \ + tmp0 = tmp0 * paramData.w - paramData.x; \ + tmp1 = tmp1 * paramData.w - paramData.x; \ + result0 = convert_int4_rte(tmp0); \ + result1 = convert_int4_rte(tmp1); \ + VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ \ coord_out.z = 1; \ - VXC_DP4x4(tmp0, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part0_4x4); \ + VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part0_4x4); \ VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniExtractGtoF32_part1_4x4); \ tmp0 = tmp0 * paramData.w - paramData.y; \ tmp1 = tmp1 * paramData.w - paramData.y; \ result0 = convert_int4_rte(tmp0); \ result1 = convert_int4_rte(tmp1); \ VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(tmp0, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part2_4x4); \ + VXC_DP4x4(tmp1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part3_4x4); \ + tmp0 = tmp0 * paramData.w - paramData.y; \ + tmp1 = tmp1 * paramData.w - paramData.y; \ + result0 = convert_int4_rte(tmp0); \ + result1 = convert_int4_rte(tmp1); \ + VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ \ coord_out.z = b_order; \ - VXC_DP4x4(tmp0, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part0_4x4); \ + VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part0_4x4); \ VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part1_4x4); \ tmp0 = tmp0 * paramData.w - paramData.z; \ tmp1 = tmp1 * paramData.w - paramData.z; \ result0 = convert_int4_rte(tmp0); \ result1 = convert_int4_rte(tmp1); \ VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(tmp0, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part2_4x4); \ + VXC_DP4x4(tmp1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part3_4x4); \ + tmp0 = tmp0 * paramData.w - paramData.z; \ + tmp1 = tmp1 * paramData.w - paramData.z; \ + result0 = convert_int4_rte(tmp0); \ + result1 = convert_int4_rte(tmp1); \ + VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ } IMAGE_PRE_PROCESS_COPY_8BITS(U8, vxc_uchar16) IMAGE_PRE_PROCESS_COPY_8BITS(I8, vxc_char16) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_copy_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_copy_u8.vx index 951ee96..bce976c 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_copy_u8.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_copy_u8.vx @@ -53,7 +53,7 @@ __kernel void pre_process_yuv420_copy_U8toU8( ) { int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0); - int4 pos1 = (int4)((get_global_id(0) + (*xOffset)) >> 1, (get_global_id(1) + (*yOffset)) >> 1, 0, 0); + int2 pos1 = (int2)((get_global_id(0) + (*xOffset)) >> 1, (get_global_id(1) + (*yOffset)) >> 1); vxc_uchar16 Y; vxc_uchar8 U, V; vxc_int4 C0, C1, C2, C3; @@ -132,3 +132,109 @@ __kernel void pre_process_yuv420_copy_U8toU8( VXC_WriteImage2DArray(output, pos, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); } +__kernel void pre_process_yuv420_copy_U8toF16( + __read_only image2d_t y_img, + __read_only image2d_t u_img, + __read_only image2d_t v_img, + __write_only image2d_array_t output, + global int * xRatio, + global int * yRatio, + global int * xOffset, + global int * yOffset, + float rMean, + 
float gMean, + float bMean, + float var, + int reverse_channel, + int trans + ) +{ + int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0); + int2 pos1 = (int2)((get_global_id(0) + (*xOffset)) >> 1, (get_global_id(1) + (*yOffset)) >> 1); + vxc_uchar16 Y; + vxc_uchar8 U, V; + vxc_int4 C0, C1, C2, C3; + vxc_uchar16 R, G, B; + vxc_half8 dst0, dst1, dst2, dst3, dst4, dst5; + vxc_short8 out0, out1, out2, out3, out4, out5; + + VXC_ReadImage(Y, y_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(U, u_img, pos1.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, pos1.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + //C = Y - 16; + //D = U - 128; + //E = V - 128; + // calculate R + // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8] + int tmpV = -56992; + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR1st_4x4); + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR2nd_4x4); + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR3rd_4x4); + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR4th_4x4); + + VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + + // calculate G + // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8] + // 298Y - 208V + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG1st_4x4); + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG2nd_4x4); + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG3rd_4x4); + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG4th_4x4); + // 34784 - 100U + ushort tmpG = 34784; + vxc_ushort8 tmpDstG; + VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); + VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4); + VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4); + VXC_DP4x4(G, C2, tmpDstG, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateG3rd_4x4); + VXC_DP4x4(G, C3, tmpDstG, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateG4th_4x4); + + // calculate B + // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8] + VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB1st_4x4); + VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB2nd_4x4); + VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB3rd_4x4); + VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB4th_4x4); + tmpV = -70688; + VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + VXC_DP4x4(B, C3, tmpV, 
VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + + float4 paramData = (float4)(bMean * var, gMean * var,\ + rMean * var, var); + half4 paramData_f16; + _viv_asm(CONV, paramData_f16, paramData); + + VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8); + VXC_DP2x8(dst1, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8); + + VXC_DP2x8(dst2, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8); + VXC_DP2x8(dst3, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8); + + VXC_DP2x8(dst4, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8); + VXC_DP2x8(dst5, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8); + + _viv_asm(COPY, out0, dst0, 16); + _viv_asm(COPY, out1, dst1, 16); + _viv_asm(COPY, out2, dst2, 16); + _viv_asm(COPY, out3, dst3, 16); + _viv_asm(COPY, out4, dst4, 16); + _viv_asm(COPY, out5, dst5, 16); + + pos = (int4)(get_global_id(0), get_global_id(1), bOrder, get_global_id(0) + 8); + VXC_WriteImage2DArray(output, pos.xyzz, out0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage2DArray(output, pos.wyzz, out1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + pos.z = 1; + VXC_WriteImage2DArray(output, pos.xyzz, out2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage2DArray(output, pos.wyzz, out3, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + pos.z = rOrder; + VXC_WriteImage2DArray(output, pos.xyzz, out4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage2DArray(output, pos.wyzz, out5, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_copy_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_copy_u8.vx index 20803c9..05f9973 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_copy_u8.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv444_copy_u8.vx @@ -51,7 +51,7 @@ __kernel void pre_process_yuv444_copy_U8toU8( int trans ) { - int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0); + int2 pos = (int2)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset)); vxc_uchar16 Y, U, V; vxc_int4 C0, C1, C2, C3; vxc_uchar16 R, G, B; @@ -122,11 +122,116 @@ __kernel void pre_process_yuv444_copy_U8toU8( VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8); VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8); - pos = (int4)(get_global_id(0), get_global_id(1), 0, 0); - pos.z = bOrder; - VXC_WriteImage2DArray(output, pos, dst0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - pos.z = 1; - VXC_WriteImage2DArray(output, pos, dst1, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - pos.z = rOrder; - VXC_WriteImage2DArray(output, pos, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + int4 pos1 = (int4)(get_global_id(0), get_global_id(1), bOrder, 0); + VXC_WriteImage2DArray(output, pos1, dst0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + pos1.z = 1; + VXC_WriteImage2DArray(output, pos1, dst1, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + pos1.z = rOrder; + VXC_WriteImage2DArray(output, pos1, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void pre_process_yuv444_copy_U8toF16( + __read_only image2d_t y_img, + __read_only 
image2d_t u_img, + __read_only image2d_t v_img, + __write_only image2d_array_t output, + global int * xRatio, + global int * yRatio, + global int * xOffset, + global int * yOffset, + float rMean, + float gMean, + float bMean, + float var, + int reverse_channel, + int trans + ) +{ + int2 pos = (int2)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset)); + vxc_uchar16 Y, U, V; + vxc_int4 C0, C1, C2, C3; + vxc_uchar16 R, G, B; + vxc_half8 dst0, dst1, dst2, dst3, dst4, dst5; + vxc_short8 out0, out1, out2, out3, out4, out5; + + VXC_ReadImage(Y, y_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(U, u_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(V, v_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + //C = Y - 16; + //D = U - 128; + //E = V - 128; + // calculate R + // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8] + int tmpV = -56992; + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR1st_4x4); + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR2nd_4x4); + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR3rd_4x4); + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR4th_4x4); + + VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); + + // calculate G + // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8] + // 298Y - 208V + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG1st_4x4); + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG2nd_4x4); + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG3rd_4x4); + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG4th_4x4); + // 34784 - 100U + ushort tmpG = 34784; + vxc_ushort8 tmpDstG0, tmpDstG1; + VXC_DP2x8(tmpDstG0, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); + VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2_2x8); + + VXC_DP4x4(G, C0, tmpDstG0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4); + VXC_DP4x4(G, C1, tmpDstG0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4); + VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4); + VXC_DP4x4(G, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4); + + // calculate B + // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8] + VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB1st_4x4); + VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB2nd_4x4); + VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB3rd_4x4); + VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB4th_4x4); + tmpV = -70688; + VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + VXC_DP4x4(B, C1, 
tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); + + float4 paramData = (float4)(bMean * var, gMean * var,\ + rMean * var, var); + half4 paramData_f16; + _viv_asm(CONV, paramData_f16, paramData); + + VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8); + VXC_DP2x8(dst1, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8); + + VXC_DP2x8(dst2, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8); + VXC_DP2x8(dst3, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8); + + VXC_DP2x8(dst4, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8); + VXC_DP2x8(dst5, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8); + + _viv_asm(COPY, out0, dst0, 16); + _viv_asm(COPY, out1, dst1, 16); + _viv_asm(COPY, out2, dst2, 16); + _viv_asm(COPY, out3, dst3, 16); + _viv_asm(COPY, out4, dst4, 16); + _viv_asm(COPY, out5, dst5, 16); + + int4 pos1 = (int4)(get_global_id(0), get_global_id(1), bOrder, get_global_id(0) + 8); + VXC_WriteImage2DArray(output, pos1.xyzz, out0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage2DArray(output, pos1.wyzz, out1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + pos1.z = 1; + VXC_WriteImage2DArray(output, pos1.xyzz, out2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage2DArray(output, pos1.wyzz, out3, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + pos1.z = rOrder; + VXC_WriteImage2DArray(output, pos1.xyzz, out4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage2DArray(output, pos1.wyzz, out5, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); } diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/repeat.vx b/src/tim/vx/internal/src/libnnext/ops/vx/repeat.vx new file mode 100644 index 0000000..5898ea4 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/repeat.vx @@ -0,0 +1,224 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniIntegralHorAcc_4x4; +_viv_uniform VXC_512Bits uniExtract1to8Short_2x8; +_viv_uniform int width; + +// workgroup size is 32 +__kernel void preprocess_start_idx(image2d_t input, image2d_t output) +{ + int lidx = get_local_id(0); + __local int lcl_sum[32]; + __local int last_round[1]; + Image img = create_image_from_image2d(input, 4); + Image dst = create_image_from_image2d(output, 4); + __global int* index_ptr = (__global int*)img.ptr + get_global_id(0); + __global int* output_org = (__global int*)dst.ptr; + __global int* output_ptr = output_org + get_global_id(0) + 1; + + if (lidx == 0) + { + last_round[0] = 0; + output_org[0] = 0; + } + int4 accSum0, accSum1, accSum2, accSum3; + + for(int i = 0; i < width; i += 512) + { + int4 data0 = vload4(0, index_ptr + i); + int4 data1 = vload4(1, index_ptr + i); + int4 data2 = vload4(2, index_ptr + i); + int4 data3 = vload4(3, index_ptr + i); + barrier(CLK_LOCAL_MEM_FENCE); + int prevSum = last_round[0]; + + VXC_DP4x4(accSum0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniIntegralHorAcc_4x4); + VXC_DP4x4(accSum1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniIntegralHorAcc_4x4); + VXC_DP4x4(accSum2, data2, data2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniIntegralHorAcc_4x4); + 
VXC_DP4x4(accSum3, data3, data3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniIntegralHorAcc_4x4); + accSum1 += accSum0.w; + accSum2 += accSum1.w; + accSum3 += accSum2.w; + + lcl_sum[lidx] = accSum3.w; + barrier(CLK_LOCAL_MEM_FENCE); + + for(int j = 0; j < lidx; j++) + { + prevSum += lcl_sum[j]; + } + accSum0 += prevSum; + accSum1 += prevSum; + accSum2 += prevSum; + accSum3 += prevSum; + if(lidx == 31) + { + last_round[0] = accSum3.w; + } + vstore4(accSum0, 0, output_ptr + i); + vstore4(accSum1, 1, output_ptr + i); + vstore4(accSum2, 2, output_ptr + i); + vstore4(accSum3, 3, output_ptr + i); + } +} + +__kernel void repeat_I16_axis0( + image2d_array_t input0, image2d_t input1, image2d_t input2, + image2d_array_t output, int axis) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + vxc_short8 src0; + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + Image img1 = create_image_from_image2d(input1, 4); + Image img2 = create_image_from_image2d(input2, 4); + __global int* len_ptr = (__global int*)img1.ptr; + __global int* index_ptr = (__global int*)img2.ptr; + + int len = len_ptr[get_global_id(1)]; + int start = index_ptr[get_global_id(1)]; + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord.z, baseAddr); + int end = len + start; + + for(coord.y = start; coord.y < end; coord.y++) + { + VXC_OP4_NoDest(img_store_3d, output, coord, src0, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + } +} + +__kernel void repeat_I16_axis2( + image2d_array_t input0, image2d_t input1, image2d_t input2, + image2d_array_t output, int axis) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + vxc_short8 src0; + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + Image img1 = create_image_from_image2d(input1, 4); + Image img2 = create_image_from_image2d(input2, 4); + __global int* len_ptr = (__global int*)img1.ptr; + __global int* index_ptr = (__global int*)img2.ptr; + + int len = len_ptr[get_global_id(2)]; + int start = index_ptr[get_global_id(2)]; + int end = len + start; + + for(coord.z = start; coord.z < end; coord.z++) + { + VXC_WriteImage2DArray(output, coord, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} + +#define REPEAT_1D(src0_type_name, data_type) \ +__kernel void repeat_##src0_type_name##_1D( \ + image2d_t input0, image2d_t input1, image2d_t input2, \ + image2d_t output, int axis) \ +{ \ + int2 coord = (int2)(get_global_id(0), 0); \ + data_type src0; \ + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + Image img1 = create_image_from_image2d(input1, 4); \ + Image img2 = create_image_from_image2d(input2, 4); \ + __global int* len_ptr = (__global int*)img1.ptr; \ + __global int* index_ptr = (__global int*)img2.ptr; \ + int len = len_ptr[get_global_id(0)]; \ + int start = index_ptr[get_global_id(0)]; \ + \ + int iter = len >> 3; \ + int res = len & 7; \ + int end = start + iter * 8; \ + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8); \ + for(coord.x = start; coord.x < end; coord.x+=8) \ + { \ + VXC_WriteImage(output, coord, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ + \ + if(res == 7) \ + { \ + VXC_WriteImage(output, coord, src0, VXC_MODIFIER(0, 
6, 0, VXC_RM_TowardZero, 0)); \ + } \ + else if(res == 6) \ + { \ + VXC_WriteImage(output, coord, src0, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \ + } \ + else if(res == 5) \ + { \ + VXC_WriteImage(output, coord, src0, VXC_MODIFIER(0, 4, 0, VXC_RM_TowardZero, 0)); \ + } \ + else if(res == 4) \ + { \ + VXC_WriteImage(output, coord, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + } \ + else if(res == 3) \ + { \ + VXC_WriteImage(output, coord, src0, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0)); \ + } \ + else if(res == 2) \ + { \ + VXC_WriteImage(output, coord, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + } \ + else if(res == 1) \ + { \ + VXC_WriteImage(output, coord, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +REPEAT_1D(U8, vxc_uchar16) +REPEAT_1D(I16, vxc_short8) + +__kernel void repeat_U8_axis0( + image2d_array_t input0, image2d_t input1, image2d_t input2, + image2d_array_t output, int axis) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + vxc_uchar16 src0; + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + Image img1 = create_image_from_image2d(input1, 4); + Image img2 = create_image_from_image2d(input2, 4); + __global int* len_ptr = (__global int*)img1.ptr; + __global int* index_ptr = (__global int*)img2.ptr; + + int len = len_ptr[get_global_id(1)]; + int start = index_ptr[get_global_id(1)]; + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord.z, baseAddr); + int end = len + start; + + for(coord.y = start; coord.y < end; coord.y++) + { + VXC_OP4_NoDest(img_store_3d, output, coord, src0, \ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + } +} + +__kernel void repeat_U8_axis2( + image2d_array_t input0, image2d_t input1, image2d_t input2, + image2d_array_t output, int axis) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + vxc_uchar16 src0; + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + Image img1 = create_image_from_image2d(input1, 4); + Image img2 = create_image_from_image2d(input2, 4); + __global int* len_ptr = (__global int*)img1.ptr; + __global int* index_ptr = (__global int*)img2.ptr; + + int len = len_ptr[get_global_id(2)]; + int start = index_ptr[get_global_id(2)]; + int end = len + start; + + for(coord.z = start; coord.z < end; coord.z++) + { + VXC_WriteImage2DArray(output, coord, src0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + } +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/repeat_axis1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/repeat_axis1.vx new file mode 100644 index 0000000..d22a292 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/repeat_axis1.vx @@ -0,0 +1,232 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniExtract1to8Short_2x8; + +#define REPEAT_RES(end_pos) \ +coord.y = gidy; \ +VXC_OP4_NoDest(img_store_3d, output, coord, src0, VXC_MODIFIER(0, end_pos, 0,VXC_RM_TowardZero, 0)); \ +coord.y++; \ +VXC_OP4_NoDest(img_store_3d, output, coord, src1, VXC_MODIFIER(0, end_pos, 0,VXC_RM_TowardZero, 0)); \ +coord.y++; \ +VXC_OP4_NoDest(img_store_3d, output, coord, src2, VXC_MODIFIER(0, end_pos, 0,VXC_RM_TowardZero, 0)); \ +coord.y++; \ +VXC_OP4_NoDest(img_store_3d, output, coord, src3, VXC_MODIFIER(0, end_pos, 0,VXC_RM_TowardZero, 0)); \ 
+coord.y++; \ +VXC_OP4_NoDest(img_store_3d, output, coord, src4, VXC_MODIFIER(0, end_pos, 0,VXC_RM_TowardZero, 0)); \ +coord.y++; \ +VXC_OP4_NoDest(img_store_3d, output, coord, src5, VXC_MODIFIER(0, end_pos, 0,VXC_RM_TowardZero, 0)); \ +coord.y++; \ +VXC_OP4_NoDest(img_store_3d, output, coord, src6, VXC_MODIFIER(0, end_pos, 0,VXC_RM_TowardZero, 0)); \ +coord.y++; \ +VXC_OP4_NoDest(img_store_3d, output, coord, src7, VXC_MODIFIER(0, end_pos, 0,VXC_RM_TowardZero, 0)); + +__kernel void repeat_I16_axis1( + image2d_array_t input0, image2d_t input1, image2d_t input2, + image2d_array_t output, int axis) +{ + int gidy = get_global_id(1); + int4 coord = (int4)(get_global_id(0), gidy, get_global_id(2), 0); + vxc_short8 src0, src1, src2, src3, src4, src5, src6, src7; + + int8 input_desc, output_desc; + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; + + VXC_OP4(img_load_3d, src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input0, coord, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src2, input0, coord, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src3, input0, coord, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src4, input0, coord, VXC_5BITOFFSET_XY(0, 4), + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src5, input0, coord, VXC_5BITOFFSET_XY(0, 5), + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src6, input0, coord, VXC_5BITOFFSET_XY(0, 6), + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src7, input0, coord, VXC_5BITOFFSET_XY(0, 7), + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + + Image img1 = create_image_from_image2d(input1, 4); + Image img2 = create_image_from_image2d(input2, 4); + __global int* len_ptr = (__global int*)img1.ptr; + __global int* index_ptr = (__global int*)img2.ptr; + + int len = len_ptr[get_global_id(0)]; + int start = index_ptr[get_global_id(0)]; + + _viv_asm(MOV, coord.z, baseAddr); + int iter = len >> 3; + int res = len & 7; + coord.x = start; + + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8); + VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8); + VXC_DP2x8(src2, src2, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8); + VXC_DP2x8(src3, src3, src3, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8); + VXC_DP2x8(src4, src4, src4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8); + VXC_DP2x8(src5, src5, src5, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8); + VXC_DP2x8(src6, src6, src6, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8); + VXC_DP2x8(src7, src7, src7, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8); + + for(int i = 0; i < iter; i++) + { + coord.y = gidy; + VXC_OP4_NoDest(img_store_3d, output, coord, src0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord.y++; + VXC_OP4_NoDest(img_store_3d, output, coord, src1, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord.y++; + 
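+        // remaining stores of the unrolled 8-row copy; each srcN is one input element broadcast across 8 output columns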
VXC_OP4_NoDest(img_store_3d, output, coord, src2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord.y++; + VXC_OP4_NoDest(img_store_3d, output, coord, src3, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord.y++; + VXC_OP4_NoDest(img_store_3d, output, coord, src4, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord.y++; + VXC_OP4_NoDest(img_store_3d, output, coord, src5, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord.y++; + VXC_OP4_NoDest(img_store_3d, output, coord, src6, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord.y++; + VXC_OP4_NoDest(img_store_3d, output, coord, src7, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord.x += 8; + } + + if(res == 7) + { + REPEAT_RES(6) + } + else if(res == 6) + { + REPEAT_RES(5) + } + else if(res == 5) + { + REPEAT_RES(4) + } + else if(res == 4) + { + REPEAT_RES(3) + } + else if(res == 3) + { + REPEAT_RES(2) + } + else if(res == 2) + { + REPEAT_RES(1) + } + else if(res == 1) + { + REPEAT_RES(0) + } +} + +__kernel void repeat_U8_axis1( + image2d_array_t input0, image2d_t input1, image2d_t input2, + image2d_array_t output, int axis) +{ + int gidy = get_global_id(1); + int4 coord = (int4)(get_global_id(0), gidy, get_global_id(2), 0); + vxc_uchar16 src0, src1, src2, src3, src4, src5, src6, src7; + + int8 input_desc, output_desc; + _viv_asm(COPY, input_desc, input0, sizeof(input_desc)); + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.z, baseAddr_a); + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0; + + VXC_OP4(img_load_3d, src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input0, coord, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src2, input0, coord, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src3, input0, coord, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src4, input0, coord, VXC_5BITOFFSET_XY(0, 4), + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src5, input0, coord, VXC_5BITOFFSET_XY(0, 5), + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src6, input0, coord, VXC_5BITOFFSET_XY(0, 6), + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src7, input0, coord, VXC_5BITOFFSET_XY(0, 7), + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + + Image img1 = create_image_from_image2d(input1, 4); + Image img2 = create_image_from_image2d(input2, 4); + __global int* len_ptr = (__global int*)img1.ptr; + __global int* index_ptr = (__global int*)img2.ptr; + + int len = len_ptr[get_global_id(0)]; + int start = index_ptr[get_global_id(0)]; + + _viv_asm(MOV, coord.z, baseAddr); + int iter = len >> 3; + int res = len & 7; + coord.x = start; + + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8); + VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8); + VXC_DP2x8(src2, src2, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8); + VXC_DP2x8(src3, src3, src3, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8); + VXC_DP2x8(src4, src4, src4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8); + VXC_DP2x8(src5, src5, src5, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8); + 
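+    // broadcast the remaining scalar U8 reads to 8 lanes, mirroring the I16 variant above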
VXC_DP2x8(src6, src6, src6, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8); + VXC_DP2x8(src7, src7, src7, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8); + + for(int i = 0; i < iter; i++) + { + coord.y = gidy; + VXC_OP4_NoDest(img_store_3d, output, coord, src0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord.y++; + VXC_OP4_NoDest(img_store_3d, output, coord, src1, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord.y++; + VXC_OP4_NoDest(img_store_3d, output, coord, src2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord.y++; + VXC_OP4_NoDest(img_store_3d, output, coord, src3, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord.y++; + VXC_OP4_NoDest(img_store_3d, output, coord, src4, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord.y++; + VXC_OP4_NoDest(img_store_3d, output, coord, src5, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord.y++; + VXC_OP4_NoDest(img_store_3d, output, coord, src6, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord.y++; + VXC_OP4_NoDest(img_store_3d, output, coord, src7, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord.x += 8; + } + + if(res == 7) + { + REPEAT_RES(6) + } + else if(res == 6) + { + REPEAT_RES(5) + } + else if(res == 5) + { + REPEAT_RES(4) + } + else if(res == 4) + { + REPEAT_RES(3) + } + else if(res == 3) + { + REPEAT_RES(2) + } + else if(res == 2) + { + REPEAT_RES(1) + } + else if(res == 1) + { + REPEAT_RES(0) + } +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_UP_2X.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_UP_2X.vx deleted file mode 100644 index 25f9350..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_UP_2X.vx +++ /dev/null @@ -1,65 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits uniResize2xUp_4x8; -_viv_uniform VXC_512Bits uniResize2xUpRound_2x8; -_viv_uniform int out_height; - -__kernel void resize_bilinear_U8toU8_UP_2X_half - ( - __read_only image2d_array_t input, - __write_only image2d_array_t output, - int align_corners, - int half_pixel_centers - ) -{ - int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0); - int4 coord_in = (int4)(get_global_id(0), -1, get_global_id(1), 0); - coord_in.x = (coord_out.x * 2 - 1) >> 2; - coord_in.x = coord_out.x == 0 ? 
-1 : coord_in.x; - - vxc_uchar16 in0, in1, tmp, result; - vxc_ushort8 result_s, round_s = 8; - - int8 input_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord_in.w, baseAddr); - VXC_OP4(img_load_3d, in0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, in1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - int8 output_desc; - _viv_asm(COPY, output_desc, output, sizeof(output_desc)); - baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord_out.w, baseAddr); - - while (coord_out.y < out_height) - { - VXC_DP4x8(result_s, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniResize2xUp_4x8); - VXC_DP2x8(result, result_s, round_s, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniResize2xUpRound_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result, - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - coord_out.y++; - VXC_OP4(img_load_3d, in0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 2), - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_DP4x8(result_s, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniResize2xUp_4x8); - VXC_DP2x8(result, result_s, round_s, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniResize2xUpRound_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result, - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - coord_out.y++; - VXC_DP4x8(result_s, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniResize2xUp_4x8); - VXC_DP2x8(result, result_s, round_s, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniResize2xUpRound_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result, - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, in1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 3), - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - coord_out.y++; - VXC_DP4x8(result_s, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniResize2xUp_4x8); - VXC_DP2x8(result, result_s, round_s, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniResize2xUpRound_2x8); - VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result, - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); - coord_in.y += 2; - coord_out.y++; - } -} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_half_pixel_centers.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_half_pixel_centers.vx new file mode 100644 index 0000000..1c1071d --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8_half_pixel_centers.vx @@ -0,0 +1,229 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniResize2xUp_0_4x8; +_viv_uniform VXC_512Bits uniResize2xUp_1_4x8; +_viv_uniform int out_height; + +__kernel void resize_bilinear_U8toU8_SAME_2x_upsample_half_pixel_centers + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0); + int4 coord_in = (int4)(get_global_id(0), -1, get_global_id(1), 0); + coord_in.x = (coord_out.x * 2 - 1) >> 2; + coord_in.x = coord_out.x == 0 ? 
-1 : coord_in.x; + + vxc_uchar16 in0, in1, tmp, result; + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + VXC_OP4(img_load_3d, in0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, in1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + while (coord_out.y < out_height) + { + VXC_DP4x8(result, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8); + VXC_DP4x8(result, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4(img_load_3d, in0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_DP4x8(result, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8); + VXC_DP4x8(result, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_DP4x8(result, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8); + VXC_DP4x8(result, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, in1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_DP4x8(result, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8); + VXC_DP4x8(result, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_in.y += 2; + coord_out.y++; + } +} + +_viv_uniform VXC_512Bits uniResize4xUp_l00_4x8; +_viv_uniform VXC_512Bits uniResize4xUp_l01_4x8; +_viv_uniform VXC_512Bits uniResize4xUp_l10_4x8; +_viv_uniform VXC_512Bits uniResize4xUp_l11_4x8; +__kernel void resize_bilinear_U8toU8_SAME_4x_upsample_half_pixel_centers + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0); + int4 coord_in = (int4)(get_global_id(0), -1, get_global_id(1), 0); + coord_in.x = (coord_out.x * 2 - 3) >> 3; + coord_in.x = coord_out.x == 0 ? 
-1 : coord_in.x; + + vxc_uchar16 in0, in1, tmp, dst0, dst1; + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + VXC_OP4(img_load_3d, in0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, in1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + while (coord_out.y < out_height) + { + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8); + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8); + VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8); + VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8); + VXC_OP4(img_load_3d, in0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst1, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8); + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8); + VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8); + VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst1, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8); + VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8); + VXC_DP4x8(dst1, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8); + VXC_DP4x8(dst1, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8); + VXC_OP4(img_load_3d, in1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst1, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8); + VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8); + VXC_DP4x8(dst1, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8); + VXC_DP4x8(dst1, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0, + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst1, + VXC_MODIFIER(0, 15, 
0,VXC_RM_TowardZero, 0)); + coord_in.y += 2; + coord_out.y++; + } +} + +_viv_uniform VXC_512Bits uniResize3xUp_l00_2x8; +_viv_uniform VXC_512Bits uniResize3xUp_l01_2x8; +_viv_uniform VXC_512Bits uniResize3xUp_l10_4x4; +_viv_uniform VXC_512Bits uniResize3xUp_l11_4x4; +_viv_uniform VXC_512Bits uniResize3xUp_l12_4x4; +_viv_uniform VXC_512Bits uniResize3xUp_l13_4x4; +__kernel void resize_bilinear_U8toU8_SAME_3x_upsample_half_pixel_centers + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + coord_in.x = (short)(coord_out.x * 2 - 1) / (short)6; + coord_in.x = coord_out.x == 0 ? -1 : coord_in.x; + coord_in.y = (short)(coord_out.y * 2 - 1) / (short)6; + coord_in.y = coord_out.y == 0 ? -1 : coord_in.y; + + vxc_uchar16 in0, in1, in2, in3, tmp, dst0, dst1, dst2; + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + VXC_OP4(img_load_3d, in0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, in1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, in2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, in3, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + VXC_DP4x4(dst2, in1, in0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4); + VXC_DP4x4(dst2, in1, in0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4); + VXC_DP4x4(dst2, in1, in0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4); + VXC_DP4x4(dst2, in1, in0, VXC_MODIFIER(12, 14, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst2, + VXC_MODIFIER(0, 14, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + + VXC_DP2x8(dst0, in1, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l00_2x8); + VXC_DP2x8(dst0, in1, in1, VXC_MODIFIER(8, 14, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l01_2x8); + VXC_DP4x4(dst1, in1, in2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4); + VXC_DP4x4(dst1, in1, in2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4); + VXC_DP4x4(dst1, in1, in2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4); + VXC_DP4x4(dst1, in1, in2, VXC_MODIFIER(12, 14, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4); + VXC_DP4x4(dst2, in2, in1, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4); + VXC_DP4x4(dst2, in2, in1, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4); + VXC_DP4x4(dst2, in2, in1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4); + VXC_DP4x4(dst2, in2, in1, VXC_MODIFIER(12, 14, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0, + VXC_MODIFIER(0, 14, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + 
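+    // store the two rows interpolated between in1 and in2 before computing the in2-based rows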
VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst1, + VXC_MODIFIER(0, 14, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst2, + VXC_MODIFIER(0, 14, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + + VXC_DP2x8(dst0, in2, in2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l00_2x8); + VXC_DP2x8(dst0, in2, in2, VXC_MODIFIER(8, 14, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l01_2x8); + VXC_DP4x4(dst1, in2, in3, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4); + VXC_DP4x4(dst1, in2, in3, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4); + VXC_DP4x4(dst1, in2, in3, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4); + VXC_DP4x4(dst1, in2, in3, VXC_MODIFIER(12, 14, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0, + VXC_MODIFIER(0, 14, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst1, + VXC_MODIFIER(0, 14, 0,VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/sequence_mask.vx b/src/tim/vx/internal/src/libnnext/ops/vx/sequence_mask.vx new file mode 100644 index 0000000..3193485 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/sequence_mask.vx @@ -0,0 +1,150 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; +_viv_uniform float input_scale; +_viv_uniform int inputZP; +_viv_uniform int output_ZP; +_viv_uniform float outputVal1; + +#define SEQUENCE_MASK_QINT_TO_QINT_2D(src0_type_name, src1_type_name, read_type, write_type) \ +__kernel void sequence_mask_##src0_type_name##to##src1_type_name##_2D( \ + image2d_t input, image2d_t output, int maxLen) \ +{ \ + int gidx = get_global_id(0); \ + int2 coord = (int2)(gidx, get_global_id(1)); \ + read_type src0; \ + VXC_ReadImage(src0, input, coord.yy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + int4 outIdx = (int4)(gidx, gidx + 1, gidx + 2, gidx + 3); \ + float4 tmpData; \ + short zp = inputZP; \ + VXC_DP4x4(tmpData, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvert1stUint8SubZpToFp32_4x4); \ + int index = convert_int_rte(tmpData.s0 * input_scale); \ + int4 data; \ + data = outIdx < index? 
convert_int_rte(outputVal1) : output_ZP; \ + write_type dst; \ + VXC_DP2x8(dst, data, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +SEQUENCE_MASK_QINT_TO_QINT_2D(U8, U8, vxc_uchar16, vxc_uchar16) +SEQUENCE_MASK_QINT_TO_QINT_2D(I8, I8, vxc_char16, vxc_char16) +SEQUENCE_MASK_QINT_TO_QINT_2D(I16, I16, vxc_short8, vxc_short8) +SEQUENCE_MASK_QINT_TO_QINT_2D(I8, U8, vxc_char16, vxc_uchar16) +SEQUENCE_MASK_QINT_TO_QINT_2D(I16, U8, vxc_short8, vxc_uchar16) + +#define SEQUENCE_MASK_QINT_TO_QINT(src0_type_name, src1_type_name, read_type, write_type) \ +__kernel void sequence_mask_##src0_type_name##to##src1_type_name( \ + image2d_t input, image2d_array_t output, int maxLen) \ +{ \ + int gidx = get_global_id(0); \ + int4 coord = (int4)(gidx, get_global_id(1), get_global_id(2), 0); \ + read_type src0; \ + VXC_ReadImage(src0, input, coord.yz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + int4 outIdx = (int4)(gidx, gidx + 1, gidx + 2, gidx + 3); \ + float4 tmpData; \ + short zp = inputZP; \ + VXC_DP4x4(tmpData, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniConvert1stUint8SubZpToFp32_4x4); \ + int index = convert_int_rte(tmpData.s0 * input_scale); \ + int4 data; \ + data = outIdx < index? convert_int_rte(outputVal1) : output_ZP; \ + write_type dst; \ + VXC_DP2x8(dst, data, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \ + VXC_WriteImage2DArray(output, coord, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +SEQUENCE_MASK_QINT_TO_QINT(U8, U8, vxc_uchar16, vxc_uchar16) +SEQUENCE_MASK_QINT_TO_QINT(I8, I8, vxc_char16, vxc_char16) +SEQUENCE_MASK_QINT_TO_QINT(I16, I16, vxc_short8, vxc_short8) +SEQUENCE_MASK_QINT_TO_QINT(I16, U8, vxc_short8, vxc_uchar16) +SEQUENCE_MASK_QINT_TO_QINT(I8, U8, vxc_char16, vxc_uchar16) + +__kernel void sequence_mask_F16toF16_2D( + image2d_t input, image2d_t output, int maxLen) +{ + int gidx = get_global_id(0); + int2 coord = (int2)(gidx, get_global_id(1)); + vxc_short8 src0; + vxc_half8 in_h; + VXC_ReadImage(src0, input, coord.yy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + int4 outIdx = (int4)(gidx, gidx + 1, gidx + 2, gidx + 3); + _viv_asm(COPY, in_h, src0, 16); + float4 tmpData; + VXC_DP4x4(tmpData, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + int index = convert_int_rte(tmpData.x); + float4 data; + data = outIdx < index? outputVal1 : convert_float(output_ZP); + vxc_short8 dst; + half4 tmpVal; + _viv_asm(CONV, tmpVal, data); + _viv_asm(COPY, dst, tmpVal, 16); + VXC_WriteImage(output, coord, dst.s0246, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void sequence_mask_F16toF16( + image2d_t input, image2d_t output, int maxLen) +{ + int gidx = get_global_id(0); + int4 coord = (int4)(gidx, get_global_id(1), get_global_id(2), 0); + vxc_short8 src0; + vxc_half8 in_h; + VXC_ReadImage(src0, input, coord.yz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + int4 outIdx = (int4)(gidx, gidx + 1, gidx + 2, gidx + 3); + _viv_asm(COPY, in_h, src0, 16); + float4 tmpData; + VXC_DP4x4(tmpData, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + int index = convert_int_rte(tmpData.x); + float4 data; + data = outIdx < index? 
outputVal1 : convert_float(output_ZP); + vxc_short8 dst; + half4 tmpVal; + _viv_asm(CONV, tmpVal, data); + _viv_asm(COPY, dst, tmpVal, 16); + VXC_WriteImage2DArray(output, coord, dst.s0246, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void sequence_mask_F16toU8_2D( + image2d_t input, image2d_t output, int maxLen) +{ + int gidx = get_global_id(0); + int2 coord = (int2)(gidx, get_global_id(1)); + vxc_short8 src0; + vxc_half8 in_h; + VXC_ReadImage(src0, input, coord.yy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + int4 outIdx = (int4)(gidx, gidx + 1, gidx + 2, gidx + 3); + _viv_asm(COPY, in_h, src0, 16); + float4 tmpData; + VXC_DP4x4(tmpData, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + int index = convert_int_rte(tmpData.x); + int4 data; + data = outIdx < index? convert_int_rte(outputVal1) : output_ZP; + vxc_uchar16 dst; + VXC_DP2x8(dst, data, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void sequence_mask_F16toU8( + image2d_t input, image2d_t output, int maxLen) +{ + int gidx = get_global_id(0); + int4 coord = (int4)(gidx, get_global_id(1), get_global_id(2), 0); + vxc_short8 src0; + vxc_half8 in_h; + VXC_ReadImage(src0, input, coord.yz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); + int4 outIdx = (int4)(gidx, gidx + 1, gidx + 2, gidx + 3); + _viv_asm(COPY, in_h, src0, 16); + float4 tmpData; + VXC_DP4x4(tmpData, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + int index = convert_int_rte(tmpData.x); + int4 data; + data = outIdx < index? convert_int_rte(outputVal1) : output_ZP; + vxc_uchar16 dst; + VXC_DP2x8(dst, data, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/slice.vx b/src/tim/vx/internal/src/libnnext/ops/vx/slice.vx new file mode 100644 index 0000000..5717266 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/slice.vx @@ -0,0 +1,239 @@ +#include "cl_viv_vx_ext.h" + +#define SLICE_SAMLEFL_SH_IMPL(name, data_type, end_bin) \ +__kernel void slice_##name##_I32to##name##_SAMEFL \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_array_t output, \ + int is_samefl \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + int4 coord_in; \ + Image begin_img = create_image_from_image2d(input1, 4); \ + uchar* begin_ptr = begin_img.ptr; \ + int4 begin = ((int4 *)begin_ptr)[0]; \ + \ + coord_in = coord + begin; \ + data_type src; \ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, end_bin, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, end_bin, 0, VXC_RM_TowardZero, 0)); \ +} +SLICE_SAMLEFL_SH_IMPL(U8, vxc_uchar16, 15) +SLICE_SAMLEFL_SH_IMPL(I16, vxc_short8, 7) + + +#define SLICE_SAMLEFL_2D_SH_IMPL(name, data_type, end_bin) \ +__kernel void slice_##name##_I32to##name##_SAMEFL_2D \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_array_t output, \ + int is_samefl \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + int2 coord_in; \ + Image begin_img = create_image_from_image2d(input1, 4); \ + uchar* begin_ptr = begin_img.ptr; \ + int2 begin = ((int2 
*)begin_ptr)[0]; \ + \ + coord_in = coord + begin; \ + data_type src; \ + VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, end_bin, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord, src, VXC_MODIFIER(0, end_bin, 0, VXC_RM_TowardZero, 0)); \ +} +SLICE_SAMLEFL_2D_SH_IMPL(U8, vxc_uchar16, 15) +SLICE_SAMLEFL_2D_SH_IMPL(I16, vxc_short8, 7) + +_viv_uniform VXC_512Bits uniU8MulAndPostShift_Lo_2x8; +_viv_uniform VXC_512Bits uniU8MulAndPostShift_Hi_2x8; +_viv_uniform int2 multAndoutZP;//[0:15] multiplier, [31:63] output zp +#define SLICE_8BITSTO16BITS(name0, name1, src_type, dst_type, save_type) \ +__kernel void slice_##name0##_I32to##name1 \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_array_t output, \ + int is_samefl \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + src_type src; \ + dst_type dst0; \ + int4 coord_in; \ + Image begin_img = create_image_from_image2d(input1, 4); \ + uchar* begin_ptr = begin_img.ptr; \ + int4 begin = ((int4 *)begin_ptr)[0]; \ + \ + coord_in = coord + begin; \ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + vxc_ushort8 multiplier; \ + _viv_asm(COPY, multiplier, multAndoutZP, 16); \ + VXC_DP2x8(dst0, src, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_Lo_2x8); \ + save_type dst; \ + _viv_asm(COPY, dst, dst0, 16); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ +} +SLICE_8BITSTO16BITS(I8, F16, vxc_char16, vxc_half8, vxc_short8) +SLICE_8BITSTO16BITS(U8, F16, vxc_uchar16, vxc_half8, vxc_short8) + +#define SLICE_8BITSTO16BITS_2D(name0, name1, src_type, dst_type, save_type) \ +__kernel void slice_##name0##_I32to##name1##_2D \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_array_t output, \ + int is_samefl \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + src_type src; \ + dst_type dst0; \ + int2 coord_in; \ + Image begin_img = create_image_from_image2d(input1, 4); \ + uchar* begin_ptr = begin_img.ptr; \ + int2 begin = ((int2 *)begin_ptr)[0]; \ + \ + coord_in = coord + begin; \ + VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + vxc_ushort8 multiplier; \ + _viv_asm(COPY, multiplier, multAndoutZP, 16); \ + VXC_DP2x8(dst0, src, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_Lo_2x8); \ + save_type dst; \ + _viv_asm(COPY, dst, dst0, 16); \ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ +} +SLICE_8BITSTO16BITS_2D(I8, F16, vxc_char16, vxc_half8, vxc_short8) +SLICE_8BITSTO16BITS_2D(U8, F16, vxc_uchar16, vxc_half8, vxc_short8) + +#define SLICE_8BITSTO8BITS(name0, name1, src_type, dst_type) \ +__kernel void slice_##name0##_I32to##name1 \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_array_t output, \ + int is_samefl \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + src_type src; \ + dst_type dst; \ + int4 coord_in; \ + Image begin_img = create_image_from_image2d(input1, 4); \ + uchar* begin_ptr = begin_img.ptr; \ + int4 begin = ((int4 *)begin_ptr)[0]; \ + \ + coord_in = coord + begin; \ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + vxc_ushort8 multiplier; \ + 
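+    /* multAndoutZP packs the requantize multiplier and output zero point consumed by the DP2x8 post-shift below */ \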
_viv_asm(COPY, multiplier, multAndoutZP, 16); \ + VXC_DP2x8(dst, src, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_Lo_2x8); \ + VXC_DP2x8(dst, src, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_Hi_2x8); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \ +} +SLICE_8BITSTO8BITS(I8, I8, vxc_char16, vxc_char16) +SLICE_8BITSTO8BITS(U8, U8, vxc_uchar16, vxc_uchar16) + +#define SLICE_8BITSTO8BITS_2D(name0, name1, src_type, dst_type) \ +__kernel void slice_##name0##_I32to##name1##_2D \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_array_t output, \ + int is_samefl \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + src_type src; \ + dst_type dst; \ + int2 coord_in; \ + Image begin_img = create_image_from_image2d(input1, 4); \ + uchar* begin_ptr = begin_img.ptr; \ + int2 begin = ((int2 *)begin_ptr)[0]; \ + \ + coord_in = coord + begin; \ + VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + vxc_ushort8 multiplier; \ + _viv_asm(COPY, multiplier, multAndoutZP, 16); \ + VXC_DP2x8(dst, src, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_Lo_2x8); \ + VXC_DP2x8(dst, src, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_Hi_2x8); \ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \ +} +SLICE_8BITSTO8BITS_2D(I8, I8, vxc_char16, vxc_char16) +SLICE_8BITSTO8BITS_2D(U8, U8, vxc_uchar16, vxc_uchar16) + +#define SLICE_16BITS_TO(name0, name1, src_type, copy_type, dst_type) \ +__kernel void slice_##name0##_I32to##name1 \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_array_t output, \ + int is_samefl \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + src_type src; \ + copy_type src0; \ + dst_type dst; \ + int4 coord_in; \ + Image begin_img = create_image_from_image2d(input1, 4); \ + uchar* begin_ptr = begin_img.ptr; \ + int4 begin = ((int4 *)begin_ptr)[0]; \ + \ + coord_in = coord + begin; \ + VXC_ReadImage2DArray(src0, input0, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src, src0, 16); \ + \ + vxc_ushort8 multiplier; \ + _viv_asm(COPY, multiplier, multAndoutZP, 16); \ + VXC_DP2x8(dst, src, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_Lo_2x8); \ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ +} +SLICE_16BITS_TO(F16, I8, vxc_half8, vxc_short8, vxc_char16) +SLICE_16BITS_TO(F16, U8, vxc_half8, vxc_short8, vxc_uchar16) +SLICE_16BITS_TO(F16, I16, vxc_half8, vxc_short8, vxc_short8) + +#define SLICE_16BITS_TO_2D(name0, name1, src_type, copy_type, dst_type) \ +__kernel void slice_##name0##_I32to##name1##_2D \ + ( \ + __read_only image2d_array_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_array_t output, \ + int is_samefl \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + src_type src; \ + copy_type src0; \ + dst_type dst; \ + int2 coord_in; \ + Image begin_img = create_image_from_image2d(input1, 4); \ + uchar* begin_ptr = begin_img.ptr; \ + int2 begin = ((int2 *)begin_ptr)[0]; \ + \ + coord_in = coord + begin; \ + VXC_ReadImage(src0, input0, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + 
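+    /* reinterpret the raw 16-bit load as the F16 source type before the requantizing DP2x8 below */ \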
_viv_asm(COPY, src, src0, 16); \ + \ + vxc_ushort8 multiplier; \ + _viv_asm(COPY, multiplier, multAndoutZP, 16); \ + VXC_DP2x8(dst, src, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_Lo_2x8); \ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ +} +SLICE_16BITS_TO_2D(F16, I8, vxc_half8, vxc_short8, vxc_char16) +SLICE_16BITS_TO_2D(F16, U8, vxc_half8, vxc_short8, vxc_uchar16) +SLICE_16BITS_TO_2D(F16, I16, vxc_half8, vxc_short8, vxc_short8) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/tile.vx b/src/tim/vx/internal/src/libnnext/ops/vx/tile.vx index 54fb828..7fd4c58 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/tile.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/tile.vx @@ -130,4 +130,41 @@ TILE_2D(I16, I16, 6, 5, vxc_short8) TILE_2D(I16, I16, 7, 6, vxc_short8) TILE_2D(I16, I16, 0, 7, vxc_short8) +#define TILE_2D_1TON(name0, name1, type) \ +__kernel void tile_1toN_##name0##to##name1##_2D( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int batchIn, \ + int depthIn, \ + int depthOut, \ + int multiples_0, \ + int multiples_1, \ + int multiples_2, \ + int multiples_3 \ +) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + int width = get_image_width(input); \ + int height = get_image_height(input); \ + int output_width = get_image_width(output); \ + int output_height = get_image_height(output); \ + type src; \ + VXC_ReadImage(src, input, coord, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + do \ + { \ + do \ + { \ + VXC_WriteImage(output, coord, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.x += 8; \ + } while (coord.x < output_width); \ + coord.x = 0; \ + coord.y += height; \ + } while (coord.y < output_height); \ +} +TILE_2D_1TON(U8, U8, vxc_uchar8) +TILE_2D_1TON(I16, I16, vxc_short8) + + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_axis_aligned_bbox_transform.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_axis_aligned_bbox_transform.vx deleted file mode 100644 index b0def7f..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_axis_aligned_bbox_transform.vx +++ /dev/null @@ -1,8 +0,0 @@ -#include "cl_viv_vx_ext.h" - -__kernel void vxcAxis_aligned_bbox_transform( - __read_only image2d_array_t input, - __write_only image2d_array_t output) -{ - -} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_generate_proposals.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_generate_proposals.vx deleted file mode 100644 index 9b2e37d..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_generate_proposals.vx +++ /dev/null @@ -1,8 +0,0 @@ -#include "cl_viv_vx_ext.h" - -__kernel void vxcGenerate_proposals( - __read_only image2d_array_t input, - __write_only image2d_array_t output) -{ - -} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_header.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_header.vx index b0f9565..86d1c60 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_header.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_header.vx @@ -9,6 +9,62 @@ */ #include "cl_viv_vx_ext.h" +typedef struct Image +{ + __global uchar *ptr; + int stride_x; + int stride_y; +} Image; + +inline uchar* get_image_ptr_from_coord(Image img, int2 coord) +{ + return img.ptr + coord.x * img.stride_x + coord.y * img.stride_y; +} + +inline Image 
create_image_from_image2d(image2d_t input, int stride_x) +{ + int8 desc; + _viv_asm(COPY, desc, input, sizeof(desc)); + + Image img = + { + .ptr = (uchar*)desc.s0, + .stride_x = stride_x, + .stride_y = desc.s1 + }; + + return img; +} + +typedef struct Tensor +{ + __global uchar *ptr; + int stride_x; + int stride_y; + int stride_z; +} Tensor; + +inline uchar* create_tensor_ptr_from_coord(Tensor t, int4 coord) +{ + return t.ptr + coord.x * t.stride_x + coord.y * t.stride_y + coord.z * t.stride_z; +} + +inline Tensor create_tensor_from_image2d_array(image2d_array_t input, int stride_x) +{ + int8 desc; + _viv_asm(COPY, desc, input, sizeof(desc)); + + Tensor t = + { + .ptr = (uchar*)desc.s0, + .stride_x = stride_x, + .stride_y = desc.s1, + .stride_z = desc.s4 + }; + + return t; +} + #if (VX_VERSION==1) #define VXC_DP2x8_b_(dst, src0, src1, src2, info, uniform)\ do\ diff --git a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c index fd2db22..962644c 100644 --- a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c +++ b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c @@ -1525,7 +1525,7 @@ _viv_uniform float output_scale;\n\ _viv_uniform float output_zp;\n\ \n\ #define BATCH_NORM_SH_IMPL(name0, name1, src_type, read_type, conv_type, dst_type, save_type) \\\n\ -__kernel void batch_norm_##name0##to##name1##_brdcst1( \\\n\ +__kernel void batch_norm_##name0##_F16_F16_F16_F32to##name1##_brdcst1( \\\n\ __read_only image2d_array_t input, \\\n\ __read_only image2d_array_t Mean, \\\n\ __read_only image2d_array_t Variance, \\\n\ @@ -1589,7 +1589,7 @@ BATCH_NORM_SH_IMPL(I8, I8, vxc_char16, vxc_char16, int4, vxc_char16, vxc_c BATCH_NORM_SH_IMPL(I8, F16, vxc_char16, vxc_char16, half4, vxc_half8, vxc_ushort8)\n\ \n\ #define BATCH_NORM_SH_IMPL_2D(name0, name1, src_type, read_type, conv_type, dst_type, save_type) \\\n\ -__kernel void batch_norm_##name0##to##name1##_brdcst1_2D( \\\n\ +__kernel void batch_norm_##name0##_F16_F16_F16_F32to##name1##_brdcst1_2D( \\\n\ __read_only image2d_array_t input, \\\n\ __read_only image2d_t Mean, \\\n\ __read_only image2d_t Variance, \\\n\ @@ -1654,7 +1654,7 @@ BATCH_NORM_SH_IMPL_2D(I8, F16, vxc_char16, vxc_char16, half4, vxc_half8, vx \n\ \n\ #define BATCH_NORM_SH_IMPL_AXIS1(name0, name1, src_type, read_type, conv_type, dst_type, save_type) \\\n\ -__kernel void batch_norm_##name0##to##name1##_brdcst0( \\\n\ +__kernel void batch_norm_##name0##_F16_F16_F16_F32to##name1##_brdcst0( \\\n\ __read_only image2d_array_t input, \\\n\ __read_only image2d_array_t Mean, \\\n\ __read_only image2d_array_t Variance, \\\n\ @@ -1721,7 +1721,7 @@ BATCH_NORM_SH_IMPL_AXIS1(I8, I8, vxc_char16, vxc_char16, int4, vxc_char16, BATCH_NORM_SH_IMPL_AXIS1(I8, F16, vxc_char16, vxc_char16, half4, vxc_half8, vxc_ushort8)\n\ \n\ #define BATCH_NORM_SH_IMPL_AXIS1_2D(name0, name1, src_type, read_type, conv_type, dst_type, save_type) \\\n\ -__kernel void batch_norm_##name0##to##name1##_brdcst0_2D( \\\n\ +__kernel void batch_norm_##name0##_F16_F16_F16_F32to##name1##_brdcst0_2D( \\\n\ __read_only image2d_array_t input, \\\n\ __read_only image2d_t Mean, \\\n\ __read_only image2d_t Variance, \\\n\ @@ -1788,6 +1788,275 @@ BATCH_NORM_SH_IMPL_AXIS1_2D(I8, F16, vxc_char16, vxc_char16, half4, vxc_half8 \n\ "; /* end of batchnorm_single_vx*/ +static const char batchnorm_single_f32_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniDatatoF32_0_4x4;\n\ +_viv_uniform VXC_512Bits uniDatatoF32_1_4x4;\n\ +_viv_uniform 
VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform float input_tail;\n\ +_viv_uniform float output_scale;\n\ +_viv_uniform float output_zp;\n\ +\n\ +#define BATCH_NORM_SH_IMPL(name0, name1, src_type, read_type, conv_type, dst_type, save_type) \\\n\ +__kernel void batch_norm_##name0##_F16_F16_F32_F32to##name1##_brdcst1( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_array_t Mean, \\\n\ + __read_only image2d_array_t Variance, \\\n\ + __read_only image2d_array_t Gamma, \\\n\ + __read_only image2d_array_t Beta, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + read_type vec; \\\n\ + src_type src; \\\n\ + VXC_ReadImage2DArray(vec, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src, vec, 16); \\\n\ + vxc_ushort8 _mean, _var; \\\n\ + vxc_half8 mean, var; \\\n\ + VXC_ReadImage2DArray(_mean, Mean, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, mean, _mean, 16); \\\n\ + VXC_ReadImage2DArray(_var, Variance, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, var, _var, 16); \\\n\ + float4 gamma0 = read_imagef(Gamma, coord); \\\n\ + coord.x += 4; \\\n\ + float4 gamma1 = read_imagef(Gamma, coord); \\\n\ + coord.x -= 4; \\\n\ + float4 beta = read_imagef(Beta, coord); \\\n\ + \\\n\ + float4 src0, src1, m, v; \\\n\ + VXC_DP4x4(src0, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \\\n\ + VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \\\n\ + VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \\\n\ + gamma0 = gamma0 * rsqrt(v + eps); \\\n\ + src0 = src0 * input_scale + input_tail; \\\n\ + src0 = (src0 - m) * gamma0 + beta.xxxx; \\\n\ + src0 = src0 * output_scale + output_zp; \\\n\ + VXC_DP4x4(src1, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \\\n\ + VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \\\n\ + VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \\\n\ + gamma1 = gamma1 * rsqrt(v + eps); \\\n\ + src1 = src1 * input_scale + input_tail; \\\n\ + src1 = (src1 - m) * gamma1 + beta.xxxx; \\\n\ + src1 = src1 * output_scale + output_zp; \\\n\ + \\\n\ + conv_type dst0, dst1; \\\n\ + _viv_asm(CONV_RTE, dst0, src0); \\\n\ + _viv_asm(CONV_RTE, dst1, src1); \\\n\ + dst_type tmp; \\\n\ + save_type dst; \\\n\ + VXC_DP2x8(tmp, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, tmp, 16); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +BATCH_NORM_SH_IMPL(F16, F16, vxc_half8, vxc_ushort8, half4, vxc_half8, vxc_ushort8)\n\ +BATCH_NORM_SH_IMPL(F16, I16, vxc_half8, vxc_ushort8, int4, vxc_short8, vxc_short8)\n\ +BATCH_NORM_SH_IMPL(F16, U8, vxc_half8, vxc_ushort8, int4, vxc_uchar16, vxc_uchar16)\n\ +BATCH_NORM_SH_IMPL(F16, I8, vxc_half8, vxc_ushort8, int4, vxc_char16, vxc_char16)\n\ +BATCH_NORM_SH_IMPL(I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ +BATCH_NORM_SH_IMPL(I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_ushort8)\n\ +BATCH_NORM_SH_IMPL(U8, U8, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16, vxc_uchar16)\n\ +BATCH_NORM_SH_IMPL(U8, F16, vxc_uchar16, vxc_uchar16, half4, 
vxc_half8, vxc_ushort8)\n\ +BATCH_NORM_SH_IMPL(I8, I8, vxc_char16, vxc_char16, int4, vxc_char16, vxc_char16)\n\ +BATCH_NORM_SH_IMPL(I8, F16, vxc_char16, vxc_char16, half4, vxc_half8, vxc_ushort8)\n\ +\n\ +#define BATCH_NORM_SH_IMPL_2D(name0, name1, src_type, read_type, conv_type, dst_type, save_type) \\\n\ +__kernel void batch_norm_##name0##_F16_F16_F32_F32to##name1##_brdcst1_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t Mean, \\\n\ + __read_only image2d_t Variance, \\\n\ + __read_only image2d_t Gamma, \\\n\ + __read_only image2d_t Beta, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + read_type vec; \\\n\ + src_type src; \\\n\ + VXC_ReadImage(vec, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src, vec, 16); \\\n\ + coord.z = coord.x + 4; \\\n\ + vxc_ushort8 _mean, _var; \\\n\ + vxc_half8 mean, var; \\\n\ + VXC_ReadImage(_mean, Mean, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, mean, _mean, 16); \\\n\ + VXC_ReadImage(_var, Variance, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, var, _var, 16); \\\n\ + float4 gamma0 = read_imagef(Gamma, coord.xy); \\\n\ + float4 gamma1 = read_imagef(Gamma, coord.zy); \\\n\ + float4 beta = read_imagef(Beta, coord.xy); \\\n\ + \\\n\ + float4 src0, src1, m, v; \\\n\ + VXC_DP4x4(src0, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \\\n\ + VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \\\n\ + VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \\\n\ + gamma0 = gamma0 * rsqrt(v + eps); \\\n\ + src0 = src0 * input_scale + input_tail; \\\n\ + src0 = (src0 - m) * gamma0 + beta.xxxx; \\\n\ + src0 = src0 * output_scale + output_zp; \\\n\ + VXC_DP4x4(src1, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \\\n\ + VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \\\n\ + VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \\\n\ + gamma1 = gamma1 * rsqrt(v + eps); \\\n\ + src1 = src1 * input_scale + input_tail; \\\n\ + src1 = (src1 - m) * gamma1 + beta.xxxx; \\\n\ + src1 = src1 * output_scale + output_zp; \\\n\ + \\\n\ + conv_type dst0, dst1; \\\n\ + _viv_asm(CONV_RTE, dst0, src0); \\\n\ + _viv_asm(CONV_RTE, dst1, src1); \\\n\ + dst_type tmp; \\\n\ + save_type dst; \\\n\ + VXC_DP2x8(tmp, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, tmp, 16); \\\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +BATCH_NORM_SH_IMPL_2D(F16, F16, vxc_half8, vxc_ushort8, half4, vxc_half8, vxc_ushort8)\n\ +BATCH_NORM_SH_IMPL_2D(F16, I16, vxc_half8, vxc_ushort8, int4, vxc_short8, vxc_short8)\n\ +BATCH_NORM_SH_IMPL_2D(F16, U8, vxc_half8, vxc_ushort8, int4, vxc_uchar16, vxc_uchar16)\n\ +BATCH_NORM_SH_IMPL_2D(F16, I8, vxc_half8, vxc_ushort8, int4, vxc_char16, vxc_char16)\n\ +BATCH_NORM_SH_IMPL_2D(I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ +BATCH_NORM_SH_IMPL_2D(I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_ushort8)\n\ +BATCH_NORM_SH_IMPL_2D(U8, U8, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16, vxc_uchar16)\n\ +BATCH_NORM_SH_IMPL_2D(U8, F16, vxc_uchar16, vxc_uchar16, 
half4, vxc_half8, vxc_ushort8)\n\ +BATCH_NORM_SH_IMPL_2D(I8, I8, vxc_char16, vxc_char16, int4, vxc_char16, vxc_char16)\n\ +BATCH_NORM_SH_IMPL_2D(I8, F16, vxc_char16, vxc_char16, half4, vxc_half8, vxc_ushort8)\n\ +\n\ +\n\ +#define BATCH_NORM_SH_IMPL_AXIS1(name0, name1, src_type, read_type, conv_type, dst_type, save_type) \\\n\ +__kernel void batch_norm_##name0##_F16_F16_F32_F32to##name1##_brdcst0( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_array_t Mean, \\\n\ + __read_only image2d_array_t Variance, \\\n\ + __read_only image2d_array_t Gamma, \\\n\ + __read_only image2d_array_t Beta, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + read_type vec; \\\n\ + src_type src; \\\n\ + VXC_ReadImage2DArray(vec, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src, vec, 16); \\\n\ + vxc_ushort8 _mean, _var; \\\n\ + vxc_half8 mean, var; \\\n\ + VXC_ReadImage2DArray(_mean, Mean, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, mean, _mean, 16); \\\n\ + VXC_ReadImage2DArray(_var, Variance, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, var, _var, 16); \\\n\ + float4 gamma0 = read_imagef(Gamma, coord); \\\n\ + float4 beta0 = read_imagef(Beta, coord); \\\n\ + coord.x += 4; \\\n\ + float4 gamma1 = read_imagef(Gamma, coord); \\\n\ + float4 beta1 = read_imagef(Beta, coord); \\\n\ + coord.x -= 4; \\\n\ + \\\n\ + float4 src0, src1, m, v; \\\n\ + VXC_DP4x4(src0, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \\\n\ + VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \\\n\ + VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \\\n\ + gamma0 = gamma0 * rsqrt(v + eps); \\\n\ + src0 = src0 * input_scale + input_tail; \\\n\ + src0 = (src0 - m) * gamma0 + beta0; \\\n\ + src0 = src0 * output_scale + output_zp; \\\n\ + VXC_DP4x4(src1, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \\\n\ + VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \\\n\ + VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \\\n\ + gamma1 = gamma1 * rsqrt(v + eps); \\\n\ + src1 = src1 * input_scale + input_tail; \\\n\ + src1 = (src1 - m) * gamma1 + beta1; \\\n\ + src1 = src1 * output_scale + output_zp; \\\n\ + \\\n\ + conv_type dst0, dst1; \\\n\ + _viv_asm(CONV_RTE, dst0, src0); \\\n\ + _viv_asm(CONV_RTE, dst1, src1); \\\n\ + dst_type tmp; \\\n\ + save_type dst; \\\n\ + VXC_DP2x8(tmp, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, tmp, 16); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +BATCH_NORM_SH_IMPL_AXIS1(F16, F16, vxc_half8, vxc_ushort8, half4, vxc_half8, vxc_ushort8)\n\ +BATCH_NORM_SH_IMPL_AXIS1(F16, I16, vxc_half8, vxc_ushort8, int4, vxc_short8, vxc_short8)\n\ +BATCH_NORM_SH_IMPL_AXIS1(F16, U8, vxc_half8, vxc_ushort8, int4, vxc_uchar16, vxc_uchar16)\n\ +BATCH_NORM_SH_IMPL_AXIS1(F16, I8, vxc_half8, vxc_ushort8, int4, vxc_char16, vxc_char16)\n\ +BATCH_NORM_SH_IMPL_AXIS1(I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ +BATCH_NORM_SH_IMPL_AXIS1(I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_ushort8)\n\ +BATCH_NORM_SH_IMPL_AXIS1(U8, U8, 
vxc_uchar16, vxc_uchar16, int4, vxc_uchar16, vxc_uchar16)\n\ +BATCH_NORM_SH_IMPL_AXIS1(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_half8, vxc_ushort8)\n\ +BATCH_NORM_SH_IMPL_AXIS1(I8, I8, vxc_char16, vxc_char16, int4, vxc_char16, vxc_char16)\n\ +BATCH_NORM_SH_IMPL_AXIS1(I8, F16, vxc_char16, vxc_char16, half4, vxc_half8, vxc_ushort8)\n\ +\n\ +#define BATCH_NORM_SH_IMPL_AXIS1_2D(name0, name1, src_type, read_type, conv_type, dst_type, save_type) \\\n\ +__kernel void batch_norm_##name0##_F16_F16_F32_F32to##name1##_brdcst0_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t Mean, \\\n\ + __read_only image2d_t Variance, \\\n\ + __read_only image2d_t Gamma, \\\n\ + __read_only image2d_t Beta, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + read_type vec; \\\n\ + src_type src; \\\n\ + VXC_ReadImage(vec, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src, vec, 16); \\\n\ + coord.z += 4; \\\n\ + vxc_ushort8 _mean, _var; \\\n\ + vxc_half8 mean, var; \\\n\ + VXC_ReadImage(_mean, Mean, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, mean, _mean, 16); \\\n\ + VXC_ReadImage(_var, Variance, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, var, _var, 16); \\\n\ + float4 gamma0 = read_imagef(Gamma, coord.xy); \\\n\ + float4 gamma1 = read_imagef(Gamma, coord.zy); \\\n\ + float4 beta0 = read_imagef(Beta, coord.xy); \\\n\ + float4 beta1 = read_imagef(Beta, coord.zy); \\\n\ + \\\n\ + float4 src0, src1, m, v; \\\n\ + VXC_DP4x4(src0, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \\\n\ + VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \\\n\ + VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_0_4x4); \\\n\ + gamma0 = gamma0 * rsqrt(v + eps); \\\n\ + src0 = src0 * input_scale + input_tail; \\\n\ + src0 = (src0 - m) * gamma0 + beta0; \\\n\ + src0 = src0 * output_scale + output_zp; \\\n\ + VXC_DP4x4(src1, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \\\n\ + VXC_DP4x4(m, mean, mean, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \\\n\ + VXC_DP4x4(v, var, var, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniDatatoF32_1_4x4); \\\n\ + gamma1 = gamma1 * rsqrt(v + eps); \\\n\ + src1 = src1 * input_scale + input_tail; \\\n\ + src1 = (src1 - m) * gamma1 + beta1; \\\n\ + src1 = src1 * output_scale + output_zp; \\\n\ + \\\n\ + conv_type dst0, dst1; \\\n\ + _viv_asm(CONV_RTE, dst0, src0); \\\n\ + _viv_asm(CONV_RTE, dst1, src1); \\\n\ + dst_type tmp; \\\n\ + save_type dst; \\\n\ + VXC_DP2x8(tmp, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, tmp, 16); \\\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +BATCH_NORM_SH_IMPL_AXIS1_2D(F16, F16, vxc_half8, vxc_ushort8, half4, vxc_half8, vxc_ushort8)\n\ +BATCH_NORM_SH_IMPL_AXIS1_2D(F16, I16, vxc_half8, vxc_ushort8, int4, vxc_short8, vxc_short8)\n\ +BATCH_NORM_SH_IMPL_AXIS1_2D(F16, U8, vxc_half8, vxc_ushort8, int4, vxc_uchar16, vxc_uchar16)\n\ +BATCH_NORM_SH_IMPL_AXIS1_2D(F16, I8, vxc_half8, vxc_ushort8, int4, vxc_char16, vxc_char16)\n\ +BATCH_NORM_SH_IMPL_AXIS1_2D(I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ +BATCH_NORM_SH_IMPL_AXIS1_2D(I16, 
F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_ushort8)\n\ +BATCH_NORM_SH_IMPL_AXIS1_2D(U8, U8, vxc_uchar16, vxc_uchar16, int4, vxc_uchar16, vxc_uchar16)\n\ +BATCH_NORM_SH_IMPL_AXIS1_2D(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_half8, vxc_ushort8)\n\ +BATCH_NORM_SH_IMPL_AXIS1_2D(I8, I8, vxc_char16, vxc_char16, int4, vxc_char16, vxc_char16)\n\ +BATCH_NORM_SH_IMPL_AXIS1_2D(I8, F16, vxc_char16, vxc_char16, half4, vxc_half8, vxc_ushort8)\n\ +\n\ +"; /* end of batchnorm_single_f32_vx*/ + static const char cast_vx[] = "\n\ #include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -2300,25 +2569,350 @@ __kernel void clip_U8toF16_2D(\n\ }\n\ "; /* end of clip_U8_vx*/ +static const char conv1d_ovxlib_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniConv1DK3_Lo0_4x4;\n\ +_viv_uniform VXC_512Bits uniConv1DK3_Lo1_4x4;\n\ +_viv_uniform VXC_512Bits uniConv1DK3_Lo2_4x4;\n\ +_viv_uniform VXC_512Bits uniConv1DK3_Hi0_4x4;\n\ +_viv_uniform VXC_512Bits uniConv1DK3_Hi1_4x4;\n\ +_viv_uniform VXC_512Bits uniConv1DK3_Hi2_4x4;\n\ +_viv_uniform VXC_512Bits uniDataConvK3_2x8;\n\ +_viv_uniform VXC_512Bits uniSumOrderUchar_2x8;\n\ +\n\ +_viv_uniform int input_ZP;\n\ +_viv_uniform int weight_ZP;\n\ +_viv_uniform float output_ZP;\n\ +_viv_uniform float scaleOut;\n\ +_viv_uniform int input_height;\n\ +\n\ +__kernel void conv1d_U8U8I32toU8_K3_S1(\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t weight,\n\ + __read_only image2d_t bias,\n\ + __write_only image2d_array_t output,\n\ + int stride,\n\ + int pad_front,\n\ + int pad_end,\n\ + int dilation,\n\ + int overflow_policy)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_w = (int4)(0, 0, get_global_id(1), 0);\n\ + float4 sum0, sum1, dst;\n\ + vxc_short8 weight_val_s =(short)input_ZP;\n\ + vxc_uchar16 input_val = 0, weight_val = 0;\n\ + int temp = 0, i;\n\ +\n\ + temp = read_imagei(bias, coord.yz).x;\n\ + sum0 = convert_float(temp);\n\ + sum1 = sum0;\n\ + weight_val_s.s5 = (short)weight_ZP;\n\ +\n\ + for (i = 0; i < input_height; i++)\n\ + {\n\ + VXC_ReadImage2DArray(weight_val, weight, coord_w, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(input_val, input, coord.xz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(weight_val_s, weight_val, weight_val_s, \\\n\ + VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0), uniDataConvK3_2x8);\n\ +\n\ + VXC_DP4x4(dst, input_val, weight_val_s, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConv1DK3_Lo0_4x4);\n\ + sum0 += dst;\n\ + VXC_DP4x4(dst, input_val, weight_val_s, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConv1DK3_Hi0_4x4);\n\ + sum1 += dst;\n\ + coord.x += dilation;\n\ + VXC_ReadImage(input_val, input, coord.xz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(dst, input_val, weight_val_s, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConv1DK3_Lo1_4x4);\n\ + sum0 += dst;\n\ + VXC_DP4x4(dst, input_val, weight_val_s, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConv1DK3_Hi1_4x4);\n\ + sum1 += dst;\n\ + coord.x += dilation;\n\ + VXC_ReadImage(input_val, input, coord.xz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(dst, input_val, weight_val_s, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConv1DK3_Lo2_4x4);\n\ + sum0 += dst;\n\ + VXC_DP4x4(dst, input_val, weight_val_s, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 
0), uniConv1DK3_Hi2_4x4);\n\ + sum1 += dst;\n\ + coord_w.y++;\n\ + coord.z++;\n\ + coord.x = get_global_id(0);\n\ + }\n\ +\n\ + sum0 = sum0 * scaleOut + output_ZP;\n\ + sum1 = sum1 * scaleOut + output_ZP;\n\ + uchar4 result0, result1;\n\ + _viv_asm(CONV_SAT_RTE, result0, sum0);\n\ + _viv_asm(CONV_SAT_RTE, result1, sum1);\n\ + vxc_uchar8 result;\n\ + VXC_DP2x8(result, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumOrderUchar_2x8);\n\ + VXC_WriteImage(output, coord.xy, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void conv1d_U8U8I32toU8_K3_S1_D2_D4(\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t weight,\n\ + __read_only image2d_t bias,\n\ + __write_only image2d_array_t output,\n\ + int stride,\n\ + int pad_front,\n\ + int pad_end,\n\ + int dilation,\n\ + int overflow_policy)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_w = (int4)(0, 0, get_global_id(1), 0);\n\ + float4 sum0, sum1, dst;\n\ + vxc_short8 weight_val_s =(short)input_ZP;\n\ + vxc_uchar16 input_val = 0, weight_val = 0;\n\ + int temp = 0, i;\n\ +\n\ + temp = read_imagei(bias, coord.yz).x;\n\ + sum0 = convert_float(temp);\n\ + sum1 = sum0;\n\ + weight_val_s.s5 = (short)weight_ZP;\n\ +\n\ + for (i = 0; i < input_height; i++)\n\ + {\n\ + VXC_ReadImage2DArray(weight_val, weight, coord_w, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(input_val, input, coord.xz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(weight_val_s, weight_val, weight_val_s, \\\n\ + VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0), uniDataConvK3_2x8);\n\ +\n\ + VXC_DP4x4(dst, input_val, weight_val_s, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConv1DK3_Lo0_4x4);\n\ + sum0 += dst;\n\ + VXC_DP4x4(dst, input_val, weight_val_s, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConv1DK3_Hi0_4x4);\n\ + sum1 += dst;\n\ + VXC_DP4x4(dst, input_val, weight_val_s, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConv1DK3_Lo1_4x4);\n\ + sum0 += dst;\n\ + VXC_DP4x4(dst, input_val, weight_val_s, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConv1DK3_Hi1_4x4);\n\ + sum1 += dst;\n\ + VXC_DP4x4(dst, input_val, weight_val_s, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConv1DK3_Lo2_4x4);\n\ + sum0 += dst;\n\ + VXC_DP4x4(dst, input_val, weight_val_s, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConv1DK3_Hi2_4x4);\n\ + sum1 += dst;\n\ + coord_w.y++;\n\ + coord.z++;\n\ + }\n\ +\n\ + sum0 = sum0 * scaleOut + output_ZP;\n\ + sum1 = sum1 * scaleOut + output_ZP;\n\ + uchar4 result0, result1;\n\ + _viv_asm(CONV_SAT_RTE, result0, sum0);\n\ + _viv_asm(CONV_SAT_RTE, result1, sum1);\n\ + vxc_uchar8 result;\n\ + VXC_DP2x8(result, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumOrderUchar_2x8);\n\ + VXC_WriteImage(output, coord.xy, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of conv1d_ovxlib_vx*/ + +static const char conv1d_ovxlib_k1024_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniU8SubZp_lo_2x8;\n\ +_viv_uniform VXC_512Bits uniU8SubZp_hi_2x8;\n\ +_viv_uniform VXC_512Bits uniU8Conv1d_part0_8x2;\n\ +_viv_uniform VXC_512Bits uniU8Conv1d_part1_8x2;\n\ +_viv_uniform VXC_512Bits uniU8Conv1d_part2_8x2;\n\ +_viv_uniform VXC_512Bits uniU8Conv1d_part3_8x2;\n\ +_viv_uniform VXC_512Bits uniSumOrderUchar_2x8;\n\ +\n\ +_viv_uniform int kernel_cnt_x16;\n\ 
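+// The _viv_uniform values in this block are presumably supplied by the host-side kernel setup:\n\
+// weight_ZP appears to be the weight zero-point removed via the uniU8SubZp_* tables,\n\
+// scaleOut/output_ZP requantize the accumulated sum before saturation, and kernel_cnt_x16\n\
+// bounds the inner loop that walks the weights 16 elements at a time.\n\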
+_viv_uniform int weight_ZP;\n\ +_viv_uniform float output_ZP;\n\ +_viv_uniform float scaleOut;\n\ +_viv_uniform int input_height;\n\ +_viv_uniform int input_width;\n\ +_viv_uniform int output_width;\n\ +\n\ +__kernel void conv1d_U8U8I32toU8_K1024_SMALL(\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t weight,\n\ + __read_only image2d_t bias,\n\ + __write_only image2d_array_t output,\n\ + int stride,\n\ + int pad_front,\n\ + int pad_end,\n\ + int dilation,\n\ + int overflow_policy)\n\ +{\n\ + int start_x = get_global_id(0) - pad_front;\n\ + int4 coord = (int4)(start_x, get_global_id(1), 0, get_global_id(0));\n\ + int4 coord_w = (int4)(0, 0, get_global_id(1), 0);\n\ + float4 sum0, sum1, dst;\n\ + vxc_short8 coef;\n\ + vxc_short8 w_zp = (short)weight_ZP;\n\ + vxc_uchar16 input_val = 0, weight_val = 0;\n\ + int temp = 0, i, j;\n\ +\n\ + temp = read_imagei(bias, coord.yz).x;\n\ + sum0 = convert_float(temp);\n\ + sum1 = sum0;\n\ +\n\ + for (i = 0; i < input_height; i++)\n\ + {\n\ + for (j = 0; j < kernel_cnt_x16; j++)\n\ + {\n\ + VXC_ReadImage2DArray(weight_val, weight, coord_w, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(input_val, input, coord.xz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(coef, weight_val, w_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8);\n\ + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part0_8x2);\n\ + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part1_8x2);\n\ + sum0 += dst;\n\ + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part2_8x2);\n\ + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part3_8x2);\n\ + sum1 += dst;\n\ + VXC_ReadImage(input_val, input, coord.xz, VXC_5BITOFFSET_XY(8, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(coef, weight_val, w_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_hi_2x8);\n\ + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part0_8x2);\n\ + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part1_8x2);\n\ + sum0 += dst;\n\ + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part2_8x2);\n\ + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part3_8x2);\n\ + sum1 += dst;\n\ + coord_w.x += 16;\n\ + coord.x += 16;\n\ + }\n\ + coord_w.x = 0;\n\ + coord_w.y++;\n\ + coord.z++;\n\ + coord.x = start_x;\n\ + }\n\ +\n\ + sum0 = sum0 * scaleOut + output_ZP;\n\ + sum1 = sum1 * scaleOut + output_ZP;\n\ + uchar4 result0, result1;\n\ + _viv_asm(CONV_SAT_RTE, result0, sum0);\n\ + _viv_asm(CONV_SAT_RTE, result1, sum1);\n\ + vxc_uchar8 result;\n\ + VXC_DP2x8(result, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumOrderUchar_2x8);\n\ + VXC_WriteImage(output, coord.wy, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +inline uchar* get_image2D_array_ptr(image2d_array_t input)\n\ +{\n\ + int8 desc;\n\ + _viv_asm(COPY, desc, input, sizeof(desc));\n\ + uchar *src_ptr = (uchar*)desc.s0;\n\ + return src_ptr;\n\ +}\n\ +\n\ +__kernel void conv1d_U8U8I32toU8_K1024_LARGE(\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t weight,\n\ + __read_only image2d_t bias,\n\ + __write_only image2d_array_t 
output,\n\ + int stride,\n\ + int pad_front,\n\ + int pad_end,\n\ + int dilation,\n\ + int overflow_policy)\n\ +{\n\ + int start_x = get_global_id(0);\n\ + int w_left = output_width - start_x;\n\ + int out_x = w_left < 8 ? get_global_id(0) - w_left : get_global_id(0);\n\ + int4 coord = (int4)(start_x, get_global_id(1), 0, out_x);\n\ + int4 coord_w = (int4)(0, 0, get_global_id(1), 0);\n\ + float4 sum0, sum1, dst;\n\ + vxc_short8 coef;\n\ + vxc_short8 w_zp = (short)weight_ZP;\n\ + vxc_uchar16 input_val = 0, weight_val = 0;\n\ + int temp = 0, i, j;\n\ + uchar *src_ptr_base = (uchar *)get_image2D_array_ptr(input);\n\ + uchar *src_ptr;\n\ + uchar *dst_ptr = (uchar *)get_image2D_array_ptr(output);\n\ +\n\ + temp = read_imagei(bias, coord.yz).x;\n\ + sum0 = convert_float(temp);\n\ + sum1 = sum0;\n\ +\n\ + for (i = 0; i < input_height; i++)\n\ + {\n\ + src_ptr = src_ptr_base + (coord.x + coord.z * input_width);\n\ + for (j = 0; j < kernel_cnt_x16; j++)\n\ + {\n\ + VXC_ReadImage2DArray(weight_val, weight, coord_w, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_Vload16(input_val, src_ptr, 0);\n\ + VXC_DP2x8(coef, weight_val, w_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_lo_2x8);\n\ + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part0_8x2);\n\ + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part1_8x2);\n\ + sum0 += dst;\n\ + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part2_8x2);\n\ + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part3_8x2);\n\ + sum1 += dst;\n\ + src_ptr += 8;\n\ + VXC_Vload16(input_val, src_ptr, 0);\n\ + VXC_DP2x8(coef, weight_val, w_zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniU8SubZp_hi_2x8);\n\ + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part0_8x2);\n\ + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part1_8x2);\n\ + sum0 += dst;\n\ + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part2_8x2);\n\ + VXC_DP8x2(dst, input_val, coef, VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0), uniU8Conv1d_part3_8x2);\n\ + sum1 += dst;\n\ + coord_w.x += 16;\n\ + coord.x += 16;\n\ + src_ptr += 8;\n\ + }\n\ + coord_w.x = 0;\n\ + coord_w.y++;\n\ + coord.z++;\n\ + coord.x = start_x;\n\ + }\n\ +\n\ + sum0 = sum0 * scaleOut + output_ZP;\n\ + sum1 = sum1 * scaleOut + output_ZP;\n\ + uchar4 result0, result1;\n\ + _viv_asm(CONV_SAT_RTE, result0, sum0);\n\ + _viv_asm(CONV_SAT_RTE, result1, sum1);\n\ + vxc_uchar8 result;\n\ + VXC_DP2x8(result, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumOrderUchar_2x8);\n\ + dst_ptr = dst_ptr + (coord.w + coord.y * output_width);\n\ + VXC_Vstore8(dst_ptr, 0, result);\n\ +}\n\ +\n\ +"; /* end of conv1d_ovxlib_k1024_vx*/ + static const char depth2space_crd_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp\n\ _viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8;\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift_ExLo_2x8;\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift_ExHi_2x8;\n\ +_viv_uniform VXC_512Bits uniDepth2SpaceF16Blk2_lo_2x8;\n\ +_viv_uniform VXC_512Bits uniDepth2SpaceF16Blk2_hi_2x8;\n\ +\n\ \n\ #define DEPTH2SPACE_CRD_QINT_TO_QINT(src0_type_name, src1_type_name, read_type, write_type) \\\n\ __kernel void 
depth2space_crd_##src0_type_name##to##src1_type_name( \\\n\ - image2d_array_t input, \\\n\ - image2d_array_t output, \\\n\ - int block_size \\\n\ - ) \\\n\ + image2d_array_t input, image2d_array_t output, int block_size) \\\n\ { \\\n\ int gidx = get_global_id(0); \\\n\ int gidy = get_global_id(1); \\\n\ int gidz = get_global_id(2); \\\n\ int4 coord_out = (int4)(gidx, gidy, gidz, 0); \\\n\ int block_e2 = block_size * block_size; \\\n\ - int inx = gidx / block_size; \\\n\ - int iny = gidy / block_size; \\\n\ + ushort blk = (ushort)block_size; \\\n\ + int inx = (int)((ushort)gidx / blk); \\\n\ + int iny = (int)((ushort)gidy / blk); \\\n\ int inz = (gidx % block_size) + (gidy % block_size) * block_size + gidz * block_e2; \\\n\ int4 coord_in = (int4)(inx, iny, inz, 0); \\\n\ read_type src; \\\n\ @@ -2335,18 +2929,16 @@ DEPTH2SPACE_CRD_QINT_TO_QINT(I8, I8, vxc_char16, vxc_char16)\n\ DEPTH2SPACE_CRD_QINT_TO_QINT(I16, I16, vxc_short8, vxc_short8)\n\ \n\ __kernel void depth2space_crd_F16toF16(\n\ - image2d_array_t input,\n\ - image2d_array_t output,\n\ - int block_size\n\ - )\n\ + image2d_array_t input, image2d_array_t output, int block_size)\n\ {\n\ int gidx = get_global_id(0);\n\ int gidy = get_global_id(1);\n\ int gidz = get_global_id(2);\n\ int4 coord_out = (int4)(gidx, gidy, gidz, 0);\n\ int block_e2 = block_size * block_size;\n\ - int inx = gidx / block_size;\n\ - int iny = gidy / block_size;\n\ + ushort blk = (ushort)block_size;\n\ + int inx = (int)((ushort)gidx / blk);\n\ + int iny = (int)((ushort)gidy / blk);\n\ int inz = (gidx % block_size) + (gidy % block_size) * block_size + gidz * block_e2;\n\ int4 coord_in = (int4)(inx, iny, inz, 0);\n\ vxc_short8 data;\n\ @@ -2356,18 +2948,16 @@ __kernel void depth2space_crd_F16toF16(\n\ \n\ #define DEPTH2SPACE_CRD_QINT_TO_F16(src0_type_name, read_type) \\\n\ __kernel void depth2space_crd_##src0_type_name##toF16( \\\n\ - image2d_array_t input, \\\n\ - image2d_array_t output, \\\n\ - int block_size \\\n\ - ) \\\n\ + image2d_array_t input, image2d_array_t output, int block_size) \\\n\ { \\\n\ int gidx = get_global_id(0); \\\n\ int gidy = get_global_id(1); \\\n\ int gidz = get_global_id(2); \\\n\ int4 coord_out = (int4)(gidx, gidy, gidz, 0); \\\n\ int block_e2 = block_size * block_size; \\\n\ - int inx = gidx / block_size; \\\n\ - int iny = gidy / block_size; \\\n\ + ushort blk = (ushort)block_size; \\\n\ + int inx = (int)((ushort)gidx / blk); \\\n\ + int iny = (int)((ushort)gidy / blk); \\\n\ int inz = (gidx % block_size) + (gidy % block_size) * block_size + gidz * block_e2; \\\n\ int4 coord_in = (int4)(inx, iny, inz, 0); \\\n\ read_type src; \\\n\ @@ -2387,18 +2977,16 @@ DEPTH2SPACE_CRD_QINT_TO_F16(I16, vxc_short8)\n\ \n\ #define DEPTH2SPACE_CRD_F16_TO_QINT(src1_type_name, write_type) \\\n\ __kernel void depth2space_crd_F16to##src1_type_name( \\\n\ - image2d_array_t input, \\\n\ - image2d_array_t output, \\\n\ - int block_size \\\n\ - ) \\\n\ + image2d_array_t input, image2d_array_t output, int block_size) \\\n\ { \\\n\ int gidx = get_global_id(0); \\\n\ int gidy = get_global_id(1); \\\n\ int gidz = get_global_id(2); \\\n\ int4 coord_out = (int4)(gidx, gidy, gidz, 0); \\\n\ int block_e2 = block_size * block_size; \\\n\ - int inx = gidx / block_size; \\\n\ - int iny = gidy / block_size; \\\n\ + ushort blk = (ushort)block_size; \\\n\ + int inx = (int)((ushort)gidx / blk); \\\n\ + int iny = (int)((ushort)gidy / blk); \\\n\ int inz = (gidx % block_size) + (gidy % block_size) * block_size + gidz * block_e2; \\\n\ int4 coord_in = (int4)(inx, iny, inz, 0); 
\\\n\ vxc_short8 src; \\\n\ @@ -2414,7 +3002,202 @@ __kernel void depth2space_crd_F16to##src1_type_name( \\\n\ }\n\ DEPTH2SPACE_CRD_F16_TO_QINT(U8, vxc_uchar16)\n\ DEPTH2SPACE_CRD_F16_TO_QINT(I8, vxc_char16)\n\ -DEPTH2SPACE_CRD_F16_TO_QINT(I16, vxc_short8)"; /* end of depth2space_crd_vx*/ +DEPTH2SPACE_CRD_F16_TO_QINT(I16, vxc_short8)\n\ +\n\ +#define DEPTH2SPACE_CRD_QINT_TO_QINT_BLK2(src0_type_name, src1_type_name, read_type, write_type) \\\n\ +__kernel void depth2space_crd_##src0_type_name##to##src1_type_name##_blk2( \\\n\ + image2d_array_t input, image2d_array_t output, int block_size) \\\n\ +{ \\\n\ + int gidy = get_global_id(1); \\\n\ + int gidz = get_global_id(2); \\\n\ + int4 coord_out = (int4)(get_global_id(0), gidy, gidz, gidz); \\\n\ + int4 coord_in = coord_out >> 1; \\\n\ + coord_in.z = (gidy & 1) * 2 + gidz * 4; \\\n\ + coord_in.w = coord_in.z + 1; \\\n\ + read_type src; \\\n\ + VXC_ReadImage2DArray(src, input, coord_in.xyzz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(src, input, coord_in.xyww, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + write_type dst; \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + VXC_DP2x8(dst, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniU8MulAndPostShift_ExLo_2x8); \\\n\ + VXC_DP2x8(dst, src, ms0, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniU8MulAndPostShift_ExHi_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +DEPTH2SPACE_CRD_QINT_TO_QINT_BLK2(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +DEPTH2SPACE_CRD_QINT_TO_QINT_BLK2(I8, I8, vxc_char16, vxc_char16)\n\ +\n\ +__kernel void depth2space_crd_F16toF16_blk2(\n\ + image2d_array_t input, image2d_array_t output, int block_size)\n\ +{\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord_out = (int4)(get_global_id(0), gidy, gidz, gidz);\n\ + int4 coord_in = coord_out >> 1;\n\ + coord_in.z = (gidy & 1) * 2 + gidz * 4;\n\ + coord_in.w = coord_in.z + 1;\n\ + vxc_short8 data0, data1, dst0, dst1;\n\ + VXC_ReadImage2DArray(data0, input, coord_in.xyzz,\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(data1, input, coord_in.xyww,\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(dst0, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniDepth2SpaceF16Blk2_lo_2x8);\n\ + VXC_DP2x8(dst1, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniDepth2SpaceF16Blk2_hi_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, coord_out, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.x += 8;\n\ + VXC_WriteImage2DArray(output, coord_out, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void depth2space_crd_I16toI16_blk2(\n\ + image2d_array_t input, image2d_array_t output, int block_size)\n\ +{\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord_out = (int4)(get_global_id(0), gidy, gidz, gidz);\n\ + int4 coord_in = coord_out >> 1;\n\ + coord_in.z = (gidy & 1) * 2 + gidz * 4;\n\ + coord_in.w = coord_in.z + 1;\n\ + vxc_short8 src0, src1, data0, data1, dst0, dst1;\n\ + VXC_ReadImage2DArray(src0, input, coord_in.xyzz,\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input, coord_in.xyww,\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(data0, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniDepth2SpaceF16Blk2_lo_2x8);\n\ + VXC_DP2x8(data1, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniDepth2SpaceF16Blk2_hi_2x8);\n\ +\n\ + vxc_ushort8 ms0;\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16);\n\ + VXC_DP2x8(dst0, data0, ms0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 1), uniU8MulAndPostShift_0_Lo_2x8);\n\ + VXC_DP2x8(dst1, data1, ms0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 1), uniU8MulAndPostShift_0_Lo_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, coord_out, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.x += 8;\n\ + VXC_WriteImage2DArray(output, coord_out, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +#define DEPTH2SPACE_CRD_QINT_TO_F16_BLK2(src0_type_name, read_type) \\\n\ +__kernel void depth2space_crd_##src0_type_name##toF16_blk2( \\\n\ + image2d_array_t input, image2d_array_t output, int block_size) \\\n\ +{ \\\n\ + int gidy = get_global_id(1); \\\n\ + int gidz = get_global_id(2); \\\n\ + int4 coord_out = (int4)(get_global_id(0), gidy, gidz, gidz); \\\n\ + int4 coord_in = coord_out >> 1; \\\n\ + coord_in.z = (gidy & 1) * 2 + gidz * 4; \\\n\ + coord_in.w = coord_in.z + 1; \\\n\ + read_type src; \\\n\ + VXC_ReadImage2DArray(src, input, coord_in.xyzz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(src, input, coord_in.xyww, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + vxc_half8 tmpDst0, tmpDst1; \\\n\ + vxc_short8 dst0, dst1; \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + VXC_DP2x8(tmpDst0, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniU8MulAndPostShift_ExLo_2x8); \\\n\ + VXC_DP2x8(tmpDst1, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniU8MulAndPostShift_ExHi_2x8); \\\n\ + _viv_asm(COPY, dst0, tmpDst0, 16); \\\n\ + _viv_asm(COPY, dst1, tmpDst1, 16); \\\n\ + VXC_WriteImage2DArray(output, coord_out, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_out.x+=8; \\\n\ + VXC_WriteImage2DArray(output, coord_out, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +DEPTH2SPACE_CRD_QINT_TO_F16_BLK2(U8, vxc_uchar16)\n\ +DEPTH2SPACE_CRD_QINT_TO_F16_BLK2(I8, vxc_char16)\n\ +\n\ +__kernel void depth2space_crd_I16toF16_blk2(\n\ + image2d_array_t input, image2d_array_t output, int block_size)\n\ +{\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord_out = (int4)(get_global_id(0), gidy, gidz, gidz);\n\ + int4 coord_in = coord_out >> 1;\n\ + coord_in.z = (gidy & 1) * 2 + gidz * 4;\n\ + coord_in.w = coord_in.z + 1;\n\ + vxc_short8 src0, src1, data0, data1, dst0, dst1;\n\ + vxc_half8 tmpDst0, tmpDst1;\n\ + VXC_ReadImage2DArray(src0, input, coord_in.xyzz,\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input, coord_in.xyww,\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(data0, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniDepth2SpaceF16Blk2_lo_2x8);\n\ + VXC_DP2x8(data1, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniDepth2SpaceF16Blk2_hi_2x8);\n\ +\n\ + vxc_ushort8 ms0;\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16);\n\ + VXC_DP2x8(tmpDst0, data0, ms0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 1), uniU8MulAndPostShift_0_Lo_2x8);\n\ + VXC_DP2x8(tmpDst1, data1, ms0, VXC_MODIFIER(0, 7, 
0,VXC_RM_TowardZero, 1), uniU8MulAndPostShift_0_Lo_2x8);\n\ + _viv_asm(COPY, dst0, tmpDst0, 16);\n\ + _viv_asm(COPY, dst1, tmpDst1, 16);\n\ + VXC_WriteImage2DArray(output, coord_out, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.x += 8;\n\ + VXC_WriteImage2DArray(output, coord_out, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +#define DEPTH2SPACE_CRD_F16_TO_QINT_BLK2(src1_type_name, write_type) \\\n\ +__kernel void depth2space_crd_F16to##src1_type_name##_blk2( \\\n\ + image2d_array_t input, image2d_array_t output, int block_size) \\\n\ +{ \\\n\ + int gidy = get_global_id(1); \\\n\ + int gidz = get_global_id(2); \\\n\ + int4 coord_out = (int4)(get_global_id(0), gidy, gidz, gidz); \\\n\ + int4 coord_in = coord_out >> 1; \\\n\ + coord_in.z = (gidy & 1) * 2 + gidz * 4; \\\n\ + coord_in.w = coord_in.z + 1; \\\n\ + vxc_short8 src0, src1, data0, data1; \\\n\ + vxc_half8 tmpDst0, tmpDst1; \\\n\ + VXC_ReadImage2DArray(src0, input, coord_in.xyzz, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage2DArray(src1, input, coord_in.xyww, \\\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(data0, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniDepth2SpaceF16Blk2_lo_2x8); \\\n\ + VXC_DP2x8(data1, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniDepth2SpaceF16Blk2_hi_2x8); \\\n\ + \\\n\ + write_type dst; \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + _viv_asm(COPY, tmpDst0, data0, 16); \\\n\ + _viv_asm(COPY, tmpDst1, data1, 16); \\\n\ + VXC_DP2x8(dst, tmpDst0, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_DP2x8(dst, tmpDst1, ms0, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +DEPTH2SPACE_CRD_F16_TO_QINT_BLK2(U8, vxc_uchar16)\n\ +DEPTH2SPACE_CRD_F16_TO_QINT_BLK2(I8, vxc_char16)\n\ +\n\ +__kernel void depth2space_crd_F16toI16_blk2(\n\ + image2d_array_t input, image2d_array_t output, int block_size)\n\ +{\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord_out = (int4)(get_global_id(0), gidy, gidz, gidz);\n\ + int4 coord_in = coord_out >> 1;\n\ + coord_in.z = (gidy & 1) * 2 + gidz * 4;\n\ + coord_in.w = coord_in.z + 1;\n\ + vxc_short8 src0, src1, data0, data1, dst0, dst1;\n\ + vxc_half8 tmpDst0, tmpDst1;\n\ + VXC_ReadImage2DArray(src0, input, coord_in.xyzz,\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage2DArray(src1, input, coord_in.xyww,\n\ + VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + vxc_ushort8 ms0;\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16);\n\ + VXC_DP2x8(data0, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniDepth2SpaceF16Blk2_lo_2x8);\n\ + VXC_DP2x8(data1, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniDepth2SpaceF16Blk2_hi_2x8);\n\ + _viv_asm(COPY, tmpDst0, data0, 16);\n\ + _viv_asm(COPY, tmpDst1, data1, 16);\n\ + VXC_DP2x8(dst0, tmpDst0, ms0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 1), uniU8MulAndPostShift_0_Lo_2x8);\n\ + VXC_DP2x8(dst1, tmpDst1, ms0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 1), uniU8MulAndPostShift_0_Lo_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, coord_out, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.x += 8;\n\ + VXC_WriteImage2DArray(output, 
coord_out, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}"; /* end of depth2space_crd_vx*/ static const char depthwise_conv1d_src0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -3322,6 +4105,11 @@ float4 eltwise_unary_mish(float4 x)\n\ return x;\n\ }\n\ \n\ +float4 eltwise_unary_round(float4 x)\n\ +{\n\ + return convert_float4(convert_int4_rte(x));\n\ +}\n\ +\n\ _viv_uniform float inputScale;\n\ _viv_uniform float inputTail;\n\ _viv_uniform float outputScale;\n\ @@ -3442,7 +4230,17 @@ ELTSISE_UNARY_2D(hard_sigmoid, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_ucha ELTSISE_UNARY_2D(hard_sigmoid, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ ELTSISE_UNARY_2D(hard_sigmoid, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ ELTSISE_UNARY_2D(hard_sigmoid, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ -\n\ +//ROUND\n\ +ELTSISE_UNARY_2D(round, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_2D(round, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_2D(round, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_2D(round, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_2D(round, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_2D(round, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_2D(round, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_2D(round, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_2D(round, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_2D(round, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ \n\ _viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ _viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ @@ -3490,6 +4288,8 @@ ELTSISE_UNARY_BF16_2D(neg)\n\ ELTSISE_UNARY_BF16_2D(mish)\n\ //HARD_SIGMOID\n\ ELTSISE_UNARY_BF16_2D(hard_sigmoid)\n\ +//ROUND\n\ +ELTSISE_UNARY_BF16_2D(round)\n\ "; /* end of eltwise_unary_2d_vx*/ static const char eltwise_unary_3d_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -3561,6 +4361,11 @@ float4 eltwise_unary_mish(float4 x)\n\ return x;\n\ }\n\ \n\ +float4 eltwise_unary_round(float4 x)\n\ +{\n\ + return convert_float4(convert_int4_rte(x));\n\ +}\n\ +\n\ _viv_uniform float inputScale;\n\ _viv_uniform float inputTail;\n\ _viv_uniform float outputScale;\n\ @@ -3681,6 +4486,17 @@ ELTSISE_UNARY_3D(hard_sigmoid, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_ucha ELTSISE_UNARY_3D(hard_sigmoid, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ ELTSISE_UNARY_3D(hard_sigmoid, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ ELTSISE_UNARY_3D(hard_sigmoid, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ +//ROUND\n\ +ELTSISE_UNARY_3D(round, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_3D(round, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_3D(round, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_3D(round, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_3D(round, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_3D(round, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_3D(round, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_3D(round, U8, 
F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_3D(round, I16, I16, vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_3D(round, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ \n\ _viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ _viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ @@ -3726,7 +4542,184 @@ ELTSISE_UNARY_BF16(neg)\n\ //MISH\n\ ELTSISE_UNARY_BF16(mish)\n\ //HARD_SIGMOID\n\ -ELTSISE_UNARY_BF16(hard_sigmoid)"; /* end of eltwise_unary_3d_vx*/ +ELTSISE_UNARY_BF16(hard_sigmoid)\n\ +//ROUND\n\ +ELTSISE_UNARY_BF16(round)"; /* end of eltwise_unary_3d_vx*/ + +static const char erf_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +#define MUL2_RSQRTPI (1.1283791670955126f)\n\ +float eltwise_unary_erf(float x)\n\ +{\n\ + float res = 0;\n\ + float tmp = x;\n\ + float factorial = 1;\n\ + float x_pow = x;\n\ + float one = 1.0f;\n\ + float n = 1;\n\ +\n\ + while (fabs(tmp) > 1e-5)\n\ + {\n\ + res += tmp;\n\ +\n\ + factorial *= n;\n\ + one *= -1;\n\ + x_pow *= x * x;\n\ + tmp = one / factorial * x_pow / ( 2 * n + 1);\n\ +\n\ + n += 1.0f;\n\ + }\n\ + return res * MUL2_RSQRTPI;\n\ +}\n\ +\n\ +_viv_uniform float inputScale;\n\ +_viv_uniform float inputTail;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float outputZP;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uniDatatoFp32Part0_4x4;\n\ +\n\ +#define ELTSISE_UNARY_2D(func_name, src_type_name, dst_type_name, src_type, \\\n\ + src_copy_type, convert_type, dst_type, dst_copy_type) \\\n\ + __kernel void func_name##_##src_type_name##to##dst_type_name##_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + src_type src0; \\\n\ + src_copy_type src1; \\\n\ + VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src1, src0, 16); \\\n\ + \\\n\ + float4 vecA; \\\n\ + VXC_DP4x4(vecA, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFp32Part0_4x4); \\\n\ + vecA = vecA * inputScale + inputTail; \\\n\ + vecA.x = eltwise_unary_##func_name(vecA.x); \\\n\ + vecA.y = eltwise_unary_##func_name(vecA.y); \\\n\ + vecA.z = eltwise_unary_##func_name(vecA.z); \\\n\ + vecA.w = eltwise_unary_##func_name(vecA.w); \\\n\ + vecA = vecA * outputScale + outputZP; \\\n\ + \\\n\ + convert_type dst0; \\\n\ + _viv_asm(CONV_RTE, dst0, vecA); \\\n\ + dst_type dst2; \\\n\ + VXC_DP2x8(dst2, dst0, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + dst_copy_type dst; \\\n\ + _viv_asm(COPY, dst, dst2, 16); \\\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +//ERF\n\ +ELTSISE_UNARY_2D(erf, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_2D(erf, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_2D(erf, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_2D(erf, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_2D(erf, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_2D(erf, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_2D(erf, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_2D(erf, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_2D(erf, I16, I16, vxc_short8, 
vxc_short8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_2D(erf, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ +\n\ +\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +\n\ +#define ELTSISE_UNARY_BF16_2D(func_name) \\\n\ + __kernel void func_name##_BF16toBF16_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + vxc_ushort8 src0, src1, dst; \\\n\ + VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + float4 vecA; \\\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, vecA, src1, 16); \\\n\ + vecA.x = eltwise_unary_##func_name(vecA.x); \\\n\ + vecA.y = eltwise_unary_##func_name(vecA.y); \\\n\ + vecA.z = eltwise_unary_##func_name(vecA.z); \\\n\ + vecA.w = eltwise_unary_##func_name(vecA.w); \\\n\ + \\\n\ + _viv_asm(COPY, src0, vecA, 16); \\\n\ + \\\n\ + VXC_DP2x8(dst, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \\\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +//EXP\n\ +ELTSISE_UNARY_BF16_2D(erf)\n\ +\n\ +#define ELTSISE_UNARY_3D(func_name, src_type_name, dst_type_name, src_type, \\\n\ + src_copy_type, convert_type, dst_type, dst_copy_type) \\\n\ +__kernel void func_name##_##src_type_name##to##dst_type_name( \\\n\ +__read_only image2d_array_t input, \\\n\ +__write_only image2d_array_t output \\\n\ +) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + src_type src0; \\\n\ + src_copy_type src1; \\\n\ + VXC_ReadImage2DArray(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src1, src0, 16); \\\n\ + \\\n\ + float4 vecA; \\\n\ + VXC_DP4x4(vecA, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDatatoFp32Part0_4x4); \\\n\ + vecA = vecA * inputScale + inputTail; \\\n\ + vecA.x = eltwise_unary_##func_name(vecA.x); \\\n\ + vecA.y = eltwise_unary_##func_name(vecA.y); \\\n\ + vecA.z = eltwise_unary_##func_name(vecA.z); \\\n\ + vecA.w = eltwise_unary_##func_name(vecA.w); \\\n\ + vecA = vecA * outputScale + outputZP; \\\n\ + \\\n\ + convert_type dst0; \\\n\ + _viv_asm(CONV_RTE, dst0, vecA); \\\n\ + dst_type dst2; \\\n\ + VXC_DP2x8(dst2, dst0, dst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + dst_copy_type dst; \\\n\ + _viv_asm(COPY, dst, dst2, 16); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +//ERF\n\ +ELTSISE_UNARY_3D(erf, F16, F16, vxc_short8, vxc_half8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_3D(erf, F16, I8, vxc_short8, vxc_half8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_3D(erf, F16, U8, vxc_short8, vxc_half8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_3D(erf, F16, I16, vxc_short8, vxc_half8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_3D(erf, I8, I8, vxc_char8, vxc_char8, int4, vxc_char8, vxc_char8)\n\ +ELTSISE_UNARY_3D(erf, I8, F16, vxc_char8, vxc_char8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_3D(erf, U8, U8, vxc_uchar8, vxc_uchar8, int4, vxc_uchar8, vxc_uchar8)\n\ +ELTSISE_UNARY_3D(erf, U8, F16, vxc_uchar8, vxc_uchar8, half4, vxc_half8, vxc_short8)\n\ +ELTSISE_UNARY_3D(erf, I16, I16, 
vxc_short8, vxc_short8, int4, vxc_short8, vxc_short8)\n\ +ELTSISE_UNARY_3D(erf, I16, F16, vxc_short8, vxc_short8, half4, vxc_half8, vxc_short8)\n\ +\n\ +#define ELTSISE_UNARY_BF16_3D(func_name) \\\n\ + __kernel void func_name##_BF16toBF16( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + vxc_ushort8 src0, src1, dst; \\\n\ + VXC_ReadImage2DArray(src0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + float4 vecA; \\\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, vecA, src1, 16); \\\n\ + vecA.x = eltwise_unary_##func_name(vecA.x); \\\n\ + vecA.y = eltwise_unary_##func_name(vecA.y); \\\n\ + vecA.z = eltwise_unary_##func_name(vecA.z); \\\n\ + vecA.w = eltwise_unary_##func_name(vecA.w); \\\n\ + \\\n\ + _viv_asm(COPY, src0, vecA, 16); \\\n\ + \\\n\ + VXC_DP2x8(dst, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +//ERF\n\ +ELTSISE_UNARY_BF16_3D(erf)"; /* end of erf_vx*/ static const char floordiv_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -4113,6 +5106,164 @@ __kernel void gather_F16toF16_axis0(\n\ }\n\ "; /* end of gather_vx*/ +static const char gather_array_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int indices_num;\n\ +_viv_uniform VXC_512Bits uniExtraCopyDpKeepinEvis_2x8;\n\ +\n\ +__kernel void gather_I8toI8_array(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ + int gidz = get_global_id(2); // block_num\n\ +\n\ + int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ + int4 indice = read_imagei(input1, coord_in.xy);\n\ + coord_in.w = gidz * axis_num + indice.x;\n\ +\n\ + Image img1 = create_image_from_image2d(input0, 1);\n\ + Image img2 = create_image_from_image2d(output, 1);\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord_in.zw);\n\ + __global vxc_char16* data_ptr = (__global vxc_char16*)input_ptr;\n\ + vxc_char16 src = data_ptr[0];\n\ + int2 coord = (int2)(gidx, gidz * indices_num + gidy);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord);\n\ + __global vxc_char16* dst_ptr = (__global vxc_char16*)output_ptr;\n\ + dst_ptr[0] = src;\n\ +}\n\ +\n\ +__kernel void gather_U8toU8_array(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ + int gidz = get_global_id(2); // block_num\n\ +\n\ + int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ + int4 indice = read_imagei(input1, coord_in.xy);\n\ + coord_in.w = gidz * axis_num + indice.x;\n\ +\n\ + Image img1 = create_image_from_image2d(input0, 1);\n\ + Image img2 = create_image_from_image2d(output, 1);\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord_in.zw);\n\ + __global vxc_uchar16* data_ptr = (__global vxc_uchar16*)input_ptr;\n\ + vxc_uchar16 src = data_ptr[0];\n\ + int2 coord = (int2)(gidx, gidz 
* indices_num + gidy);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord);\n\ + __global vxc_uchar16* dst_ptr = (__global vxc_uchar16*)output_ptr;\n\ + dst_ptr[0] = src;\n\ +}\n\ +\n\ +__kernel void gather_I16toI16_array(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ + int gidz = get_global_id(2); // block_num\n\ +\n\ + int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ +\n\ +\n\ + int4 indice = read_imagei(input1, coord_in.xy);\n\ + coord_in.w = gidz * axis_num + indice.x;\n\ +\n\ + Image img1 = create_image_from_image2d(input0, 2);\n\ + Image img2 = create_image_from_image2d(output, 2);\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord_in.zw);\n\ + __global vxc_short8* data_ptr = (__global vxc_short8*)input_ptr;\n\ + vxc_short8 src = data_ptr[0];\n\ + int2 coord = (int2)(gidx, gidz * indices_num + gidy);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord);\n\ + __global vxc_short8* dst_ptr = (__global vxc_short8*)output_ptr;\n\ + dst_ptr[0] = src;\n\ +}\n\ +\n\ +__kernel void gather_F16toF16_array(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int block_num,\n\ + int axis_num\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ + int gidz = get_global_id(2); // block_num\n\ +\n\ + int4 coord_in = (int4)(gidy, 0, gidx, 0);\n\ +\n\ + int4 indice = read_imagei(input1, coord_in.xy);\n\ + coord_in.w = gidz * axis_num + indice.x;\n\ +\n\ + Image img1 = create_image_from_image2d(input0, 2);\n\ + Image img2 = create_image_from_image2d(output, 2);\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord_in.zw);\n\ + __global vxc_short8* data_ptr = (__global vxc_short8*)input_ptr;\n\ + vxc_short8 src = data_ptr[0];\n\ + int2 coord = (int2)(gidx, gidz * indices_num + gidy);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord);\n\ + __global vxc_short8* dst_ptr = (__global vxc_short8*)output_ptr;\n\ + dst_ptr[0] = src;\n\ +}\n\ +\n\ +#define GATHER_AXIS0_ARRAY(src0_type_name, read_type, data_type, write_type) \\\n\ +__kernel void gather_##src0_type_name##to##src0_type_name##_axis0_array( \\\n\ + __read_only image2d_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_t output, \\\n\ + int block_size, \\\n\ + int block_num, \\\n\ + int axis_num \\\n\ + ) \\\n\ +{ \\\n\ + Image img0 = create_image_from_image2d(input0, 1); \\\n\ + Image img1 = create_image_from_image2d(input1, 4); \\\n\ + Image img2 = create_image_from_image2d(output, 1); \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \\\n\ + uchar* index_ptr = get_image_ptr_from_coord(img1, coord.xz); \\\n\ + __global int* index = (__global int*)index_ptr; \\\n\ + int4 indices = vload4(0, index); \\\n\ + \\\n\ + read_type src, dst; \\\n\ + \\\n\ + uchar* input_ptr = get_image_ptr_from_coord(img0, coord.zy); \\\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.xy); \\\n\ + __global data_type* data_ptr = (__global data_type*)input_ptr; \\\n\ + __global write_type* out_ptr = (__global write_type*)output_ptr; \\\n\ + src.s0 = data_ptr[indices.x]; \\\n\ + src.s1 = data_ptr[indices.y]; \\\n\ + src.s2 = data_ptr[indices.z]; \\\n\ + src.s3 = data_ptr[indices.w]; \\\n\ + \\\n\ + 
VXC_DP2x8(dst, src, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniExtraCopyDpKeepinEvis_2x8); \\\n\ + out_ptr[0] = dst.s0123; \\\n\ +}\n\ +GATHER_AXIS0_ARRAY(U8, vxc_uchar16, uchar, vxc_uchar4)\n\ +GATHER_AXIS0_ARRAY(I8, vxc_char16, char, vxc_char4)\n\ +GATHER_AXIS0_ARRAY(I16, vxc_short8, short, vxc_short4)\n\ +GATHER_AXIS0_ARRAY(F16, vxc_short8, short, vxc_short4)"; /* end of gather_array_vx*/ + static const char gather_mix_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform int indices_num;\n\ @@ -4334,7 +5485,10 @@ __kernel void gather_nd_I8toI8_1D(\n\ int gidy = get_global_id(1); // indices_num\n\ \n\ int4 coord = (int4)(0, gidy, gidx, 0);\n\ - int4 indice = read_imagei(input1, coord.xy);\n\ + Image img = create_image_from_image2d(input1, 4);\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ coord.w = indice.x;\n\ \n\ vxc_char16 src;\n\ @@ -4355,7 +5509,10 @@ __kernel void gather_nd_U8toU8_1D(\n\ int gidy = get_global_id(1); // indices_num\n\ \n\ int4 coord = (int4)(0, gidy, gidx, 0);\n\ - int4 indice = read_imagei(input1, coord.xy);\n\ + Image img = create_image_from_image2d(input1, 4);\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ coord.w = indice.x;\n\ \n\ vxc_uchar16 src;\n\ @@ -4375,7 +5532,10 @@ __kernel void gather_nd_I16toI16_1D(\n\ int gidy = get_global_id(1); // indices_num\n\ \n\ int4 coord = (int4)(0, gidy, gidx, 0);\n\ - int4 indice = read_imagei(input1, coord.xy);\n\ + Image img = create_image_from_image2d(input1, 4);\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ coord.w = indice.x;\n\ \n\ vxc_short8 src;\n\ @@ -4395,7 +5555,10 @@ __kernel void gather_nd_F16toF16_1D(\n\ int gidy = get_global_id(1); // indices_num\n\ \n\ int4 coord = (int4)(0, gidy, gidx, 0);\n\ - int4 indice = read_imagei(input1, coord.xy);\n\ + Image img = create_image_from_image2d(input1, 4);\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ coord.w = indice.x;\n\ \n\ vxc_short8 src;\n\ @@ -4418,7 +5581,10 @@ __kernel void gather_nd_I8toI8_2D(\n\ int gidy = get_global_id(1); // indices_num\n\ \n\ int4 coord = (int4)(0, gidy, gidx, 0);\n\ - int4 indice = read_imagei(input1, coord.xy);\n\ + Image img = create_image_from_image2d(input1, 4);\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ indice.x = indice.x * block_size + gidx;\n\ \n\ vxc_char16 src;\n\ @@ -4439,7 +5605,10 @@ __kernel void gather_nd_U8toU8_2D(\n\ int gidy = get_global_id(1); // indices_num\n\ \n\ int4 coord = (int4)(0, gidy, gidx, 0);\n\ - int4 indice = read_imagei(input1, coord.xy);\n\ + Image img = create_image_from_image2d(input1, 4);\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ indice.x = indice.x * block_size + gidx;\n\ \n\ vxc_uchar16 src;\n\ @@ -4459,7 +5628,10 @@ __kernel void gather_nd_I16toI16_2D(\n\ int gidy = get_global_id(1); // indices_num\n\ \n\ int4 coord = (int4)(0, gidy, gidx, 0);\n\ - int4 indice = read_imagei(input1, coord.xy);\n\ + Image img = create_image_from_image2d(input1, 4);\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ indice.x = indice.x * block_size + gidx;\n\ \n\ vxc_short8 src;\n\ @@ -4479,7 +5651,10 @@ 
__kernel void gather_nd_F16toF16_2D(\n\ int gidy = get_global_id(1); // indices_num\n\ \n\ int4 coord = (int4)(0, gidy, gidx, 0);\n\ - int4 indice = read_imagei(input1, coord.xy);\n\ + Image img = create_image_from_image2d(input1, 4);\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ indice.x = indice.x * block_size + gidx;\n\ \n\ vxc_short8 src;\n\ @@ -4516,7 +5691,10 @@ __kernel void gather_nd_##src0_type_name##toF16_2D( \\\n\ int gidy = get_global_id(1); \\\n\ \\\n\ int4 coord = (int4)(0, gidy, gidx, 0); \\\n\ - int4 indice = read_imagei(input1, coord.xy); \\\n\ + Image img = create_image_from_image2d(input1, 4); \\\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \\\n\ + int4 indice = ((int4 *)indice_ptr)[0]; \\\n\ + \\\n\ indice.x = indice.x * block_size + gidx; \\\n\ \\\n\ read_type src; \\\n\ @@ -4547,7 +5725,10 @@ __kernel void gather_nd_F16to##src1_type_name##_2D( \\\n\ int gidy = get_global_id(1); \\\n\ \\\n\ int4 coord = (int4)(0, gidy, gidx, 0); \\\n\ - int4 indice = read_imagei(input1, coord.xy); \\\n\ + Image img = create_image_from_image2d(input1, 4); \\\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \\\n\ + int4 indice = ((int4 *)indice_ptr)[0]; \\\n\ + \\\n\ indice.x = indice.x * block_size + gidx; \\\n\ \\\n\ vxc_short8 src; \\\n\ @@ -4580,7 +5761,10 @@ __kernel void gather_nd_I8toI8_3D(\n\ int gidy = get_global_id(1); // indices_num\n\ \n\ int4 coord = (int4)(0, gidy, gidx, 0);\n\ - int4 indice = read_imagei(input1, coord.xy);\n\ + Image img = create_image_from_image2d(input1, 4);\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ indice.x = indice.x * block_size + gidx;\n\ indice.w = 0;\n\ \n\ @@ -4602,7 +5786,11 @@ __kernel void gather_nd_U8toU8_3D(\n\ int gidy = get_global_id(1); // indices_num\n\ \n\ int4 coord = (int4)(0, gidy, gidx, 0);\n\ - int4 indice = read_imagei(input1, coord.xy);\n\ +\n\ + Image img = create_image_from_image2d(input1, 4);\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ indice.x = indice.x * block_size + gidx;\n\ indice.w = 0;\n\ \n\ @@ -4623,7 +5811,10 @@ __kernel void gather_nd_I16toI16_3D(\n\ int gidy = get_global_id(1); // indices_num\n\ \n\ int4 coord = (int4)(0, gidy, gidx, 0);\n\ - int4 indice = read_imagei(input1, coord.xy);\n\ + Image img = create_image_from_image2d(input1, 4);\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ indice.x = indice.x * block_size + gidx;\n\ indice.w = 0;\n\ \n\ @@ -4644,7 +5835,10 @@ __kernel void gather_nd_F16toF16_3D(\n\ int gidy = get_global_id(1); // indices_num\n\ \n\ int4 coord = (int4)(0, gidy, gidx, 0);\n\ - int4 indice = read_imagei(input1, coord.xy);\n\ + Image img = create_image_from_image2d(input1, 4);\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ indice.x = indice.x * block_size + gidx;\n\ indice.w = 0;\n\ \n\ @@ -4652,6 +5846,7 @@ __kernel void gather_nd_F16toF16_3D(\n\ VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ }\n\ +\n\ "; /* end of gather_nd_3d_vx*/ static const char gather_nd_3d_mix_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -4679,7 +5874,10 @@ __kernel void 
gather_nd_##src0_type_name##toF16_3D( \\\n\ int gidy = get_global_id(1); \\\n\ \\\n\ int4 coord = (int4)(0, gidy, gidx, 0); \\\n\ - int4 indice = read_imagei(input1, coord.xy); \\\n\ + Image img = create_image_from_image2d(input1, 4); \\\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \\\n\ + int4 indice = ((int4 *)indice_ptr)[0]; \\\n\ + \\\n\ indice.x = indice.x * block_size + gidx; \\\n\ indice.w = 0; \\\n\ \\\n\ @@ -4711,7 +5909,10 @@ __kernel void gather_nd_F16to##src1_type_name##_3D( \\\n\ int gidy = get_global_id(1); \\\n\ \\\n\ int4 coord = (int4)(0, gidy, gidx, 0); \\\n\ - int4 indice = read_imagei(input1, coord.xy); \\\n\ + Image img = create_image_from_image2d(input1, 4); \\\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \\\n\ + int4 indice = ((int4 *)indice_ptr)[0]; \\\n\ + \\\n\ indice.x = indice.x * block_size + gidx; \\\n\ indice.w = 0; \\\n\ \\\n\ @@ -4760,7 +5961,10 @@ __kernel void gather_nd_##src0_type_name##toF16_1D( \\\n\ int gidy = get_global_id(1); \\\n\ \\\n\ int4 coord = (int4)(0, gidy, gidx, 0); \\\n\ - int4 indice = read_imagei(input1, coord.xy); \\\n\ + Image img = create_image_from_image2d(input1, 4); \\\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \\\n\ + int4 indice = ((int4 *)indice_ptr)[0]; \\\n\ + \\\n\ coord.w = indice.x; \\\n\ \\\n\ read_type src; \\\n\ @@ -4791,7 +5995,10 @@ __kernel void gather_nd_F16to##src1_type_name##_1D( \\\n\ int gidy = get_global_id(1); \\\n\ \\\n\ int4 coord = (int4)(0, gidy, gidx, 0); \\\n\ - int4 indice = read_imagei(input1, coord.xy); \\\n\ + Image img = create_image_from_image2d(input1, 4); \\\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \\\n\ + int4 indice = ((int4 *)indice_ptr)[0]; \\\n\ + \\\n\ coord.w = indice.x; \\\n\ \\\n\ vxc_short8 src; \\\n\ @@ -4811,6 +6018,1350 @@ GATHER_ND_F16_TO_QINT_1D(I16, vxc_short8)\n\ \n\ "; /* end of gather_nd_mix_vx*/ +static const char group_normalization_f16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int width;\n\ +_viv_uniform int height;\n\ +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2;\n\ +_viv_uniform VXC_512Bits uniConvertEndInt16Fp32_4x4;\n\ +\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform int output_ZP;\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_F16(\n\ + image2d_array_t input, image2d_array_t output, float eps, int is2D)\n\ +{\n\ + int gidx = get_global_id(0) << 3;\n\ + int lidx = get_local_id(0);\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(gidx, 0, gidz, 0);\n\ + vxc_short8 src0;\n\ + vxc_half8 in_h;\n\ + vxc_float4 sumsqr;\n\ + vxc_float4 tmpSumSqr = (vxc_float4)(0);\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ +\n\ + if(gidx < width)\n\ + {\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniFp16SumSqr_dp8x2);\n\ + tmpSumSqr += sumsqr;\n\ + }\n\ + }\n\ +\n\ + 
lcl_sum[lidx] = tmpSumSqr.x;\n\ + lcl_sqr[lidx] = tmpSumSqr.y;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + float sum = 0;\n\ + float sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + //sum += lcl_sum[i];\n\ + //sqr += lcl_sqr[i];\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 data = (float4)(sum, sqr, 0, 0);\n\ + write_imagef(output, coord_out, data);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_F16_2D(\n\ + image2d_array_t input, image2d_array_t output, float eps, int is2D)\n\ +{\n\ + int gidx = get_global_id(0) << 3;\n\ + int lidx = get_local_id(0);\n\ +\n\ + int2 coord = (int2)(gidx, get_global_id(1));\n\ + vxc_short8 src0;\n\ + vxc_half8 in_h;\n\ + vxc_float4 sumsqr = (vxc_float4)(0);\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + if(gidx < width)\n\ + {\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniFp16SumSqr_dp8x2);\n\ + }\n\ +\n\ + lcl_sum[lidx] = sumsqr.x;\n\ + lcl_sqr[lidx] = sumsqr.y;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, get_global_id(1), 0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + float sum = 0;\n\ + float sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + //sum += lcl_sum[i];\n\ + //sqr += lcl_sqr[i];\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 data = (float4)(sum, sqr, 0, 0);\n\ + write_imagef(output, coord_out, data);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16toF16(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output,\n\ + float eps, int is2D, float rSpaceOrg, int pStride)\n\ +{\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ + vxc_short8 src0;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h, in_h;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + vxc_short8 outval;\n\ + half4 tmpVal0, tmpVal1;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ + vxc_half8 dst;\n\ +\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + VXC_DP4x4(tmpData0, in_h, in_h, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertEndInt16Fp32_4x4);\n\ +\n\ + vxc_float4 norm;\n\ + norm = scale_vari * tmpData0 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = scale_vari * tmpData1 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16toF16_2D(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output,\n\ + float eps, int is2D, float rSpaceOrg, int pStride)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int2 coord = (int2)(get_global_id(0), gidz);\n\ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ + vxc_short8 src0;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h, in_h;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + vxc_short8 outval;\n\ + half4 tmpVal0, tmpVal1;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ + vxc_half8 dst;\n\ +\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertEndInt16Fp32_4x4);\n\ + vxc_float4 norm;\n\ + norm = scale_vari * tmpData0 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = scale_vari * tmpData1 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16toU8(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output,\n\ + float eps, int is2D, float rSpaceOrg, int pStride)\n\ +{\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ + vxc_short8 src0;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h, in_h;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + 
VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + vxc_uchar16 outval;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ + float alpha = outputScale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP;\n\ +\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertEndInt16Fp32_4x4);\n\ +\n\ + vxc_float4 norm;\n\ + norm = alpha * tmpData0 + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = alpha * tmpData1 + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16toU8_2D(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output,\n\ + float eps, int is2D, float rSpaceOrg, int pStride)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int2 coord = (int2)(get_global_id(0), gidz);\n\ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ + vxc_short8 src0;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h, in_h;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + vxc_uchar16 outval;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ + float alpha = outputScale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP;\n\ +\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertEndInt16Fp32_4x4);\n\ + vxc_float4 norm;\n\ + norm = alpha * tmpData0 + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = alpha * tmpData1 + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}"; /* end of group_normalization_f16_vx*/ + +static const char group_normalization_i16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int width;\n\ +_viv_uniform int height;\n\ +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ +_viv_uniform VXC_512Bits 
uniConvertHalfToFp16_2x8;\n\ +\n\ +_viv_uniform float inFlScale_s2;\n\ +_viv_uniform float input_fl_scale;\n\ +_viv_uniform float inOut_fl_scale;\n\ +_viv_uniform float output_fl_scale;\n\ +\n\ +_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2;\n\ +_viv_uniform VXC_512Bits uniConvertInt16Fp32Fst_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt16Fp32Secd_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toInt16_2x8;\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_I16(\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + float eps,\n\ + int is2D)\n\ +{\n\ + int gidx = get_global_id(0) << 3;\n\ + int lidx = get_local_id(0);\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(gidx, 0, gidz, 0);\n\ + vxc_short8 src0;\n\ + float sum = 0, sqr = 0;\n\ + vxc_float4 sumsqr = (vxc_float4)(0);\n\ + vxc_float4 tmpSumSqr = (vxc_float4)(0);\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ + if(gidx < width)\n\ + {\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniInt16SumSqr_dp8x2);\n\ + //tmpSumSqr += sumsqr;\n\ + tmpSumSqr.x += sumsqr.x;\n\ + sqr += (sumsqr.y * inFlScale_s2);\n\ + }\n\ + sum = tmpSumSqr.x * input_fl_scale;\n\ + //sqr = tmpSumSqr.y * inFlScale_s2;\n\ + }\n\ +\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + sum = 0; sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + //sum += lcl_sum[i];\n\ + //sqr += lcl_sqr[i];\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 data = (float4)(sum, sqr, 0, 0);\n\ + write_imagef(output, coord_out, data);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_I16_2D(\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + float eps,\n\ + int is2D)\n\ +{\n\ + int gidx = get_global_id(0) << 3;\n\ + int lidx = get_local_id(0);\n\ + int gidz = get_global_id(1);\n\ +\n\ + int2 coord = (int2)(gidx, gidz);\n\ + vxc_short8 src0;\n\ + float sum = 0, sqr = 0;\n\ + vxc_float4 sumsqr = (vxc_float4)(0);\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + if(gidx < width)\n\ + {\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniInt16SumSqr_dp8x2);\n\ + sqr = sumsqr.y * inFlScale_s2;\n\ + sum = sumsqr.x * input_fl_scale;\n\ + //sqr = tmpSumSqr.y * inFlScale_s2;\n\ + }\n\ +\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local 
float4*)lcl_sqr;\n\ +\n\ + sum = 0; sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + //sum += lcl_sum[i];\n\ + //sqr += lcl_sqr[i];\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 data = (float4)(sum, sqr, 0, 0);\n\ + write_imagef(output, coord_out, data);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16toF16(\n\ + image2d_array_t input,\n\ + image2d_t bias,\n\ + image2d_t scale,\n\ + image2d_t meanVari,\n\ + image2d_array_t output,\n\ + float eps,\n\ + int is2D,\n\ + float rSpaceOrg, int pStride)\n\ +{\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ + vxc_short8 src0;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + vxc_short8 outval;\n\ + half4 tmpVal0, tmpVal1;\n\ + float alpha = input_fl_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ + vxc_half8 dst;\n\ +\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt16Fp32Fst_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt16Fp32Secd_4x4);\n\ +\n\ + vxc_float4 norm;\n\ + norm = alpha * tmpData0 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData1 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16toF16_2D(\n\ + image2d_array_t input,\n\ + image2d_t bias,\n\ + image2d_t scale,\n\ + image2d_t meanVari,\n\ + image2d_array_t output,\n\ + float eps,\n\ + int is2D,\n\ + float rSpaceOrg, int pStride)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int2 coord = (int2)(get_global_id(0), gidz);\n\ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ + vxc_short8 src0;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + 
UniFP16toFP32Lo4_dp4x4);\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + vxc_short8 outval;\n\ + half4 tmpVal0, tmpVal1;\n\ + float alpha = input_fl_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ + vxc_half8 dst;\n\ +\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt16Fp32Fst_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt16Fp32Secd_4x4);\n\ + vxc_float4 norm;\n\ + norm = alpha * tmpData0 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData1 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16toI16(\n\ + image2d_array_t input,\n\ + image2d_t bias,\n\ + image2d_t scale,\n\ + image2d_t meanVari,\n\ + image2d_array_t output,\n\ + float eps,\n\ + int is2D,\n\ + float rSpaceOrg, int pStride)\n\ +{\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ + vxc_short8 src0, src2;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + float alpha = inOut_fl_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale;\n\ +\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt16Fp32Fst_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt16Fp32Secd_4x4);\n\ + vxc_float4 norm;\n\ + norm = tmpData0 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData1 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toInt16_2x8);\n\ + VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16toI16_2D(\n\ + image2d_array_t input,\n\ + image2d_t bias,\n\ + image2d_t scale,\n\ + image2d_t meanVari,\n\ + image2d_array_t output,\n\ + float eps,\n\ + int is2D,\n\ + float rSpaceOrg, int pStride)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int2 coord = (int2)(get_global_id(0), gidz);\n\ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ + vxc_short8 src0, src2;\n\ + vxc_short8 src1;\n\ + vxc_half8 
scale_h;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + float alpha = inOut_fl_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale;\n\ +\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt16Fp32Fst_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt16Fp32Secd_4x4);\n\ + vxc_float4 norm;\n\ + norm = tmpData0 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData1 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toInt16_2x8);\n\ + VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}"; /* end of group_normalization_i16_vx*/ + +static const char group_normalization_i8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int width;\n\ +_viv_uniform int height;\n\ +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniSumInt8_16x1;\n\ +_viv_uniform VXC_512Bits uniSqrSumInt8_16x1;\n\ +_viv_uniform float inFlScale_s2;\n\ +_viv_uniform float input_fl_scale;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertDirInt8Fp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertEndInt8Fp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertTrdInt8Fp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertFthInt8Fp32_4x4;\n\ +\n\ +_viv_uniform float inOut_fl_scale;\n\ +_viv_uniform float output_fl_scale;\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_I8(\n\ + image2d_array_t input, image2d_array_t output, float eps, int is2D)\n\ +{\n\ + int gidx = get_global_id(0) << 4;\n\ + int lidx = get_local_id(0);\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(gidx, 0, gidz, 0);\n\ + vxc_char16 src0;\n\ + float sum = 0, sqr = 0;\n\ + int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ +\n\ + if(gidx < width)\n\ + {\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumInt8_16x1);\n\ + tmpSum += (tmpSum1);\n\ + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSumInt8_16x1);\n\ + tmpSqr += (tmpSqr1);\n\ + }\n\ + sqr = tmpSqr * inFlScale_s2;\n\ + sum = tmpSum * 
input_fl_scale;\n\ + }\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + sum = 0; sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 data = (float4)(sum, sqr, 0, 0);\n\ + write_imagef(output, coord_out, data);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_I8_2D(\n\ + image2d_array_t input, image2d_array_t output, float eps, int is2D)\n\ +{\n\ + int gidx = get_global_id(0) << 4;\n\ + int lidx = get_local_id(0);\n\ + int gidz = get_global_id(1);\n\ +\n\ + int2 coord = (int2)(gidx, gidz);\n\ + vxc_char16 src0;\n\ + float sum = 0, sqr = 0;\n\ + int tmpSum1, tmpSqr1;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + if(gidx < width)\n\ + {\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumInt8_16x1);\n\ + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSumInt8_16x1);\n\ + sqr = tmpSqr1 * inFlScale_s2;\n\ + sum = tmpSum1 * input_fl_scale;\n\ + }\n\ +\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + sum = 0; sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 data = (float4)(sum, sqr, 0, 0);\n\ + write_imagef(output, coord_out, data);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8toF16(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ +{\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ + vxc_char16 src0;\n\ + vxc_short8 src1, outval;\n\ + vxc_half8 scale_h, dst;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ + half4 tmpVal0, tmpVal1;\n\ + float alpha = input_fl_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ +\n\ + VXC_DP4x4(tmpData0, src0, src0, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4);\n\ +\n\ + norm = alpha * tmpData0 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData1 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord.x += 8;\n\ + norm = alpha * tmpData2 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData3 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8toF16_2D(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int2 coord = (int2)(get_global_id(0), gidz);\n\ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ + vxc_char16 src0;\n\ + vxc_short8 src1, outval;\n\ + vxc_half8 scale_h, dst;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ + half4 tmpVal0, tmpVal1;\n\ + float alpha = input_fl_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ +\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4);\n\ + norm = alpha * tmpData0 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData1 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.x += 8;\n\ + norm = alpha * tmpData2 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData3 + bias_val;\n\ + 
_viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8toI8(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ +{\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ + vxc_char16 src0, src2;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ + float alpha = inOut_fl_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale;\n\ +\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4);\n\ + norm = tmpData0 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData1 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + norm = tmpData2 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData3 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8toI8_2D(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int2 coord = (int2)(get_global_id(0), gidz);\n\ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ + vxc_char16 src0, src2;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + 
bias_f = read_imagef(bias, coord_para.xy);\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ + float alpha = inOut_fl_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale;\n\ +\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4);\n\ + norm = tmpData0 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData1 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + norm = tmpData2 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData3 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of group_normalization_i8_vx*/ + +static const char group_normalization_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int width;\n\ +_viv_uniform int height;\n\ +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniSumU8_16x1;\n\ +_viv_uniform VXC_512Bits uniSqrSum_16x1;\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform int inputZP;\n\ +_viv_uniform int sumInZp;\n\ +_viv_uniform int tmpZp1;\n\ +_viv_uniform float e2InScale;\n\ +_viv_uniform float rowSumScale;\n\ +_viv_uniform float scale_inOut;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform int output_ZP;\n\ +\n\ +_viv_uniform VXC_512Bits uniResetFp32_4x4;\n\ +_viv_uniform int group_stride;\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_U8(\n\ + image2d_array_t input, image2d_array_t output, float eps, int is2D)\n\ +{\n\ + int gidx = get_global_id(0) << 4;\n\ + int lidx = get_local_id(0);\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(gidx, 0, gidz, 0);\n\ + vxc_uchar16 src0;\n\ + float sum = 0, sqr = 0;\n\ + int tmpSum = 0, tmpSqr = 0, tmpSum1 = 0, tmpSqr1 = 0;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ + if(gidx < width)\n\ + {\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), 
\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ + tmpSum += (tmpSum1);\n\ + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);\n\ + tmpSqr += (tmpSqr1 + tmpZp1 * tmpSum1);\n\ + }\n\ + sqr += (tmpSqr * e2InScale + rowSumScale);\n\ + sum = (tmpSum + sumInZp) * input_scale;\n\ + }\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ + sum = 0; sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ + float4 data = (float4)(sum, sqr, 0, 0);\n\ + write_imagef(output, coord_out, data);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_sumsqr_U8_2D(\n\ + image2d_array_t input, image2d_array_t output, float eps, int is2D)\n\ +{\n\ + int gidx = get_global_id(0) << 4;\n\ + int lidx = get_local_id(0);\n\ +\n\ + int2 coord = (int2)(gidx, get_global_id(1));\n\ + vxc_uchar16 src0;\n\ + float sum = 0, sqr = 0;\n\ + int tmpSqr, tmpSum1, tmpSqr1;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ + if(gidx < width)\n\ + {\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);\n\ + tmpSqr = tmpSqr1 + tmpZp1 * tmpSum1;\n\ + sqr = (tmpSqr * e2InScale + rowSumScale);\n\ + sum = (tmpSum1 + sumInZp) * input_scale;\n\ + }\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, get_global_id(1), 0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ + sum = 0; sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ + float4 data = (float4)(sum, sqr, 0, 0);\n\ + write_imagef(output, coord_out, data);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_meanvari(\n\ + image2d_t input, image2d_t output, float eps, float group_ratio)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int lidx = get_local_id(0);\n\ +\n\ + int2 coord = (int2)(gidx, get_global_id(1));\n\ + vxc_uchar16 src0;\n\ + float2 sum_sqr = (float2)(0);\n\ + vxc_float4 mean_vari;\n\ + VXC_DP4x4(mean_vari, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniResetFp32_4x4);\n\ +\n\ + __local float2 lcl_data[16];\n\ + __local float2 lcl_sum[4];\n\ +\n\ + for(; coord.x < group_stride; coord.x += 64)\n\ + {\n\ + mean_vari += read_imagef(input, coord);\n\ + }\n\ + lcl_data[lidx] = mean_vari.xy;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ + if(lidx < 4)\n\ + {\n\ + float2 tmpSum = (float2)(0);\n\ + for(int i = lidx; i < 16; i+=4)\n\ + {\n\ + tmpSum += lcl_data[i];\n\ + }\n\ + lcl_sum[lidx] = tmpSum;\n\ + }\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ + if(lidx == 0)\n\ + {\n\ + for(int i = 0; i < 4; i++)\n\ + 
{\n\ + sum_sqr += lcl_sum[i];\n\ + }\n\ + mean_vari.xy = sum_sqr * group_ratio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + coord.x = 0;\n\ + write_imagef(output, coord, mean_vari);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8toU8(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ +{\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ + vxc_uchar16 src0, src2;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + short zp = inputZP;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ + float alpha = scale_inOut * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP;\n\ +\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4);\n\ + norm = tmpData0 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData1 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + norm = tmpData2 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData3 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8toU8_2D(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int2 coord = (int2)(get_global_id(0), gidz);\n\ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ + vxc_uchar16 src0, src2;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + 
VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + short zp = inputZP;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ + float alpha = scale_inOut * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP;\n\ +\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4);\n\ + norm = tmpData0 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData1 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + norm = tmpData2 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData3 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +}"; /* end of group_normalization_u8_vx*/ + +static const char group_normalization_u8_f16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int width;\n\ +_viv_uniform int height;\n\ +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4;\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform int inputZP;\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8toF16(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ +{\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ + vxc_uchar16 src0;\n\ + vxc_short8 src1, outval;\n\ + vxc_half8 scale_h, dst;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
UniFP16toFP32Lo4_dp4x4);\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + short zp = inputZP;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ + half4 tmpVal0, tmpVal1;\n\ + float alpha = input_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ +\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4);\n\ + norm = alpha * tmpData0 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData1 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord.x += 8;\n\ + norm = alpha * tmpData2 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData3 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8toF16_2D(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int2 coord = (int2)(get_global_id(0), gidz);\n\ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ + vxc_uchar16 src0;\n\ + vxc_short8 src1, outval;\n\ + vxc_half8 scale_h, dst;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + short zp = inputZP;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ + half4 tmpVal0, tmpVal1;\n\ + float alpha = input_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ +\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4);\n\ + norm = alpha * tmpData0 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData1 + bias_val;\n\ + _viv_asm(CONV, 
tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.x += 8;\n\ + norm = alpha * tmpData2 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData3 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of group_normalization_u8_f16_vx*/ + static const char grucell_activation_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ #define logE (1.44269502f)\n\ @@ -5930,10 +8481,7 @@ _viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2;\n\ _viv_uniform VXC_512Bits uniConvertEndInt16Fp32_4x4;\n\ \n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_F16(\n\ - image2d_array_t input,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int rsFlg)\n\ + image2d_array_t input, image2d_array_t output, float eps, int rsFlg)\n\ {\n\ int gidx = get_global_id(0) << 3;\n\ int lidx = get_local_id(0);\n\ @@ -5946,13 +8494,18 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean \n\ __local float lcl_sum[16];\n\ __local float lcl_sqr[16];\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ \n\ if(gidx < width)\n\ {\n\ for(coord.y = 0; coord.y < height;)\n\ {\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ coord.y++;\n\ _viv_asm(COPY, in_h, src0, 16);\n\ VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ @@ -5988,10 +8541,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean }\n\ \n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_F16_2D(\n\ - image2d_array_t input,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int rsFlg)\n\ + image2d_array_t input, image2d_array_t output, float eps, int rsFlg)\n\ {\n\ int gidx = get_global_id(0) << 3;\n\ int lidx = get_local_id(0);\n\ @@ -6049,13 +8599,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean }\n\ \n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16toF16(\n\ - image2d_array_t input,\n\ - image2d_array_t bias,\n\ - image2d_array_t scale,\n\ - image2d_array_t meanVari,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int rsFlg)\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int rsFlg)\n\ {\n\ int gidz = get_global_id(1);\n\ int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ @@ -6072,12 +8617,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16t VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ bias_f = read_imagef(bias, coord_para);\n\ \n\ - coord_para.x = 0;\n\ - coord_para.y = gidz;\n\ for(int i = 0; i < group_num; i++)\n\ {\n\ - mean_vari += 
read_imagef(meanVari, coord_para);\n\ - coord_para.x += 4;\n\ + mean_vari += read_imagef(meanVari, coord_para.yx);\n\ + coord_para.y += 4;\n\ }\n\ mean_vari *= dimRatio;\n\ mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ @@ -6089,11 +8632,20 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16t half4 tmpVal0, tmpVal1;\n\ bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ vxc_half8 dst;\n\ +\n\ + int8 input_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord.w, baseAddr);\n\ \n\ for(coord.y = 0; coord.y < height; coord.y++)\n\ {\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ _viv_asm(COPY, in_h, src0, 16);\n\ \n\ VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ @@ -6109,18 +8661,14 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16t VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ uniConvertHalfToFp16_2x8);\n\ _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, outval, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ }\n\ }\n\ \n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16toF16_2D(\n\ - image2d_array_t input,\n\ - image2d_array_t bias,\n\ - image2d_array_t scale,\n\ - image2d_array_t meanVari,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int rsFlg)\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int rsFlg)\n\ {\n\ int gidz = get_global_id(1);\n\ int gidy = gidz * height;\n\ @@ -6139,12 +8687,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16t VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ bias_f = read_imagef(bias, coord_para);\n\ \n\ - coord_para.x = 0;\n\ - coord_para.y = gidz;\n\ for(int i = 0; i < group_num; i++)\n\ {\n\ - mean_vari += read_imagef(meanVari, coord_para);\n\ - coord_para.x += 4;\n\ + mean_vari += read_imagef(meanVari, coord_para.yx);\n\ + coord_para.y += 4;\n\ }\n\ mean_vari *= dimRatio;\n\ mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ @@ -6216,12 +8762,16 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean __local float lcl_sum[16];\n\ __local float lcl_sqr[16];\n\ \n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ if(gidx < width)\n\ {\n\ for(coord.y = 0; coord.y < height;)\n\ {\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ coord.y++;\n\ 
VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ uniInt16SumSqr_dp8x2);\n\ @@ -6325,7 +8875,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t image2d_array_t input,\n\ image2d_array_t bias,\n\ image2d_array_t scale,\n\ - image2d_array_t meanVari,\n\ + image2d_t meanVari,\n\ image2d_array_t output,\n\ float eps,\n\ int rsFlg)\n\ @@ -6347,12 +8897,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t \n\ bias_f = read_imagef(bias, coord_para);\n\ \n\ - coord_para.x = 0;\n\ - coord_para.y = gidz;\n\ for(int i = 0; i < group_num; i++)\n\ {\n\ - mean_vari += read_imagef(meanVari, coord_para);\n\ - coord_para.x += 4;\n\ + mean_vari += read_imagef(meanVari, coord_para.yx);\n\ + coord_para.y += 4;\n\ }\n\ mean_vari *= dimRatio;\n\ mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ @@ -6365,12 +8913,20 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t float alpha = input_fl_scale * scale_vari;\n\ bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ vxc_half8 dst;\n\ +\n\ + int8 input_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord.w, baseAddr);\n\ \n\ for(coord.y = 0; coord.y < height; coord.y++)\n\ {\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ uniConvertInt16Fp32Fst_4x4);\n\ VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ @@ -6384,7 +8940,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ uniConvertHalfToFp16_2x8);\n\ _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, outval, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ }\n\ }\n\ \n\ @@ -6392,7 +8949,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t image2d_array_t input,\n\ image2d_array_t bias,\n\ image2d_array_t scale,\n\ - image2d_array_t meanVari,\n\ + image2d_t meanVari,\n\ image2d_array_t output,\n\ float eps,\n\ int rsFlg)\n\ @@ -6416,12 +8973,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t \n\ bias_f = read_imagef(bias, coord_para);\n\ \n\ - coord_para.x = 0;\n\ - coord_para.y = gidz;\n\ for(int i = 0; i < group_num; i++)\n\ {\n\ - mean_vari += read_imagef(meanVari, coord_para);\n\ - coord_para.x += 4;\n\ + mean_vari += read_imagef(meanVari, coord_para.yx);\n\ + coord_para.y += 4;\n\ }\n\ mean_vari *= dimRatio;\n\ mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ @@ -6460,7 +9015,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t image2d_array_t input,\n\ image2d_array_t bias,\n\ image2d_array_t scale,\n\ - image2d_array_t meanVari,\n\ + image2d_t meanVari,\n\ 
image2d_array_t output,\n\ float eps,\n\ int rsFlg)\n\ @@ -6480,12 +9035,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ \n\ bias_f = read_imagef(bias, coord_para);\n\ - coord_para.x = 0;\n\ - coord_para.y = gidz;\n\ for(int i = 0; i < group_num; i++)\n\ {\n\ - mean_vari += read_imagef(meanVari, coord_para);\n\ - coord_para.x += 4;\n\ + mean_vari += read_imagef(meanVari, coord_para.yx);\n\ + coord_para.y += 4;\n\ }\n\ mean_vari *= dimRatio;\n\ mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ @@ -6497,10 +9050,18 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t float alpha = inOut_fl_scale * scale_vari;\n\ bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale;\n\ \n\ + int8 input_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord.w, baseAddr);\n\ for(coord.y = 0; coord.y < height; coord.y++)\n\ {\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ uniConvertInt16Fp32Fst_4x4);\n\ VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ @@ -6512,7 +9073,8 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t tmpVal1 = convert_int4_rte(norm);\n\ VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ uniConvertInt32toInt16_2x8);\n\ - VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, src2, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ }\n\ }\n\ \n\ @@ -6520,7 +9082,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t image2d_array_t input,\n\ image2d_array_t bias,\n\ image2d_array_t scale,\n\ - image2d_array_t meanVari,\n\ + image2d_t meanVari,\n\ image2d_array_t output,\n\ float eps,\n\ int rsFlg)\n\ @@ -6542,12 +9104,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16t VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ \n\ bias_f = read_imagef(bias, coord_para);\n\ - coord_para.x = 0;\n\ - coord_para.y = gidz;\n\ for(int i = 0; i < group_num; i++)\n\ {\n\ - mean_vari += read_imagef(meanVari, coord_para);\n\ - coord_para.x += 4;\n\ + mean_vari += read_imagef(meanVari, coord_para.yx);\n\ + coord_para.y += 4;\n\ }\n\ mean_vari *= dimRatio;\n\ mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ @@ -6602,10 +9162,7 @@ _viv_uniform float inOut_fl_scale;\n\ _viv_uniform float output_fl_scale;\n\ \n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_I8(\n\ - image2d_array_t input,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int rsFlg)\n\ + image2d_array_t input, image2d_array_t output, float eps, int rsFlg)\n\ {\n\ int gidx = 
get_global_id(0) << 4;\n\ int lidx = get_local_id(0);\n\ @@ -6613,18 +9170,22 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean int4 coord = (int4)(gidx, 0, gidz, 0);\n\ vxc_char16 src0;\n\ float sum = 0, sqr = 0;\n\ - int tmpSum = 0, tmpSqr = 0;\n\ - int tmpSum1, tmpSqr1;\n\ + int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1;\n\ \n\ __local float lcl_sum[16];\n\ __local float lcl_sqr[16];\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ \n\ if(gidx < width)\n\ {\n\ for(coord.y = 0; coord.y < height;)\n\ {\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ coord.y++;\n\ VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumInt8_16x1);\n\ tmpSum += (tmpSum1);\n\ @@ -6634,7 +9195,6 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean sqr = tmpSqr * inFlScale_s2;\n\ sum = tmpSum * input_fl_scale;\n\ }\n\ -\n\ lcl_sum[lidx] = sum;\n\ lcl_sqr[lidx] = sqr;\n\ barrier(CLK_LOCAL_MEM_FENCE);\n\ @@ -6649,8 +9209,6 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean sum = 0; sqr = 0;\n\ for(int i = 0; i < 4; i++)\n\ {\n\ - //sum += lcl_sum[i];\n\ - //sqr += lcl_sqr[i];\n\ sum += dot(tmp_sum[i], one);\n\ sqr += dot(tmp_sqr[i], one);\n\ }\n\ @@ -6661,10 +9219,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean }\n\ \n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_I8_2D(\n\ - image2d_array_t input,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int rsFlg)\n\ + image2d_array_t input, image2d_array_t output, float eps, int rsFlg)\n\ {\n\ int gidx = get_global_id(0) << 4;\n\ int lidx = get_local_id(0);\n\ @@ -6674,8 +9229,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean int2 coord = (int2)(gidx, gidy);\n\ vxc_char16 src0;\n\ float sum = 0, sqr = 0;\n\ - int tmpSum = 0, tmpSqr = 0;\n\ - int tmpSum1, tmpSqr1;\n\ + int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1;\n\ \n\ __local float lcl_sum[16];\n\ __local float lcl_sqr[16];\n\ @@ -6683,7 +9237,6 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean int endH = gidy + height;\n\ if(gidx < width)\n\ {\n\ - tmpSqr = 0;\n\ for(; coord.y < endH;)\n\ {\n\ VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ @@ -6712,8 +9265,6 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean sum = 0; sqr = 0;\n\ for(int i = 0; i < 4; i++)\n\ {\n\ - //sum += lcl_sum[i];\n\ - //sqr += lcl_sqr[i];\n\ sum += dot(tmp_sum[i], one);\n\ sqr += dot(tmp_sqr[i], one);\n\ }\n\ @@ -6724,94 +9275,81 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean }\n\ \n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8toF16(\n\ - image2d_array_t input,\n\ - image2d_array_t bias,\n\ - image2d_array_t scale,\n\ - image2d_array_t meanVari,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int rsFlg)\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int rsFlg)\n\ {\n\ int gidz = get_global_id(1);\n\ int4 coord = 
(int4)(get_global_id(0), 0, gidz, 0);\n\ int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ vxc_char16 src0;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h;\n\ + vxc_short8 src1, outval;\n\ + vxc_half8 scale_h, dst;\n\ float scale_vari, bias_val;\n\ vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ \n\ VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ -\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ bias_f = read_imagef(bias, coord_para);\n\ \n\ - coord_para.x = 0;\n\ - coord_para.y = gidz;\n\ for(int i = 0; i < group_num; i++)\n\ {\n\ - mean_vari += read_imagef(meanVari, coord_para);\n\ - coord_para.x += 4;\n\ + mean_vari += read_imagef(meanVari, coord_para.yx);\n\ + coord_para.y += 4;\n\ }\n\ mean_vari *= dimRatio;\n\ mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ mean_vari.s1 = rsqrt(mean_vari.s1);\n\ \n\ scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ - vxc_short8 outval;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ half4 tmpVal0, tmpVal1;\n\ float alpha = input_fl_scale * scale_vari;\n\ bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ - vxc_half8 dst;\n\ +\n\ + coord_para = coord;\n\ + int8 input_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_para.z, baseAddr);\n\ \n\ for(coord.y = 0; coord.y < height;)\n\ {\n\ - coord_para = coord;\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_para.xy = coord.xy;\n\ coord.y++;\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertDirInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertEndInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertTrdInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertFthInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4);\n\ \n\ - vxc_float4 norm;\n\ norm = alpha * tmpData0 + bias_val;\n\ _viv_asm(CONV, tmpVal0, norm);\n\ norm = alpha * tmpData1 + bias_val;\n\ _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertHalfToFp16_2x8);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), 
uniConvertHalfToFp16_2x8);\n\ _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage2DArray(output, coord_para, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_para, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ coord_para.x += 8;\n\ norm = alpha * tmpData2 + bias_val;\n\ _viv_asm(CONV, tmpVal0, norm);\n\ norm = alpha * tmpData3 + bias_val;\n\ _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertHalfToFp16_2x8);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage2DArray(output, coord_para, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_para, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ }\n\ }\n\ \n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8toF16_2D(\n\ - image2d_array_t input,\n\ - image2d_array_t bias,\n\ - image2d_array_t scale,\n\ - image2d_array_t meanVari,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int rsFlg)\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int rsFlg)\n\ {\n\ int gidz = get_global_id(1);\n\ int gidy = gidz * height;\n\ @@ -6819,59 +9357,48 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8to int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ int endH = gidy + height;\n\ vxc_char16 src0;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h;\n\ + vxc_short8 src1, outval;\n\ + vxc_half8 scale_h, dst;\n\ float scale_vari, bias_val;\n\ vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ \n\ VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ \n\ bias_f = read_imagef(bias, coord_para);\n\ \n\ - coord_para.x = 0;\n\ - coord_para.y = gidz;\n\ for(int i = 0; i < group_num; i++)\n\ {\n\ - mean_vari += read_imagef(meanVari, coord_para);\n\ - coord_para.x += 4;\n\ + mean_vari += read_imagef(meanVari, coord_para.yx);\n\ + coord_para.y += 4;\n\ }\n\ mean_vari *= dimRatio;\n\ mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ mean_vari.s1 = rsqrt(mean_vari.s1);\n\ \n\ scale_vari = scale_f.s0 * mean_vari.s1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ - vxc_short8 outval;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ half4 tmpVal0, tmpVal1;\n\ float alpha = input_fl_scale * scale_vari;\n\ bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ - vxc_half8 dst;\n\ \n\ for(; coord.y < endH;)\n\ {\n\ - coord_para = coord;\n\ VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_para = coord;\n\ coord.y++;\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertDirInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertEndInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertTrdInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData3, 
src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertFthInt8Fp32_4x4);\n\ - vxc_float4 norm;\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4);\n\ norm = alpha * tmpData0 + bias_val;\n\ _viv_asm(CONV, tmpVal0, norm);\n\ norm = alpha * tmpData1 + bias_val;\n\ _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertHalfToFp16_2x8);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ _viv_asm(COPY, outval, dst, 16);\n\ VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ coord_para.x += 8;\n\ @@ -6879,21 +9406,15 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8to _viv_asm(CONV, tmpVal0, norm);\n\ norm = alpha * tmpData3 + bias_val;\n\ _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertHalfToFp16_2x8);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ _viv_asm(COPY, outval, dst, 16);\n\ VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ }\n\ }\n\ \n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8toI8(\n\ - image2d_array_t input,\n\ - image2d_array_t bias,\n\ - image2d_array_t scale,\n\ - image2d_array_t meanVari,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int rsFlg)\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int rsFlg)\n\ {\n\ int gidz = get_global_id(1);\n\ int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ @@ -6910,12 +9431,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8to VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ \n\ bias_f = read_imagef(bias, coord_para);\n\ - coord_para.x = 0;\n\ - coord_para.y = gidz;\n\ for(int i = 0; i < group_num; i++)\n\ {\n\ - mean_vari += read_imagef(meanVari, coord_para);\n\ - coord_para.x += 4;\n\ + mean_vari += read_imagef(meanVari, coord_para.yx);\n\ + coord_para.y += 4;\n\ }\n\ mean_vari *= dimRatio;\n\ mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ @@ -6923,47 +9442,44 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8to \n\ scale_vari = scale_f.s0 * mean_vari.s1;\n\ vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ float alpha = inOut_fl_scale * scale_vari;\n\ bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale;\n\ +\n\ + int8 input_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(1) * output_desc.s4 + 
output_desc.s0;\n\ + _viv_asm(MOV, coord.w, baseAddr);\n\ \n\ for(coord.y = 0; coord.y < height; coord.y++)\n\ {\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertDirInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertEndInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertTrdInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertFthInt8Fp32_4x4);\n\ - vxc_float4 norm;\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4);\n\ norm = tmpData0 * alpha + bias_val;\n\ tmpVal0 = convert_int4_rte(norm);\n\ norm = tmpData1 * alpha + bias_val;\n\ tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ norm = tmpData2 * alpha + bias_val;\n\ tmpVal0 = convert_int4_rte(norm);\n\ norm = tmpData3 * alpha + bias_val;\n\ tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ }\n\ }\n\ \n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8toI8_2D(\n\ - image2d_array_t input,\n\ - image2d_array_t bias,\n\ - image2d_array_t scale,\n\ - image2d_array_t meanVari,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int rsFlg)\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int rsFlg)\n\ {\n\ int gidz = get_global_id(1);\n\ int gidy = gidz * height;\n\ @@ -6982,12 +9498,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8to VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ \n\ bias_f = read_imagef(bias, coord_para);\n\ - coord_para.x = 0;\n\ - coord_para.y = gidz;\n\ for(int i = 0; i < group_num; i++)\n\ {\n\ - mean_vari += read_imagef(meanVari, coord_para);\n\ - coord_para.x += 4;\n\ + mean_vari += read_imagef(meanVari, coord_para.yx);\n\ + coord_para.y += 4;\n\ }\n\ mean_vari *= dimRatio;\n\ mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ @@ -6995,39 +9509,715 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I8to \n\ scale_vari = scale_f.s0 * mean_vari.s1;\n\ vxc_int4 tmpVal0, 
tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ float alpha = inOut_fl_scale * scale_vari;\n\ bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale;\n\ \n\ for(; coord.y < endH; coord.y++)\n\ {\n\ - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4);\n\ + norm = tmpData0 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData1 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + norm = tmpData2 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData3 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +"; /* end of instance_normalization_i8_vx*/ + +static const char instance_normalization_scale_f32_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int height;\n\ +_viv_uniform float dimRatio;\n\ +_viv_uniform int group_num;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4;\n\ +_viv_uniform int inputZP;\n\ +_viv_uniform float scale_inOut;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform int output_ZP;\n\ +_viv_uniform float inOut_fl_scale;\n\ +_viv_uniform float output_fl_scale;\n\ +_viv_uniform VXC_512Bits uniConvertInt16Fp32Fst_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt16Fp32Secd_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toInt16_2x8;\n\ +\n\ +#define INSTANCENORM_8BITS_F32(src1_type_name, read_type) \\\n\ +__kernel void instance_norm_##src1_type_name##F32to##src1_type_name( \\\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, \\\n\ + image2d_array_t output, float eps, int rsFlg) \\\n\ +{ \\\n\ + int gidz = get_global_id(1); \\\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, 0); \\\n\ + int2 coord_para = (int2)(gidz, 0); \\\n\ + read_type src0, src2; \\\n\ + float scale_vari, bias_val; \\\n\ + vxc_float4 mean_vari = (vxc_float4)(0); \\\n\ + \\\n\ + Image img1 = create_image_from_image2d(bias, 4); \\\n\ + Image img2 = create_image_from_image2d(scale, 4); \\\n\ + Image img3 = create_image_from_image2d(meanVari, 4); \\\n\ + __global float* bias_ptr = (__global float*)img1.ptr; \\\n\ + __global float* scal_ptr = (__global float*)img2.ptr; \\\n\ + __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx); \\\n\ + __global float4* vari_ptr = (__global 
float4*)sumVari_ptr; \\\n\ + \\\n\ + float bval = bias_ptr[gidz]; \\\n\ + float sval = scal_ptr[gidz]; \\\n\ + \\\n\ + for(int i = 0; i < group_num; i++) \\\n\ + { \\\n\ + mean_vari += vari_ptr[i]; \\\n\ + } \\\n\ + mean_vari *= dimRatio; \\\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \\\n\ + mean_vari.s1 = rsqrt(mean_vari.s1); \\\n\ + \\\n\ + scale_vari = sval * mean_vari.s1; \\\n\ + short zp = inputZP; \\\n\ + vxc_int4 tmpVal0, tmpVal1; \\\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \\\n\ + float alpha = scale_inOut * scale_vari; \\\n\ + bias_val = (bval - scale_vari * mean_vari.s0) * outputScale + output_ZP; \\\n\ + \\\n\ + int8 input_desc, output_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord.z, baseAddr_a); \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord.w, baseAddr); \\\n\ + \\\n\ + for(coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvert1stUint8SubZpToFp32_4x4); \\\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvert2ndUint8SubZpToFp32_4x4); \\\n\ + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvert3rdUint8SubZpToFp32_4x4); \\\n\ + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvert4thUint8SubZpToFp32_4x4); \\\n\ + norm = tmpData0 * alpha + bias_val; \\\n\ + tmpVal0 = convert_int4_rte(norm); \\\n\ + norm = tmpData1 * alpha + bias_val; \\\n\ + tmpVal1 = convert_int4_rte(norm); \\\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \\\n\ + norm = tmpData2 * alpha + bias_val; \\\n\ + tmpVal0 = convert_int4_rte(norm); \\\n\ + norm = tmpData3 * alpha + bias_val; \\\n\ + tmpVal1 = convert_int4_rte(norm); \\\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +INSTANCENORM_8BITS_F32(U8, vxc_uchar16)\n\ +INSTANCENORM_8BITS_F32(I8, vxc_char16)\n\ +\n\ +#define INSTANCENORM_8BITS_F32_2D(src1_type_name, read_type) \\\n\ +__kernel void instance_norm_##src1_type_name##F32to##src1_type_name##_2D( \\\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, \\\n\ + image2d_array_t output, float eps, int rsFlg) \\\n\ +{ \\\n\ + int gidz = get_global_id(1); \\\n\ + int gidy = gidz * height; \\\n\ + int2 coord = (int2)(get_global_id(0), gidy); \\\n\ + int2 coord_para = (int2)(gidz, 0); \\\n\ + int endH = gidy + height; \\\n\ + read_type src0, src2; \\\n\ + float scale_vari, bias_val; \\\n\ + vxc_float4 mean_vari = (vxc_float4)(0); \\\n\ + \\\n\ + Image img1 = create_image_from_image2d(bias, 4); \\\n\ + Image img2 = create_image_from_image2d(scale, 4); \\\n\ + Image img3 = create_image_from_image2d(meanVari, 4); \\\n\ + __global float* bias_ptr = (__global float*)img1.ptr; \\\n\ + __global float* scal_ptr = (__global float*)img2.ptr; \\\n\ + __global uchar* sumVari_ptr = 
(__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx); \\\n\ + __global float4* vari_ptr = (__global float4*)sumVari_ptr; \\\n\ + \\\n\ + float bval = bias_ptr[gidz]; \\\n\ + float sval = scal_ptr[gidz]; \\\n\ + \\\n\ + for(int i = 0; i < group_num; i++) \\\n\ + { \\\n\ + mean_vari += vari_ptr[i]; \\\n\ + } \\\n\ + \\\n\ + mean_vari *= dimRatio; \\\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \\\n\ + mean_vari.s1 = rsqrt(mean_vari.s1); \\\n\ + \\\n\ + scale_vari = sval * mean_vari.s1; \\\n\ + short zp = inputZP; \\\n\ + vxc_int4 tmpVal0, tmpVal1; \\\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \\\n\ + float alpha = scale_inOut * scale_vari; \\\n\ + bias_val = (bval - scale_vari * mean_vari.s0) * outputScale + output_ZP; \\\n\ + \\\n\ + for(; coord.y < endH; coord.y++) \\\n\ + { \\\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvert1stUint8SubZpToFp32_4x4); \\\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvert2ndUint8SubZpToFp32_4x4); \\\n\ + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvert3rdUint8SubZpToFp32_4x4); \\\n\ + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvert4thUint8SubZpToFp32_4x4); \\\n\ + norm = tmpData0 * alpha + bias_val; \\\n\ + tmpVal0 = convert_int4_rte(norm); \\\n\ + norm = tmpData1 * alpha + bias_val; \\\n\ + tmpVal1 = convert_int4_rte(norm); \\\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \\\n\ + norm = tmpData2 * alpha + bias_val; \\\n\ + tmpVal0 = convert_int4_rte(norm); \\\n\ + norm = tmpData3 * alpha + bias_val; \\\n\ + tmpVal1 = convert_int4_rte(norm); \\\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \\\n\ + VXC_WriteImage(output, coord, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +INSTANCENORM_8BITS_F32_2D(U8, vxc_uchar16)\n\ +INSTANCENORM_8BITS_F32_2D(I8, vxc_char16)\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16F32toI16(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int rsFlg)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ + int2 coord_para = (int2)(gidz, 0);\n\ + vxc_short8 src0, src2;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 mean_vari = (vxc_float4)(0);\n\ +\n\ + Image img1 = create_image_from_image2d(bias, 4);\n\ + Image img2 = create_image_from_image2d(scale, 4);\n\ + Image img3 = create_image_from_image2d(meanVari, 4);\n\ + __global float* bias_ptr = (__global float*)img1.ptr;\n\ + __global float* scal_ptr = (__global float*)img2.ptr;\n\ + __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx);\n\ + __global float4* vari_ptr = (__global float4*)sumVari_ptr;\n\ +\n\ + float bval = bias_ptr[gidz];\n\ + float sval = scal_ptr[gidz];\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += vari_ptr[i];\n\ + }\n\ +\n\ + mean_vari *= dimRatio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + scale_vari = sval * mean_vari.s1;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ + 
vxc_float4 tmpData0, tmpData1;\n\ + float alpha = inOut_fl_scale * scale_vari;\n\ + bias_val = (bval - scale_vari * mean_vari.s0) * output_fl_scale;\n\ +\n\ + int8 input_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord.w, baseAddr);\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertDirInt8Fp32_4x4);\n\ + uniConvertInt16Fp32Fst_4x4);\n\ VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertEndInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertTrdInt8Fp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertFthInt8Fp32_4x4);\n\ + uniConvertInt16Fp32Secd_4x4);\n\ vxc_float4 norm;\n\ norm = tmpData0 * alpha + bias_val;\n\ tmpVal0 = convert_int4_rte(norm);\n\ norm = tmpData1 * alpha + bias_val;\n\ tmpVal1 = convert_int4_rte(norm);\n\ VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - norm = tmpData2 * alpha + bias_val;\n\ - tmpVal0 = convert_int4_rte(norm);\n\ - norm = tmpData3 * alpha + bias_val;\n\ - tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage(output, coord, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + uniConvertInt32toInt16_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, src2, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ }\n\ }\n\ -"; /* end of instance_normalization_i8_vx*/ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_I16F32toI16_2D(\n\ + image2d_t input, image2d_t bias, image2d_t scale,\n\ + image2d_t meanVari, image2d_array_t output, float eps, int rsFlg)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int gidy = gidz * height;\n\ + int2 coord = (int2)(get_global_id(0), gidy);\n\ + int2 coord_para = (int2)(gidz, 0);\n\ + int endH = gidy + height;\n\ + vxc_short8 src0, src2;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 mean_vari = (vxc_float4)(0);\n\ +\n\ + Image img1 = create_image_from_image2d(bias, 4);\n\ + Image img2 = create_image_from_image2d(scale, 4);\n\ + Image img3 = create_image_from_image2d(meanVari, 4);\n\ + __global float* bias_ptr = (__global float*)img1.ptr;\n\ + __global float* scal_ptr = (__global float*)img2.ptr;\n\ + __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx);\n\ + __global float4* vari_ptr = (__global float4*)sumVari_ptr;\n\ +\n\ + float bval = bias_ptr[gidz];\n\ + float sval = scal_ptr[gidz];\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += vari_ptr[i];\n\ + }\n\ +\n\ + mean_vari *= dimRatio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + scale_vari = sval * mean_vari.s1;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + float alpha = 
inOut_fl_scale * scale_vari;\n\ + bias_val = (bval - scale_vari * mean_vari.s0) * output_fl_scale;\n\ +\n\ + for(; coord.y < endH; coord.y++)\n\ + {\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt16Fp32Fst_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt16Fp32Secd_4x4);\n\ + vxc_float4 norm;\n\ + norm = tmpData0 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData1 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toInt16_2x8);\n\ + VXC_WriteImage(output, coord, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}"; /* end of instance_normalization_scale_f32_vx*/ + +static const char instance_normalization_scale_f32_bf16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int width;\n\ +_viv_uniform int height;\n\ +_viv_uniform float dimRatio;\n\ +_viv_uniform int group_num;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +\n\ +constant vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ +constant float4 one = (float4)(1.0, 1.0, 1.0, 1.0);\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_BF16(\n\ + image2d_array_t input, image2d_array_t output, float eps, int rsFlg)\n\ +{\n\ + int gidx = get_global_id(0) << 3;\n\ + int lidx = get_local_id(0);\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(gidx, 0, gidz, 0);\n\ + vxc_short8 src0, src1, src2;\n\ + float4 srcA, srcB;\n\ + vxc_float sum = 0, sqr = 0;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ +\n\ + if(gidx < width)\n\ + {\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + VXC_DP2x8(src2, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, srcA, src1, 16);\n\ + _viv_asm(COPY, srcB, src2, 16);\n\ + sum += dot(srcA, one) + dot(srcB, one);\n\ + sqr += dot(srcA * srcA, one) + dot(srcB * srcB, one);\n\ + }\n\ + }\n\ +\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + sum = 0;\n\ + sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 data = (float4)(sum, sqr, 0, 0);\n\ + write_imagef(output, coord_out, data);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_BF16_2D(\n\ + image2d_array_t input, image2d_array_t output, float eps, int rsFlg)\n\ +{\n\ + int 
gidx = get_global_id(0) << 3;\n\ + int lidx = get_local_id(0);\n\ + int gidz = get_global_id(1);\n\ + int gidy = gidz * height;\n\ +\n\ + int2 coord = (int2)(gidx, gidy);\n\ + vxc_short8 src0, src1, src2;\n\ + float4 srcA, srcB;\n\ + vxc_float sum = 0, sqr = 0;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + int endH = gidy + height;\n\ + if(gidx < width)\n\ + {\n\ + for(; coord.y < endH;)\n\ + {\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + VXC_DP2x8(src2, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, srcA, src1, 16);\n\ + _viv_asm(COPY, srcB, src2, 16);\n\ + sum += dot(srcA, one) + dot(srcB, one);\n\ + sqr += dot(srcA * srcA, one) + dot(srcB * srcB, one);\n\ + }\n\ + }\n\ +\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + sum = 0;\n\ + sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 data = (float4)(sum, sqr, 0, 0);\n\ + write_imagef(output, coord_out, data);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16F32toBF16(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int rsFlg)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ + vxc_short8 src0, src1, src2;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 mean_vari = (vxc_float4)(0);\n\ +\n\ + Image img1 = create_image_from_image2d(bias, 4);\n\ + Image img2 = create_image_from_image2d(scale, 4);\n\ + Image img3 = create_image_from_image2d(meanVari, 4);\n\ + __global float* bias_ptr = (__global float*)img1.ptr;\n\ + __global float* scal_ptr = (__global float*)img2.ptr;\n\ + __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord.wz);\n\ + __global float4* vari_ptr = (__global float4*)sumVari_ptr;\n\ +\n\ + float bval = bias_ptr[gidz];\n\ + float sval = scal_ptr[gidz];\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += vari_ptr[i];\n\ + }\n\ +\n\ + mean_vari *= dimRatio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + scale_vari = sval * mean_vari.s1;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + bias_val = (bval - scale_vari * mean_vari.s0);\n\ +\n\ + int8 input_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord.w, baseAddr);\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + 
uniConvBF16toF32_Part0_2x8);\n\ + VXC_DP2x8(src2, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, tmpData0, src1, 16);\n\ + _viv_asm(COPY, tmpData1, src2, 16);\n\ +\n\ + vxc_float4 norm;\n\ + norm = scale_vari * tmpData0 + bias_val;\n\ + _viv_asm(COPY, src0, norm, 16);\n\ + norm = scale_vari * tmpData1 + bias_val;\n\ + _viv_asm(COPY, src1, norm, 16);\n\ + VXC_DP2x8(src2, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, src2, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16F32toBF16_2D(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int rsFlg)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int gidy = gidz * height;\n\ + int2 coord = (int2)(get_global_id(0), gidy);\n\ + int2 coord_para = (int2)(gidz, 0);\n\ + int endH = gidy + height;\n\ + vxc_short8 src0, src1, src2;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 mean_vari = (vxc_float4)(0);\n\ +\n\ + Image img1 = create_image_from_image2d(bias, 4);\n\ + Image img2 = create_image_from_image2d(scale, 4);\n\ + Image img3 = create_image_from_image2d(meanVari, 4);\n\ + __global float* bias_ptr = (__global float*)img1.ptr;\n\ + __global float* scal_ptr = (__global float*)img2.ptr;\n\ + __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx);\n\ + __global float4* vari_ptr = (__global float4*)sumVari_ptr;\n\ +\n\ + float bval = bias_ptr[gidz];\n\ + float sval = scal_ptr[gidz];\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += vari_ptr[i];\n\ + }\n\ +\n\ + mean_vari *= dimRatio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + scale_vari = sval * mean_vari.s1;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + bias_val = (bval - scale_vari * mean_vari.s0);\n\ +\n\ + for(; coord.y < endH; coord.y++)\n\ + {\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + VXC_DP2x8(src2, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, tmpData0, src1, 16);\n\ + _viv_asm(COPY, tmpData1, src2, 16);\n\ +\n\ + vxc_float4 norm;\n\ + norm = scale_vari * tmpData0 + bias_val;\n\ + _viv_asm(COPY, src0, norm, 16);\n\ + norm = scale_vari * tmpData1 + bias_val;\n\ + _viv_asm(COPY, src1, norm, 16);\n\ + VXC_DP2x8(src2, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}"; /* end of instance_normalization_scale_f32_bf16_vx*/ + +static const char instance_normalization_scale_f32_f16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int height;\n\ +_viv_uniform float dimRatio;\n\ +_viv_uniform int group_num;\n\ +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertEndInt16Fp32_4x4;\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16F32toF16(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ + 
image2d_array_t output, float eps, int rsFlg)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ + vxc_short8 src0;\n\ + vxc_half8 in_h;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 mean_vari = (vxc_float4)(0);\n\ +\n\ + Image img1 = create_image_from_image2d(bias, 4);\n\ + Image img2 = create_image_from_image2d(scale, 4);\n\ + Image img3 = create_image_from_image2d(meanVari, 4);\n\ + __global float* bias_ptr = (__global float*)img1.ptr;\n\ + __global float* scal_ptr = (__global float*)img2.ptr;\n\ + __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord.wz);\n\ + __global float4* vari_ptr = (__global float4*)sumVari_ptr;\n\ +\n\ + float bval = bias_ptr[gidz];\n\ + float sval = scal_ptr[gidz];\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += vari_ptr[i];\n\ + }\n\ +\n\ + mean_vari *= dimRatio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + scale_vari = sval * mean_vari.s1;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + vxc_short8 outval;\n\ + half4 tmpVal0, tmpVal1;\n\ + bias_val = (bval - scale_vari * mean_vari.s0);\n\ + vxc_half8 dst;\n\ +\n\ + int8 input_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord.w, baseAddr);\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ +\n\ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertEndInt16Fp32_4x4);\n\ +\n\ + vxc_float4 norm;\n\ + norm = scale_vari * tmpData0 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = scale_vari * tmpData1 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, outval, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_F16F32toF16_2D(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int rsFlg)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int gidy = gidz * height;\n\ + int2 coord = (int2)(get_global_id(0), gidy);\n\ + int2 coord_para = (int2)(gidz, 0);\n\ + int endH = gidy + height;\n\ + vxc_short8 src0;\n\ + vxc_half8 in_h;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 mean_vari = (vxc_float4)(0);\n\ +\n\ + Image img1 = create_image_from_image2d(bias, 4);\n\ + Image img2 = create_image_from_image2d(scale, 4);\n\ + Image img3 = create_image_from_image2d(meanVari, 4);\n\ + __global float* bias_ptr = (__global float*)img1.ptr;\n\ + __global float* scal_ptr = (__global float*)img2.ptr;\n\ + __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx);\n\ + __global float4* vari_ptr = (__global 
float4*)sumVari_ptr;\n\ +\n\ + float bval = bias_ptr[gidz];\n\ + float sval = scal_ptr[gidz];\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += vari_ptr[i];\n\ + }\n\ +\n\ + mean_vari *= dimRatio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + scale_vari = sval * mean_vari.s1;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + vxc_short8 outval;\n\ + half4 tmpVal0, tmpVal1;\n\ + bias_val = (bval - scale_vari * mean_vari.s0);\n\ + vxc_half8 dst;\n\ +\n\ + for(; coord.y < endH; coord.y++)\n\ + {\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ +\n\ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertEndInt16Fp32_4x4);\n\ + vxc_float4 norm;\n\ + norm = scale_vari * tmpData0 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = scale_vari * tmpData1 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}"; /* end of instance_normalization_scale_f32_f16_vx*/ static const char instance_normalization_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -7036,7 +10226,6 @@ _viv_uniform int height;\n\ _viv_uniform float dimRatio;\n\ _viv_uniform int group_num;\n\ _viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ -_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ _viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ \n\ _viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ @@ -7056,9 +10245,7 @@ _viv_uniform float outputScale;\n\ _viv_uniform int output_ZP;\n\ \n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_U8(\n\ - image2d_array_t input,\n\ - image2d_array_t output,\n\ - float eps, int rsFlg)\n\ + image2d_array_t input, image2d_array_t output, float eps, int rsFlg)\n\ {\n\ int gidx = get_global_id(0) << 4;\n\ int lidx = get_local_id(0);\n\ @@ -7066,17 +10253,20 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean int4 coord = (int4)(gidx, 0, gidz, 0);\n\ vxc_uchar16 src0;\n\ float sum = 0, sqr = 0;\n\ - int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1;\n\ + int tmpSum = 0, tmpSqr = 0, tmpSum1 = 0, tmpSqr1 = 0;\n\ \n\ __local float lcl_sum[16];\n\ __local float lcl_sqr[16];\n\ -\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ if(gidx < width)\n\ {\n\ for(coord.y = 0; coord.y < height;)\n\ {\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ coord.y++;\n\ VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ tmpSum += (tmpSum1);\n\ @@ -7086,7 +10276,6 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean sqr += (tmpSqr * e2InScale + rowSumScale);\n\ sum = (tmpSum + sumInZp) * input_scale;\n\ }\n\ -\n\ 
lcl_sum[lidx] = sum;\n\ lcl_sqr[lidx] = sqr;\n\ barrier(CLK_LOCAL_MEM_FENCE);\n\ @@ -7097,23 +10286,19 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean float4 one = (float4)(1, 1, 1, 1);\n\ __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ -\n\ sum = 0; sqr = 0;\n\ for(int i = 0; i < 4; i++)\n\ {\n\ sum += dot(tmp_sum[i], one);\n\ sqr += dot(tmp_sqr[i], one);\n\ }\n\ -\n\ float4 data = (float4)(sum, sqr, 0, 0);\n\ write_imagef(output, coord_out, data);\n\ }\n\ }\n\ \n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_meanvari_U8_2D(\n\ - image2d_array_t input,\n\ - image2d_array_t output,\n\ - float eps, int rsFlg)\n\ + image2d_array_t input, image2d_array_t output, float eps, int rsFlg)\n\ {\n\ int gidx = get_global_id(0) << 4;\n\ int lidx = get_local_id(0);\n\ @@ -7124,17 +10309,16 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean vxc_uchar16 src0;\n\ float sum = 0, sqr = 0;\n\ int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1;\n\ + int endH = gidy + height;\n\ \n\ __local float lcl_sum[16];\n\ __local float lcl_sqr[16];\n\ -\n\ - int endH = gidy + height;\n\ if(gidx < width)\n\ {\n\ for(; coord.y < endH;)\n\ {\n\ VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ coord.y++;\n\ VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ tmpSum += (tmpSum1);\n\ @@ -7144,7 +10328,6 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean sqr += (tmpSqr * e2InScale + rowSumScale);\n\ sum = (tmpSum + sumInZp) * input_scale;\n\ }\n\ -\n\ lcl_sum[lidx] = sum;\n\ lcl_sqr[lidx] = sqr;\n\ barrier(CLK_LOCAL_MEM_FENCE);\n\ @@ -7155,192 +10338,20 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_mean float4 one = (float4)(1, 1, 1, 1);\n\ __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ -\n\ sum = 0; sqr = 0;\n\ for(int i = 0; i < 4; i++)\n\ {\n\ sum += dot(tmp_sum[i], one);\n\ sqr += dot(tmp_sqr[i], one);\n\ }\n\ -\n\ float4 data = (float4)(sum, sqr, 0, 0);\n\ write_imagef(output, coord_out, data);\n\ }\n\ }\n\ \n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8toF16(\n\ - image2d_array_t input,\n\ - image2d_array_t bias,\n\ - image2d_array_t scale,\n\ - image2d_array_t meanVari,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int rsFlg)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ - int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ - vxc_uchar16 src0;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ -\n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - bias_f = read_imagef(bias, coord_para);\n\ -\n\ - coord_para.x = 0;\n\ - coord_para.y = gidz;\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += read_imagef(meanVari, coord_para);\n\ - coord_para.x += 4;\n\ - }\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - 
mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - short zp = inputZP;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ - vxc_short8 outval;\n\ - half4 tmpVal0, tmpVal1;\n\ - float alpha = input_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ - vxc_half8 dst;\n\ -\n\ - for(coord.y = 0; coord.y < height;)\n\ - {\n\ - coord_para = coord;\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord.y++;\n\ -\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert2ndUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert3rdUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert4thUint8SubZpToFp32_4x4);\n\ - vxc_float4 norm;\n\ - norm = alpha * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage2DArray(output, coord_para, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord_para.x += 8;\n\ - norm = alpha * tmpData2 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData3 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage2DArray(output, coord_para, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8toF16_2D(\n\ - image2d_array_t input,\n\ - image2d_array_t bias,\n\ - image2d_array_t scale,\n\ - image2d_array_t meanVari,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int rsFlg)\n\ -{\n\ - int gidz = get_global_id(1);\n\ - int gidy = gidz * height;\n\ - int4 coord = (int4)(get_global_id(0), gidy, 0, 0);\n\ - int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ - int endH = gidy + height;\n\ - vxc_uchar16 src0;\n\ - vxc_short8 src1;\n\ - vxc_half8 scale_h;\n\ - float scale_vari, bias_val;\n\ - vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ -\n\ - VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ -\n\ - bias_f = read_imagef(bias, coord_para);\n\ -\n\ - coord_para.x = 0;\n\ - coord_para.y = gidz;\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += read_imagef(meanVari, coord_para);\n\ - coord_para.x += 4;\n\ - }\n\ - mean_vari *= dimRatio;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - scale_vari = scale_f.s0 * mean_vari.s1;\n\ - short zp = inputZP;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ - vxc_short8 outval;\n\ - half4 tmpVal0, tmpVal1;\n\ - float alpha = input_scale * scale_vari;\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ - vxc_half8 dst;\n\ -\n\ 
- for(; coord.y < endH;)\n\ - {\n\ - coord_para = coord;\n\ - VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - coord.y++;\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert2ndUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert3rdUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert4thUint8SubZpToFp32_4x4);\n\ - vxc_float4 norm;\n\ - norm = alpha * tmpData0 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData1 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - coord_para.x += 8;\n\ - norm = alpha * tmpData2 + bias_val;\n\ - _viv_asm(CONV, tmpVal0, norm);\n\ - norm = alpha * tmpData3 + bias_val;\n\ - _viv_asm(CONV, tmpVal1, norm);\n\ - VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertHalfToFp16_2x8);\n\ - _viv_asm(COPY, outval, dst, 16);\n\ - VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ -}\n\ -\n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8toU8(\n\ - image2d_array_t input,\n\ - image2d_array_t bias,\n\ - image2d_array_t scale,\n\ - image2d_array_t meanVari,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int rsFlg)\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int rsFlg)\n\ {\n\ int gidz = get_global_id(1);\n\ int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ @@ -7357,12 +10368,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8to VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ \n\ bias_f = read_imagef(bias, coord_para);\n\ - coord_para.x = 0;\n\ - coord_para.y = gidz;\n\ for(int i = 0; i < group_num; i++)\n\ {\n\ - mean_vari += read_imagef(meanVari, coord_para);\n\ - coord_para.x += 4;\n\ + mean_vari += read_imagef(meanVari, coord_para.yx);\n\ + coord_para.y += 4;\n\ }\n\ mean_vari *= dimRatio;\n\ mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ @@ -7371,47 +10380,43 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8to scale_vari = scale_f.s0 * mean_vari.s1;\n\ short zp = inputZP;\n\ vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ float alpha = scale_inOut * scale_vari;\n\ bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP;\n\ \n\ - for(coord.y = 0; coord.y < height;coord.y++)\n\ + int8 input_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord.w, 
baseAddr);\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ {\n\ - VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert2ndUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert3rdUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert4thUint8SubZpToFp32_4x4);\n\ - vxc_float4 norm;\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4);\n\ norm = tmpData0 * alpha + bias_val;\n\ tmpVal0 = convert_int4_rte(norm);\n\ norm = tmpData1 * alpha + bias_val;\n\ tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ norm = tmpData2 * alpha + bias_val;\n\ tmpVal0 = convert_int4_rte(norm);\n\ norm = tmpData3 * alpha + bias_val;\n\ tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ }\n\ }\n\ \n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8toU8_2D(\n\ - image2d_array_t input,\n\ - image2d_array_t bias,\n\ - image2d_array_t scale,\n\ - image2d_array_t meanVari,\n\ - image2d_array_t output,\n\ - float eps,\n\ - int rsFlg)\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int rsFlg)\n\ {\n\ int gidz = get_global_id(1);\n\ int gidy = gidz * height;\n\ @@ -7430,12 +10435,10 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8to VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ \n\ bias_f = read_imagef(bias, coord_para);\n\ - coord_para.x = 0;\n\ - coord_para.y = gidz;\n\ for(int i = 0; i < group_num; i++)\n\ {\n\ - mean_vari += read_imagef(meanVari, coord_para);\n\ - coord_para.x += 4;\n\ + mean_vari += read_imagef(meanVari, coord_para.yx);\n\ + coord_para.y += 4;\n\ }\n\ mean_vari *= dimRatio;\n\ mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ @@ -7444,39 +10447,180 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8to scale_vari = scale_f.s0 * mean_vari.s1;\n\ short zp = inputZP;\n\ 
vxc_int4 tmpVal0, tmpVal1;\n\ - vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ float alpha = scale_inOut * scale_vari;\n\ bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP;\n\ \n\ for(; coord.y < endH; coord.y++)\n\ {\n\ - VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert1stUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert2ndUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert3rdUint8SubZpToFp32_4x4);\n\ - VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvert4thUint8SubZpToFp32_4x4);\n\ - vxc_float4 norm;\n\ + VXC_ReadImage(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4);\n\ norm = tmpData0 * alpha + bias_val;\n\ tmpVal0 = convert_int4_rte(norm);\n\ norm = tmpData1 * alpha + bias_val;\n\ tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ norm = tmpData2 * alpha + bias_val;\n\ tmpVal0 = convert_int4_rte(norm);\n\ norm = tmpData3 * alpha + bias_val;\n\ tmpVal1 = convert_int4_rte(norm);\n\ - VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\\\n\ - uniConvertInt32toUint8_2x8);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ VXC_WriteImage(output, coord, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ }\n\ }"; /* end of instance_normalization_u8_vx*/ +static const char instance_normalization_u8_f16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int width;\n\ +_viv_uniform int height;\n\ +_viv_uniform float dimRatio;\n\ +_viv_uniform int group_num;\n\ +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4;\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform int inputZP;\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8toF16(\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int rsFlg)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, 0);\n\ + int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ + vxc_uchar16 src0;\n\ + vxc_short8 src1, outval;\n\ + vxc_half8 scale_h, dst;\n\ + float 
scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ +\n\ + VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ + bias_f = read_imagef(bias, coord_para);\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += read_imagef(meanVari, coord_para.yx);\n\ + coord_para.y += 4;\n\ + }\n\ + mean_vari *= dimRatio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + short zp = inputZP;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ + half4 tmpVal0, tmpVal1;\n\ + float alpha = input_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ +\n\ + coord_para = coord;\n\ + int8 input_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(1) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(1) * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_para.z, baseAddr);\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_para.xy = coord.xy;\n\ + coord.y++;\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4);\n\ + norm = alpha * tmpData0 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData1 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_para, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord_para.x += 8;\n\ + norm = alpha * tmpData2 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData3 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_para, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_U8toF16_2D(\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int rsFlg)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int gidy = gidz * height;\n\ + int4 coord = (int4)(get_global_id(0), gidy, 0, 0);\n\ + int4 coord_para = (int4)(gidz, 0, 0, 0);\n\ + int endH = gidy + height;\n\ + vxc_uchar16 src0;\n\ + vxc_short8 src1, outval;\n\ + vxc_half8 scale_h, dst;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f, mean_vari = (vxc_float4)(0);\n\ +\n\ + 
VXC_ReadImage(src1, scale, coord_para.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), UniFP16toFP32Lo4_dp4x4);\n\ + bias_f = read_imagef(bias, coord_para);\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += read_imagef(meanVari, coord_para.yx);\n\ + coord_para.y += 4;\n\ + }\n\ + mean_vari *= dimRatio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + short zp = inputZP;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ + half4 tmpVal0, tmpVal1;\n\ + float alpha = input_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ + for(; coord.y < endH;)\n\ + {\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord_para = coord;\n\ + coord.y++;\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4);\n\ + norm = alpha * tmpData0 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData1 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_para.x += 8;\n\ + norm = alpha * tmpData2 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData3 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +"; /* end of instance_normalization_u8_f16_vx*/ + static const char l2normalizescale_axis0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ #define VXC_Vstore3(Pointer, Offset, Data) \\\n\ @@ -8738,6 +11882,680 @@ __kernel void layer_norm_I16toI16_2D(\n\ }\n\ "; /* end of layer_normalization_i16_vx*/ +static const char layer_normalization_scale_f32_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +/**************************layernorm float16***********************************/\n\ +_viv_uniform int width;\n\ +_viv_uniform float dimRatio;\n\ +_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2;\n\ +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ +_viv_uniform VXC_512Bits uniExtractHalf4_dp4x4;\n\ +\n\ +__kernel void layer_norm_F16F32toF16(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale,\n\ + image2d_array_t output, float eps)\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = coord;\n\ +\n\ + int8 input_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ +\n\ + vxc_short8 src0;\n\ + vxc_float sum 
= 0, sqr = 0;\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + Image img1 = create_image_from_image2d(bias, 4);\n\ + Image img2 = create_image_from_image2d(scale, 4);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + for(coord.x = 8; coord.x < (width+8); coord.x += 8)\n\ + {\n\ + vxc_half8 val0_h;\n\ + _viv_asm(COPY, val0_h, src0, 16);\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + vxc_float4 sumsqr;\n\ + VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniFp16SumSqr_dp8x2);\n\ + sum += sumsqr.x;\n\ + sqr += sumsqr.y;\n\ + }\n\ + vxc_float mean;\n\ + mean = sum * dimRatio;\n\ + vxc_float vari;\n\ + vari = sqr*dimRatio - mean*mean;\n\ + vari += eps;\n\ + vari = rsqrt(vari);\n\ + vxc_float4 bias_f, scale_f, in_f;\n\ + __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord.ww);\n\ + __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord.ww);\n\ + for(coord.x = 0; coord.x < width; coord.x += 4)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + bias_f = vload4(0, bias_ptr + coord.x);\n\ + scale_f = vload4(0, scale_ptr + coord.x);\n\ + vxc_half8 in_h;\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + vxc_float4 sub, norm;\n\ + sub = in_f - mean;\n\ + norm = scale_f * vari * sub + bias_f;\n\ + half4 norm_h;\n\ + _viv_asm(CONV, norm_h, norm);\n\ + vxc_half8 dst;\n\ + VXC_DP4x4(dst, norm_h, norm_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniExtractHalf4_dp4x4);\n\ + vxc_short8 dstval;\n\ + _viv_asm(COPY, dstval, dst, 16);\n\ + coord_out.x = coord.x;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dstval, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +/*****************************layernorm uint8 to uint8****************************/\n\ +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniSumU8_16x1;\n\ +_viv_uniform VXC_512Bits uniSqrSum_16x1;\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform int inputZP;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float output_zp;\n\ +_viv_uniform int sumInZp;\n\ +_viv_uniform int tmpZp1;\n\ +_viv_uniform int tmpZp2;\n\ +_viv_uniform float e2InScale;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2;\n\ +_viv_uniform float dimRatio_scale;\n\ +\n\ +__kernel void layer_norm_U8F32toU8(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale,\n\ + image2d_array_t output, float eps)\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = coord;\n\ +\n\ + vxc_uchar16 src0, src2;\n\ + float sum = 0, sqr = 0;\n\ + vxc_float4 bias_f0, bias_f1, bias_f2, bias_f3, scale_f0, scale_f1, scale_f2, scale_f3;\n\ + int tmpSum = 0, tmpSqr = 0;\n\ + vxc_int4 tmpSum1;\n\ + vxc_int4 tmpSqr1;\n\ + short zp 
= inputZP;\n\ +\n\ + int8 input_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + for(coord.x = 0; coord.x < width; coord.x += 16)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ + tmpSum += (tmpSum1.x);\n\ + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);\n\ + tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x);\n\ + }\n\ + sum = (tmpSum + sumInZp) * input_scale;\n\ + sqr = (tmpSqr + tmpZp2) * e2InScale;\n\ +\n\ + float mean, vari;\n\ + mean = sum * dimRatio;\n\ + vari = sqr*dimRatio - mean*mean;\n\ + vari += eps;\n\ + vari = rsqrt(vari);\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ +\n\ + Image img1 = create_image_from_image2d(bias, 4);\n\ + Image img2 = create_image_from_image2d(scale, 4);\n\ + __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord.ww);\n\ + __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord.ww);\n\ + for(coord.x = 0; coord.x < width; coord.x += 16)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + bias_f0 = vload4(0, bias_ptr);\n\ + bias_f1 = vload4(1, bias_ptr);\n\ + bias_f2 = vload4(2, bias_ptr);\n\ + bias_f3 = vload4(3, bias_ptr);\n\ + scale_f0 = vload4(0, scale_ptr);\n\ + scale_f1 = vload4(1, scale_ptr);\n\ + scale_f2 = vload4(2, scale_ptr);\n\ + scale_f3 = vload4(3, scale_ptr);\n\ + bias_ptr += 16;\n\ + scale_ptr += 16;\n\ +\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert2ndUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert3rdUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert4thUint8SubZpToFp32_4x4);\n\ + tmpData0 *= input_scale;\n\ + tmpData1 *= input_scale;\n\ + tmpData2 *= input_scale;\n\ + tmpData3 *= input_scale;\n\ +\n\ + vxc_float4 norm;\n\ + tmpData0 -= mean;\n\ + norm = scale_f0 * vari * tmpData0 + bias_f0;\n\ + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ +\n\ + tmpData1 -= mean;\n\ + norm = scale_f1 * vari * tmpData1 + bias_f1;\n\ + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ +\n\ + tmpData2 -= mean;\n\ + norm = scale_f2 * vari * tmpData2 + bias_f2;\n\ + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ +\n\ + tmpData3 -= mean;\n\ + norm = scale_f3 * vari * tmpData3 + bias_f3;\n\ + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + coord_out.x = coord.x;\n\ + VXC_OP4_NoDest(img_store_3d, 
output, coord_out.xyww, src2, \\\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel void layer_norm_I16F32toI16(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale,\n\ + image2d_array_t output, float eps)\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);\n\ +\n\ + int8 input_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord.w, baseAddr);\n\ +\n\ + vxc_short8 src0, dst;\n\ + vxc_float sum = 0, sqr = 0;\n\ + for(; coord.x < width;)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.x += 8;\n\ + vxc_float4 sumsqr;\n\ + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniInt16SumSqr_dp8x2);\n\ + sum += sumsqr.x;\n\ + sqr = sqr + sumsqr.y * e2InScale;\n\ + }\n\ + vxc_float mean;\n\ + mean = sum * dimRatio_scale;\n\ + vxc_float vari;\n\ + vari = sqr*dimRatio - mean*mean;\n\ + vari += eps;\n\ + vari = rsqrt(vari);\n\ +\n\ + short zp = inputZP;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ +\n\ + int2 coord_bias = (int2)(0, 0);\n\ + Image img1 = create_image_from_image2d(bias, 4);\n\ + Image img2 = create_image_from_image2d(scale, 4);\n\ + __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord_bias);\n\ + __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord_bias);\n\ + for(coord.x = 0; coord.x < width; coord.x += 8)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + bias_f0 = vload4(0, bias_ptr);\n\ + bias_f1 = vload4(1, bias_ptr);\n\ + scale_f0 = vload4(0, scale_ptr);\n\ + scale_f1 = vload4(1, scale_ptr);\n\ + bias_ptr += 8;\n\ + scale_ptr += 8;\n\ +\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert2ndUint8SubZpToFp32_4x4);\n\ +\n\ + vxc_float4 sub, norm;\n\ + sub = tmpData0 * input_scale - mean;\n\ + norm = scale_f0 * vari * sub + bias_f0;\n\ + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ + sub = tmpData1 * input_scale - mean;\n\ + norm = scale_f1 * vari * sub + bias_f1;\n\ + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ +\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord.xyww, dst, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}"; /* end of layer_normalization_scale_f32_vx*/ + +static const char layer_normalization_scale_f32_2d_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +/**************************layernorm float16***********************************/\n\ +_viv_uniform int width;\n\ +_viv_uniform float dimRatio;\n\ +_viv_uniform VXC_512Bits uniFp16SumSqr_dp8x2;\n\ +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ +_viv_uniform VXC_512Bits uniExtractHalf4_dp4x4;\n\ +\n\ +__kernel void layer_norm_F16F32toF16_2D(\n\ + image2d_t input, 
image2d_t bias, image2d_t scale,\n\ + image2d_t output, float eps)\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(1), 0, 0);\n\ + vxc_short8 src0, src1;\n\ + vxc_float sum = 0, sqr = 0;\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + Image img1 = create_image_from_image2d(bias, 4);\n\ + Image img2 = create_image_from_image2d(scale, 4);\n\ +\n\ + for(coord.x = 8; coord.x < (width+8); coord.x += 8)\n\ + {\n\ + vxc_half8 val0_h;\n\ + _viv_asm(COPY, val0_h, src0, 16);\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + vxc_float4 sumsqr;\n\ + VXC_DP8x2(sumsqr, val0_h, val0_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniFp16SumSqr_dp8x2);\n\ + sum += sumsqr.x;\n\ + sqr += sumsqr.y;\n\ + }\n\ + vxc_float mean;\n\ + mean = sum * dimRatio;\n\ + vxc_float vari;\n\ + vari = sqr*dimRatio - mean*mean;\n\ + vari += eps;\n\ + vari = rsqrt(vari);\n\ + vxc_float4 bias_f, scale_f, in_f;\n\ + __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord.zw);\n\ + __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord.zw);\n\ + for(coord.x = 0; coord.x < width; coord.x += 4)\n\ + {\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + bias_f = vload4(0, bias_ptr + coord.x);\n\ + scale_f = vload4(0, scale_ptr + coord.x);\n\ +\n\ + vxc_half8 in_h;\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + VXC_DP4x4(in_f, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + vxc_float4 sub, norm;\n\ + sub = in_f - mean;\n\ + norm = scale_f * vari * sub + bias_f;\n\ + half4 norm_h;\n\ + _viv_asm(CONV, norm_h, norm);\n\ + vxc_half8 dst;\n\ + VXC_DP4x4(dst, norm_h, norm_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniExtractHalf4_dp4x4);\n\ + vxc_short8 dstval;\n\ + _viv_asm(COPY, dstval, dst, 16);\n\ + VXC_WriteImage(output, coord.xy, dstval, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +/*****************************layernorm uint8 to uint8****************************/\n\ +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert4thUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniSumU8_16x1;\n\ +_viv_uniform VXC_512Bits uniSqrSum_16x1;\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform int inputZP;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float output_zp;\n\ +_viv_uniform int sumInZp;\n\ +_viv_uniform int tmpZp1;\n\ +_viv_uniform int tmpZp2;\n\ +_viv_uniform float e2InScale;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniInt16SumSqr_dp8x2;\n\ +_viv_uniform float dimRatio_scale;\n\ +\n\ +__kernel void layer_norm_U8F32toU8_2D(\n\ + image2d_t input, image2d_t bias, image2d_t scale,\n\ + image2d_t output, float eps)\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(1), 0, 0);\n\ + vxc_uchar16 src0, src2;\n\ + float sum = 0, sqr = 0;\n\ + vxc_float4 bias_f0, bias_f1, bias_f2, bias_f3, scale_f0, scale_f1, scale_f2, scale_f3;\n\ + int tmpSum = 0, tmpSqr = 0;\n\ + vxc_int4 tmpSum1;\n\ + vxc_int4 tmpSqr1;\n\ + short zp = inputZP;\n\ +\n\ + for(coord.x = 0; coord.x < width; coord.x += 16)\n\ + {\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 
0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumU8_16x1);\n\ + tmpSum += (tmpSum1.x);\n\ + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSqrSum_16x1);\n\ + tmpSqr += (tmpSqr1.x + tmpZp1 * tmpSum1.x);\n\ + }\n\ + sum = (tmpSum + sumInZp) * input_scale;\n\ + sqr = (tmpSqr + tmpZp2) * e2InScale;\n\ +\n\ + float mean, vari;\n\ + mean = sum * dimRatio;\n\ + vari = sqr*dimRatio - mean*mean;\n\ + vari += eps;\n\ + vari = rsqrt(vari);\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3;\n\ +\n\ + Image img1 = create_image_from_image2d(bias, 4);\n\ + Image img2 = create_image_from_image2d(scale, 4);\n\ + __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord.zw);\n\ + __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord.zw);\n\ + for(coord.x = 0; coord.x < width; coord.x += 16)\n\ + {\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + bias_f0 = vload4(0, bias_ptr);\n\ + bias_f1 = vload4(1, bias_ptr);\n\ + bias_f2 = vload4(2, bias_ptr);\n\ + bias_f3 = vload4(3, bias_ptr);\n\ + scale_f0 = vload4(0, scale_ptr);\n\ + scale_f1 = vload4(1, scale_ptr);\n\ + scale_f2 = vload4(2, scale_ptr);\n\ + scale_f3 = vload4(3, scale_ptr);\n\ + bias_ptr += 16;\n\ + scale_ptr += 16;\n\ +\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert2ndUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert3rdUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert4thUint8SubZpToFp32_4x4);\n\ + tmpData0 = tmpData0 * input_scale - mean;\n\ + tmpData1 = tmpData1 * input_scale - mean;\n\ + tmpData2 = tmpData2 * input_scale - mean;\n\ + tmpData3 = tmpData3 * input_scale - mean;\n\ +\n\ + vxc_float4 norm;\n\ + norm = scale_f0 * vari * tmpData0 + bias_f0;\n\ + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ +\n\ + norm = scale_f1 * vari * tmpData1 + bias_f1;\n\ + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ +\n\ + norm = scale_f2 * vari * tmpData2 + bias_f2;\n\ + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ +\n\ + norm = scale_f3 * vari * tmpData3 + bias_f3;\n\ + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel void layer_norm_I16F32toI16_2D(\n\ + image2d_t input, image2d_t bias, image2d_t scale,\n\ + image2d_t output, float eps)\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(1), 0, 0);\n\ +\n\ + vxc_short8 src0, src1, dst;\n\ + vxc_float sum = 0, sqr = 0;\n\ + for(; coord.x < width;)\n\ + {\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.x += 8;\n\ + vxc_float4 sumsqr;\n\ + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ 
+ uniInt16SumSqr_dp8x2);\n\ + sum += sumsqr.x;\n\ + sqr = sqr + sumsqr.y * e2InScale;\n\ + }\n\ + vxc_float mean, vari;\n\ + mean = sum * dimRatio_scale;\n\ + vari = sqr * dimRatio - mean * mean;\n\ + vari += eps;\n\ + vari = rsqrt(vari);\n\ +\n\ + short zp = inputZP;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + vxc_half8 scale_h;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ +\n\ + Image img1 = create_image_from_image2d(bias, 4);\n\ + Image img2 = create_image_from_image2d(scale, 4);\n\ +\n\ + __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord.zw);\n\ + __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord.zw);\n\ + for(coord.x = 0; coord.x < width; coord.x += 8)\n\ + {\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + bias_f0 = vload4(0, bias_ptr);\n\ + bias_f1 = vload4(1, bias_ptr);\n\ + scale_f0 = vload4(0, scale_ptr);\n\ + scale_f1 = vload4(1, scale_ptr);\n\ + bias_ptr += 8;\n\ + scale_ptr += 8;\n\ +\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvert2ndUint8SubZpToFp32_4x4);\n\ +\n\ + vxc_float4 sub, norm;\n\ + sub = tmpData0 * input_scale - mean;\n\ + norm = scale_f0 * vari * sub + bias_f0;\n\ + tmpVal0 = convert_int4_rte(norm * outputScale + output_zp);\n\ + sub = tmpData1 * input_scale - mean;\n\ + norm = scale_f1 * vari * sub + bias_f1;\n\ + tmpVal1 = convert_int4_rte(norm * outputScale + output_zp);\n\ +\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}"; /* end of layer_normalization_scale_f32_2d_vx*/ + +static const char layer_normalization_scale_f32_bf16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +/**************************layernorm float16***********************************/\n\ +_viv_uniform int width;\n\ +_viv_uniform float dimRatio;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +\n\ +__kernel void layer_norm_BF16F32toBF16(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale,\n\ + image2d_array_t output, float eps)\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = coord;\n\ +\n\ + int8 input_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ +\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + float4 ones = (float4)(1.0, 1.0, 1.0, 1.0);\n\ + vxc_ushort8 src0, src1, src2;\n\ + vxc_float sum = 0, sqr = 0;\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + Image img1 = create_image_from_image2d(bias, 4);\n\ + Image img2 = create_image_from_image2d(scale, 4);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ + float4 srcA, srcB;\n\ + for(coord.x = 8; coord.x < (width+8); coord.x += 8)\n\ + {\n\ + VXC_DP2x8(src1, src0, 
zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + VXC_DP2x8(src2, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, srcA, src1, 16);\n\ + _viv_asm(COPY, srcB, src2, 16);\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + sum += dot(srcA, ones) + dot(srcB, ones);\n\ + sqr += dot(srcA * srcA, ones) + dot(srcB * srcB, ones);\n\ + }\n\ + vxc_float mean;\n\ + mean = sum * dimRatio;\n\ + vxc_float vari;\n\ + vari = sqr*dimRatio - mean*mean;\n\ + vari += eps;\n\ + vari = rsqrt(vari);\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord.ww);\n\ + __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord.ww);\n\ +\n\ + for(coord.x = 0; coord.x < width; coord.x += 8)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xyzz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + bias_f0 = vload4(0, bias_ptr);\n\ + bias_f1 = vload4(1, bias_ptr);\n\ + scale_f0 = vload4(0, scale_ptr);\n\ + scale_f1 = vload4(1, scale_ptr);\n\ + bias_ptr += 8;\n\ + scale_ptr += 8;\n\ +\n\ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + VXC_DP2x8(src2, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, srcA, src1, 16);\n\ + _viv_asm(COPY, srcB, src2, 16);\n\ +\n\ +\n\ + vxc_float4 sub0, sub1, norm0, norm1;\n\ + sub0 = srcA - mean;\n\ + sub1 = srcB - mean;\n\ + norm0 = scale_f0 * vari * sub0 + bias_f0;\n\ + norm1 = scale_f1 * vari * sub1 + bias_f1;\n\ +\n\ + _viv_asm(COPY, src0, norm0, 16);\n\ + _viv_asm(COPY, src1, norm1, 16);\n\ + VXC_DP2x8(src2, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ +\n\ + coord_out.x = coord.x;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, src2, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel void layer_norm_BF16F32toBF16_2D(\n\ + image2d_t input, image2d_t bias, image2d_t scale,\n\ + image2d_t output, float eps)\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(1), 0, 0);\n\ + vxc_ushort8 src0, src1, src2;\n\ + vxc_float sum = 0, sqr = 0;\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + float4 ones = (float4)(1.0, 1.0, 1.0, 1.0);\n\ + Image img1 = create_image_from_image2d(bias, 4);\n\ + Image img2 = create_image_from_image2d(scale, 4);\n\ + float4 srcA, srcB;\n\ + for(coord.x = 8; coord.x < (width+8); coord.x += 8)\n\ + {\n\ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + VXC_DP2x8(src2, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, srcA, src1, 16);\n\ + _viv_asm(COPY, srcB, src2, 16);\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + sum += dot(srcA, ones) + dot(srcB, ones);\n\ + sqr += dot(srcA * srcA, ones) + dot(srcB * srcB, ones);\n\ + }\n\ + vxc_float mean;\n\ + mean = sum * dimRatio;\n\ + vxc_float vari;\n\ + vari = sqr*dimRatio - mean*mean;\n\ + vari += eps;\n\ + vari = rsqrt(vari);\n\ + vxc_float4 bias_f0, bias_f1, 
scale_f0, scale_f1;\n\ + __global float* bias_ptr = (__global float*)get_image_ptr_from_coord(img1, coord.zw);\n\ + __global float* scale_ptr = (__global float*)get_image_ptr_from_coord(img2, coord.zw);\n\ + for(coord.x = 0; coord.x < width; coord.x += 8)\n\ + {\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + bias_f0 = vload4(0, bias_ptr);\n\ + bias_f1 = vload4(1, bias_ptr);\n\ + scale_f0 = vload4(0, scale_ptr);\n\ + scale_f1 = vload4(1, scale_ptr);\n\ + bias_ptr += 8;\n\ + scale_ptr += 8;\n\ +\n\ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + VXC_DP2x8(src2, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, srcA, src1, 16);\n\ + _viv_asm(COPY, srcB, src2, 16);\n\ +\n\ + vxc_float4 sub0, sub1, norm0, norm1;\n\ + sub0 = srcA - mean;\n\ + sub1 = srcB - mean;\n\ + norm0 = scale_f0 * vari * sub0 + bias_f0;\n\ + norm1 = scale_f1 * vari * sub1 + bias_f1;\n\ +\n\ + _viv_asm(COPY, src0, norm0, 16);\n\ + _viv_asm(COPY, src1, norm1, 16);\n\ + VXC_DP2x8(src2, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}"; /* end of layer_normalization_scale_f32_bf16_vx*/ + static const char layer_normalization_u8_f16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ /*****************************layernorm uint8 to fp16****************************/\n\ @@ -20838,6 +24656,213 @@ __kernel void moments_axis2_F16toF16(\n\ VXC_WriteImage(output_vari, coord_out, dst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ }"; /* end of moments_axis2_vx*/ +static const char one_hot_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniDataConvert_0_4x4;\n\ +_viv_uniform VXC_512Bits uniDataConvert_1_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform int depth;\n\ +#define ONE_HOT_SH_IMPL(name0, name1, src_type, copy_type, dst_type) \\\n\ +__kernel void one_hot_##name0##to##name1 \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int suffix_sz, \\\n\ + int on_val, \\\n\ + int off_val \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \\\n\ + \\\n\ + copy_type src; \\\n\ + src_type val; \\\n\ + \\\n\ + VXC_ReadImage(val, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src, val, 16); \\\n\ + \\\n\ + int4 data0, data1; \\\n\ + VXC_DP4x4(data0, src, src, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataConvert_0_4x4); \\\n\ + VXC_DP4x4(data1, src, src, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataConvert_1_4x4); \\\n\ + \\\n\ + do \\\n\ + { \\\n\ + int4 d0 = data0 == coord.zzzz ? on_val : off_val; \\\n\ + int4 d1 = data1 == coord.zzzz ? 
on_val : off_val; \\\n\ + \\\n\ + dst_type dst; \\\n\ + VXC_DP2x8(dst, d0, d1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ + \\\n\ + VXC_WriteImage2DArray(output, coord.xzyw, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + coord.z ++; \\\n\ + } while (coord.z < depth); \\\n\ +}\n\ +ONE_HOT_SH_IMPL(F16, F16, vxc_ushort8, vxc_half8, vxc_ushort8)\n\ +ONE_HOT_SH_IMPL(F16, I16, vxc_ushort8, vxc_half8, vxc_ushort8)\n\ +ONE_HOT_SH_IMPL(F16, I8, vxc_ushort8, vxc_half8, vxc_uchar8)\n\ +ONE_HOT_SH_IMPL(F16, U8, vxc_ushort8, vxc_half8, vxc_uchar8)\n\ +ONE_HOT_SH_IMPL(I16, F16, vxc_short8, vxc_short8, vxc_ushort8)\n\ +ONE_HOT_SH_IMPL(I16, I16, vxc_short8, vxc_short8, vxc_ushort8)\n\ +ONE_HOT_SH_IMPL(I8, F16, vxc_char8, vxc_char8, vxc_ushort8)\n\ +ONE_HOT_SH_IMPL(I8, I8, vxc_char8, vxc_char8, vxc_uchar8)\n\ +\n\ +#define ONE_HOT_SH_IMPL_2D(name0, name1, src_type, copy_type, dst_type) \\\n\ +__kernel void one_hot_##name0##to##name1##_2D \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int suffix_sz, \\\n\ + int on_val, \\\n\ + int off_val \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(0), get_global_id(0)); \\\n\ + \\\n\ + copy_type src; \\\n\ + src_type val; \\\n\ + \\\n\ + VXC_ReadImage(val, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src, val, 16); \\\n\ + \\\n\ + int4 data, data0, data1; \\\n\ + VXC_DP4x4(data, src, src, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataConvert_0_4x4); \\\n\ + int4 d4 = (int4)(0, 1, 2, 3); \\\n\ + \\\n\ + do \\\n\ + { \\\n\ + coord.zw = coord.xx + (int2)(0, 1); \\\n\ + dst_type dst; \\\n\ + data0 = data.xxxx == d4 ? on_val : off_val; \\\n\ + data1 = data.yyyy == d4 ? on_val : off_val; \\\n\ + \\\n\ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ + \\\n\ + VXC_WriteImage(output, coord.yz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(output, coord.yw, dst, VXC_MODIFIER(4, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord.zw = coord.zw + (int2)(2, 2); \\\n\ + \\\n\ + data0 = data.zzzz == d4 ? on_val : off_val; \\\n\ + data1 = data.wwww == d4 ? 
on_val : off_val; \\\n\ + \\\n\ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ + \\\n\ + VXC_WriteImage(output, coord.yz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(output, coord.yw, dst, VXC_MODIFIER(4, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + d4 += 4; \\\n\ + coord.y += 4; \\\n\ + } while (coord.y < depth); \\\n\ +}\n\ +ONE_HOT_SH_IMPL_2D(F16, F16, vxc_ushort8, vxc_half8, vxc_ushort8)\n\ +ONE_HOT_SH_IMPL_2D(F16, I16, vxc_ushort8, vxc_half8, vxc_ushort8)\n\ +ONE_HOT_SH_IMPL_2D(F16, I8, vxc_ushort8, vxc_half8, vxc_uchar8)\n\ +ONE_HOT_SH_IMPL_2D(F16, U8, vxc_ushort8, vxc_half8, vxc_uchar8)\n\ +ONE_HOT_SH_IMPL_2D(I16, F16, vxc_short8, vxc_short8, vxc_ushort8)\n\ +ONE_HOT_SH_IMPL_2D(I16, I16, vxc_short8, vxc_short8, vxc_ushort8)\n\ +ONE_HOT_SH_IMPL_2D(I8, F16, vxc_char8, vxc_char8, vxc_ushort8)\n\ +ONE_HOT_SH_IMPL_2D(I8, I8, vxc_char8, vxc_char8, vxc_uchar8)\n\ +\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform float input_tail;\n\ +#define ONE_HOT_ASYM_SH_IMPL(name0, name1, src_type, copy_type, dst_type) \\\n\ +__kernel void one_hot_##name0##to##name1 \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int suffix_sz, \\\n\ + int on_val, \\\n\ + int off_val \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \\\n\ + \\\n\ + copy_type src; \\\n\ + src_type val; \\\n\ + \\\n\ + VXC_ReadImage(val, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src, val, 16); \\\n\ + \\\n\ + int4 data0, data1; \\\n\ + float4 v0, v1; \\\n\ + VXC_DP4x4(v0, src, src, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataConvert_0_4x4); \\\n\ + VXC_DP4x4(v1, src, src, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataConvert_1_4x4); \\\n\ + \\\n\ + data0 = convert_int4(v0 * input_scale + input_tail); \\\n\ + data1 = convert_int4(v1 * input_scale + input_tail); \\\n\ + do \\\n\ + { \\\n\ + int4 d0 = data0 == coord.zzzz ? on_val : off_val; \\\n\ + int4 d1 = data1 == coord.zzzz ? 
on_val : off_val; \\\n\ + \\\n\ + dst_type dst; \\\n\ + VXC_DP2x8(dst, d0, d1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ + \\\n\ + VXC_WriteImage2DArray(output, coord.xzyw, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + coord.z ++; \\\n\ + } while (coord.z < depth); \\\n\ +}\n\ +ONE_HOT_ASYM_SH_IMPL(U8, F16, vxc_uchar8, vxc_uchar8, vxc_ushort8)\n\ +ONE_HOT_ASYM_SH_IMPL(U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8)\n\ +\n\ +#define ONE_HOT_ASYM_SH_IMPL_2D(name0, name1, src_type, copy_type, dst_type) \\\n\ +__kernel void one_hot_##name0##to##name1##_2D \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int suffix_sz, \\\n\ + int on_val, \\\n\ + int off_val \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(0), get_global_id(0)); \\\n\ + \\\n\ + copy_type src; \\\n\ + src_type val; \\\n\ + \\\n\ + VXC_ReadImage(val, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src, val, 16); \\\n\ + \\\n\ + int4 data, data0, data1; \\\n\ + float4 v0; \\\n\ + VXC_DP4x4(v0, src, src, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataConvert_0_4x4); \\\n\ + int4 d4 = (int4)(0, 1, 2, 3); \\\n\ + data = convert_int4(v0 * input_scale + input_tail); \\\n\ + \\\n\ + do \\\n\ + { \\\n\ + coord.zw = coord.xx + (int2)(0, 1); \\\n\ + dst_type dst; \\\n\ + data0 = data.xxxx == d4 ? on_val : off_val; \\\n\ + data1 = data.yyyy == d4 ? on_val : off_val; \\\n\ + \\\n\ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ + \\\n\ + VXC_WriteImage(output, coord.yz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(output, coord.yw, dst, VXC_MODIFIER(4, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + coord.zw = coord.zw + (int2)(2, 2); \\\n\ + \\\n\ + data0 = data.zzzz == d4 ? on_val : off_val; \\\n\ + data1 = data.wwww == d4 ? 
on_val : off_val; \\\n\ + \\\n\ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ + \\\n\ + VXC_WriteImage(output, coord.yz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(output, coord.yw, dst, VXC_MODIFIER(4, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + d4 += 4; \\\n\ + coord.y += 4; \\\n\ + } while (coord.y < depth); \\\n\ +}\n\ +ONE_HOT_ASYM_SH_IMPL_2D(U8, F16, vxc_uchar8, vxc_uchar8, vxc_ushort8)\n\ +ONE_HOT_ASYM_SH_IMPL_2D(U8, U8, vxc_uchar8, vxc_uchar8, vxc_uchar8)\n\ +\n\ +"; /* end of one_hot_vx*/ + static const char poolwithargmax_F16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ //-------------------max pooling with argmax---------------\n\ @@ -24667,10 +28692,16 @@ _viv_uniform int r_order;\n\ _viv_uniform int b_order;\n\ _viv_uniform VXC_512Bits uniExtractRtoF32_part0_4x4;\n\ _viv_uniform VXC_512Bits uniExtractRtoF32_part1_4x4;\n\ +_viv_uniform VXC_512Bits uniExtractRtoF32_part2_4x4;\n\ +_viv_uniform VXC_512Bits uniExtractRtoF32_part3_4x4;\n\ _viv_uniform VXC_512Bits uniExtractGtoF32_part0_4x4;\n\ _viv_uniform VXC_512Bits uniExtractGtoF32_part1_4x4;\n\ +_viv_uniform VXC_512Bits uniExtractGtoF32_part2_4x4;\n\ +_viv_uniform VXC_512Bits uniExtractGtoF32_part3_4x4;\n\ _viv_uniform VXC_512Bits uniExtractBtoF32_part0_4x4;\n\ _viv_uniform VXC_512Bits uniExtractBtoF32_part1_4x4;\n\ +_viv_uniform VXC_512Bits uniExtractBtoF32_part2_4x4;\n\ +_viv_uniform VXC_512Bits uniExtractBtoF32_part3_4x4;\n\ _viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ \n\ #define IMAGE_PRE_PROCESS_COPY_16BITS(dst_name, dst_type, copy_type, convert_type) \\\n\ @@ -24692,13 +28723,14 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \\\n\ { \\\n\ int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1)); \\\n\ \\\n\ - coord.xy += (int2) (*xOffset, *yOffset); \\\n\ + coord.xy = coord.xy + (int2) (*xOffset * 3 + 16, *yOffset); \\\n\ vxc_uchar16 src0, src1; \\\n\ dst_type dst0; \\\n\ copy_type dst; \\\n\ \\\n\ - VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(src1, input, coord.xy, VXC_5BITOFFSET_XY(15, 0), \\\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(-16, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src1, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ f32Var *= outputScale; \\\n\ @@ -24709,7 +28741,7 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \\\n\ float4 tmp0, tmp1; \\\n\ convert_type result0, result1; \\\n\ \\\n\ - VXC_DP4x4(tmp0, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part0_4x4); \\\n\ + VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part0_4x4); \\\n\ VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part1_4x4); \\\n\ tmp0 = tmp0 * paramData.w - paramData.x; \\\n\ tmp1 = tmp1 * paramData.w - paramData.x; \\\n\ @@ -24720,7 +28752,7 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \\\n\ VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ coord_out.z = 1; \\\n\ - VXC_DP4x4(tmp0, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part0_4x4); \\\n\ + VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part0_4x4); \\\n\ VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part1_4x4); \\\n\ tmp0 = 
tmp0 * paramData.w - paramData.y; \\\n\ tmp1 = tmp1 * paramData.w - paramData.y; \\\n\ @@ -24731,7 +28763,7 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \\\n\ VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ coord_out.z = b_order; \\\n\ - VXC_DP4x4(tmp0, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part0_4x4); \\\n\ + VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part0_4x4); \\\n\ VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part1_4x4); \\\n\ tmp0 = tmp0 * paramData.w - paramData.z; \\\n\ tmp1 = tmp1 * paramData.w - paramData.z; \\\n\ @@ -24762,12 +28794,16 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \\\n\ ) \\\n\ { \\\n\ int2 coord = (int2)(get_global_id(0) * 3, get_global_id(1)); \\\n\ - coord.xy += (int2) (*xOffset, *yOffset); \\\n\ - vxc_uchar16 src0, src1; \\\n\ + coord.xy = coord.xy + (int2) (*xOffset * 3 + 16, *yOffset); \\\n\ + vxc_uchar16 src0, src1, src2; \\\n\ dst_type dst; \\\n\ \\\n\ - VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_ReadImage(src1, input, coord.xy, VXC_5BITOFFSET_XY(15, 0), \\\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(-16, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src1, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x += 16; \\\n\ + VXC_ReadImage(src2, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ f32Var *= outputScale; \\\n\ @@ -24778,35 +28814,55 @@ __kernel void pre_process_rgb_copy_U8to##dst_name \\\n\ float4 tmp0, tmp1; \\\n\ int4 result0, result1; \\\n\ \\\n\ - VXC_DP4x4(tmp0, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part0_4x4); \\\n\ + VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part0_4x4); \\\n\ VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part1_4x4); \\\n\ tmp0 = tmp0 * paramData.w - paramData.x; \\\n\ tmp1 = tmp1 * paramData.w - paramData.x; \\\n\ result0 = convert_int4_rte(tmp0); \\\n\ result1 = convert_int4_rte(tmp1); \\\n\ VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - \\\n\ + VXC_DP4x4(tmp0, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part2_4x4); \\\n\ + VXC_DP4x4(tmp1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractRtoF32_part3_4x4); \\\n\ + tmp0 = tmp0 * paramData.w - paramData.x; \\\n\ + tmp1 = tmp1 * paramData.w - paramData.x; \\\n\ + result0 = convert_int4_rte(tmp0); \\\n\ + result1 = convert_int4_rte(tmp1); \\\n\ + VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ coord_out.z = 1; \\\n\ - VXC_DP4x4(tmp0, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part0_4x4); \\\n\ + VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part0_4x4); \\\n\ VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part1_4x4); \\\n\ tmp0 = tmp0 * 
paramData.w - paramData.y; \\\n\ tmp1 = tmp1 * paramData.w - paramData.y; \\\n\ result0 = convert_int4_rte(tmp0); \\\n\ result1 = convert_int4_rte(tmp1); \\\n\ VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(tmp0, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part2_4x4); \\\n\ + VXC_DP4x4(tmp1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractGtoF32_part3_4x4); \\\n\ + tmp0 = tmp0 * paramData.w - paramData.y; \\\n\ + tmp1 = tmp1 * paramData.w - paramData.y; \\\n\ + result0 = convert_int4_rte(tmp0); \\\n\ + result1 = convert_int4_rte(tmp1); \\\n\ + VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ \\\n\ coord_out.z = b_order; \\\n\ - VXC_DP4x4(tmp0, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part0_4x4); \\\n\ + VXC_DP4x4(tmp0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part0_4x4); \\\n\ VXC_DP4x4(tmp1, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part1_4x4); \\\n\ tmp0 = tmp0 * paramData.w - paramData.z; \\\n\ tmp1 = tmp1 * paramData.w - paramData.z; \\\n\ result0 = convert_int4_rte(tmp0); \\\n\ result1 = convert_int4_rte(tmp1); \\\n\ VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(tmp0, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part2_4x4); \\\n\ + VXC_DP4x4(tmp1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractBtoF32_part3_4x4); \\\n\ + tmp0 = tmp0 * paramData.w - paramData.z; \\\n\ + tmp1 = tmp1 * paramData.w - paramData.z; \\\n\ + result0 = convert_int4_rte(tmp0); \\\n\ + result1 = convert_int4_rte(tmp1); \\\n\ + VXC_DP2x8(dst, result0, result1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ IMAGE_PRE_PROCESS_COPY_8BITS(U8, vxc_uchar16)\n\ IMAGE_PRE_PROCESS_COPY_8BITS(I8, vxc_char16)\n\ @@ -24867,7 +28923,7 @@ __kernel void pre_process_yuv420_copy_U8toU8(\n\ )\n\ {\n\ int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0);\n\ - int4 pos1 = (int4)((get_global_id(0) + (*xOffset)) >> 1, (get_global_id(1) + (*yOffset)) >> 1, 0, 0);\n\ + int2 pos1 = (int2)((get_global_id(0) + (*xOffset)) >> 1, (get_global_id(1) + (*yOffset)) >> 1);\n\ vxc_uchar16 Y;\n\ vxc_uchar8 U, V;\n\ vxc_int4 C0, C1, C2, C3;\n\ @@ -24946,6 +29002,112 @@ __kernel void pre_process_yuv420_copy_U8toU8(\n\ VXC_WriteImage2DArray(output, pos, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ }\n\ \n\ +__kernel void pre_process_yuv420_copy_U8toF16(\n\ + __read_only image2d_t y_img,\n\ + __read_only image2d_t u_img,\n\ + __read_only image2d_t v_img,\n\ + __write_only image2d_array_t output,\n\ + global int * xRatio,\n\ + global int * yRatio,\n\ + global int * xOffset,\n\ + global int * yOffset,\n\ + float rMean,\n\ + float gMean,\n\ + float bMean,\n\ + float var,\n\ + int reverse_channel,\n\ + int trans\n\ + )\n\ +{\n\ + int4 pos = (int4)(get_global_id(0) + (*xOffset), 
get_global_id(1) + (*yOffset), 0, 0);\n\ + int2 pos1 = (int2)((get_global_id(0) + (*xOffset)) >> 1, (get_global_id(1) + (*yOffset)) >> 1);\n\ + vxc_uchar16 Y;\n\ + vxc_uchar8 U, V;\n\ + vxc_int4 C0, C1, C2, C3;\n\ + vxc_uchar16 R, G, B;\n\ + vxc_half8 dst0, dst1, dst2, dst3, dst4, dst5;\n\ + vxc_short8 out0, out1, out2, out3, out4, out5;\n\ +\n\ + VXC_ReadImage(Y, y_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(U, u_img, pos1.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, pos1.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + //C = Y - 16;\n\ + //D = U - 128;\n\ + //E = V - 128;\n\ + // calculate R\n\ + // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8]\n\ + int tmpV = -56992;\n\ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR1st_4x4);\n\ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR2nd_4x4);\n\ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR3rd_4x4);\n\ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR4th_4x4);\n\ +\n\ + VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ + VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ + VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ + VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ +\n\ + // calculate G\n\ + // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8]\n\ + // 298Y - 208V\n\ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG1st_4x4);\n\ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG2nd_4x4);\n\ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG3rd_4x4);\n\ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG4th_4x4);\n\ + // 34784 - 100U\n\ + ushort tmpG = 34784;\n\ + vxc_ushort8 tmpDstG;\n\ + VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8);\n\ + VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4);\n\ + VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4);\n\ + VXC_DP4x4(G, C2, tmpDstG, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateG3rd_4x4);\n\ + VXC_DP4x4(G, C3, tmpDstG, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateG4th_4x4);\n\ +\n\ + // calculate B\n\ + // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8]\n\ + VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB1st_4x4);\n\ + VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB2nd_4x4);\n\ + VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB3rd_4x4);\n\ + VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB4th_4x4);\n\ + tmpV = -70688;\n\ + VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ + VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ + VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ + 
VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ +\n\ + float4 paramData = (float4)(bMean * var, gMean * var,\\\n\ + rMean * var, var);\n\ + half4 paramData_f16;\n\ + _viv_asm(CONV, paramData_f16, paramData);\n\ +\n\ + VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8);\n\ + VXC_DP2x8(dst1, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8);\n\ +\n\ + VXC_DP2x8(dst2, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8);\n\ + VXC_DP2x8(dst3, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8);\n\ +\n\ + VXC_DP2x8(dst4, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8);\n\ + VXC_DP2x8(dst5, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8);\n\ +\n\ + _viv_asm(COPY, out0, dst0, 16);\n\ + _viv_asm(COPY, out1, dst1, 16);\n\ + _viv_asm(COPY, out2, dst2, 16);\n\ + _viv_asm(COPY, out3, dst3, 16);\n\ + _viv_asm(COPY, out4, dst4, 16);\n\ + _viv_asm(COPY, out5, dst5, 16);\n\ +\n\ + pos = (int4)(get_global_id(0), get_global_id(1), bOrder, get_global_id(0) + 8);\n\ + VXC_WriteImage2DArray(output, pos.xyzz, out0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage2DArray(output, pos.wyzz, out1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + pos.z = 1;\n\ + VXC_WriteImage2DArray(output, pos.xyzz, out2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage2DArray(output, pos.wyzz, out3, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + pos.z = rOrder;\n\ + VXC_WriteImage2DArray(output, pos.xyzz, out4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage2DArray(output, pos.wyzz, out5, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ "; /* end of pre_process_yuv420_copy_u8_vx*/ static const char pre_process_yuv420_scale_fp16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -25919,7 +30081,7 @@ __kernel void pre_process_yuv444_copy_U8toU8(\n\ int trans\n\ )\n\ {\n\ - int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0);\n\ + int2 pos = (int2)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset));\n\ vxc_uchar16 Y, U, V;\n\ vxc_int4 C0, C1, C2, C3;\n\ vxc_uchar16 R, G, B;\n\ @@ -25990,13 +30152,118 @@ __kernel void pre_process_yuv444_copy_U8toU8(\n\ VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8);\n\ VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8);\n\ \n\ - pos = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ - pos.z = bOrder;\n\ - VXC_WriteImage2DArray(output, pos, dst0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - pos.z = 1;\n\ - VXC_WriteImage2DArray(output, pos, dst1, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - pos.z = rOrder;\n\ - VXC_WriteImage2DArray(output, pos, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + int4 pos1 = (int4)(get_global_id(0), get_global_id(1), bOrder, 0);\n\ + VXC_WriteImage2DArray(output, pos1, dst0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + pos1.z = 1;\n\ + VXC_WriteImage2DArray(output, pos1, dst1, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + pos1.z = rOrder;\n\ + VXC_WriteImage2DArray(output, pos1, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void pre_process_yuv444_copy_U8toF16(\n\ + __read_only image2d_t 
y_img,\n\ + __read_only image2d_t u_img,\n\ + __read_only image2d_t v_img,\n\ + __write_only image2d_array_t output,\n\ + global int * xRatio,\n\ + global int * yRatio,\n\ + global int * xOffset,\n\ + global int * yOffset,\n\ + float rMean,\n\ + float gMean,\n\ + float bMean,\n\ + float var,\n\ + int reverse_channel,\n\ + int trans\n\ + )\n\ +{\n\ + int2 pos = (int2)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset));\n\ + vxc_uchar16 Y, U, V;\n\ + vxc_int4 C0, C1, C2, C3;\n\ + vxc_uchar16 R, G, B;\n\ + vxc_half8 dst0, dst1, dst2, dst3, dst4, dst5;\n\ + vxc_short8 out0, out1, out2, out3, out4, out5;\n\ +\n\ + VXC_ReadImage(Y, y_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(U, u_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(V, v_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + //C = Y - 16;\n\ + //D = U - 128;\n\ + //E = V - 128;\n\ + // calculate R\n\ + // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8]\n\ + int tmpV = -56992;\n\ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR1st_4x4);\n\ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR2nd_4x4);\n\ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR3rd_4x4);\n\ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR4th_4x4);\n\ +\n\ + VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ + VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ + VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ + VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ +\n\ + // calculate G\n\ + // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8]\n\ + // 298Y - 208V\n\ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG1st_4x4);\n\ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG2nd_4x4);\n\ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG3rd_4x4);\n\ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG4th_4x4);\n\ + // 34784 - 100U\n\ + ushort tmpG = 34784;\n\ + vxc_ushort8 tmpDstG0, tmpDstG1;\n\ + VXC_DP2x8(tmpDstG0, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8);\n\ + VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2_2x8);\n\ +\n\ + VXC_DP4x4(G, C0, tmpDstG0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4);\n\ + VXC_DP4x4(G, C1, tmpDstG0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4);\n\ + VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4);\n\ + VXC_DP4x4(G, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4);\n\ +\n\ + // calculate B\n\ + // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8]\n\ + VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB1st_4x4);\n\ + VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB2nd_4x4);\n\ + VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB3rd_4x4);\n\ + 
VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB4th_4x4);\n\ + tmpV = -70688;\n\ + VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ + VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ + VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ + VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ +\n\ + float4 paramData = (float4)(bMean * var, gMean * var,\\\n\ + rMean * var, var);\n\ + half4 paramData_f16;\n\ + _viv_asm(CONV, paramData_f16, paramData);\n\ +\n\ + VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8);\n\ + VXC_DP2x8(dst1, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8);\n\ +\n\ + VXC_DP2x8(dst2, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8);\n\ + VXC_DP2x8(dst3, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8);\n\ +\n\ + VXC_DP2x8(dst4, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8);\n\ + VXC_DP2x8(dst5, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8);\n\ +\n\ + _viv_asm(COPY, out0, dst0, 16);\n\ + _viv_asm(COPY, out1, dst1, 16);\n\ + _viv_asm(COPY, out2, dst2, 16);\n\ + _viv_asm(COPY, out3, dst3, 16);\n\ + _viv_asm(COPY, out4, dst4, 16);\n\ + _viv_asm(COPY, out5, dst5, 16);\n\ +\n\ + int4 pos1 = (int4)(get_global_id(0), get_global_id(1), bOrder, get_global_id(0) + 8);\n\ + VXC_WriteImage2DArray(output, pos1.xyzz, out0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage2DArray(output, pos1.wyzz, out1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + pos1.z = 1;\n\ + VXC_WriteImage2DArray(output, pos1.xyzz, out2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage2DArray(output, pos1.wyzz, out3, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + pos1.z = rOrder;\n\ + VXC_WriteImage2DArray(output, pos1.xyzz, out4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage2DArray(output, pos1.wyzz, out5, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ }\n\ "; /* end of pre_process_yuv444_copy_u8_vx*/ @@ -29189,6 +33456,465 @@ TENSOR_KERAS_RELU(U8, U8, _2D, Image, U8toF32, F32toU8, vxc_uchar TENSOR_KERAS_RELU(U8, F16, _2D, Image, U8toF32, F32toF16, vxc_uchar8, vxc_ushort8)\n\ "; /* end of relu_keras_vx*/ +static const char repeat_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniIntegralHorAcc_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract1to8Short_2x8;\n\ +_viv_uniform int width;\n\ +\n\ +// workgroup size is 32\n\ +__kernel void preprocess_start_idx(image2d_t input, image2d_t output)\n\ +{\n\ + int lidx = get_local_id(0);\n\ + __local int lcl_sum[32];\n\ + __local int last_round[1];\n\ + Image img = create_image_from_image2d(input, 4);\n\ + Image dst = create_image_from_image2d(output, 4);\n\ + __global int* index_ptr = (__global int*)img.ptr + get_global_id(0);\n\ + __global int* output_org = (__global int*)dst.ptr;\n\ + __global int* output_ptr = output_org + get_global_id(0) + 1;\n\ +\n\ + if (lidx == 0)\n\ + {\n\ + last_round[0] = 0;\n\ + output_org[0] = 0;\n\ + }\n\ + int4 accSum0, accSum1, accSum2, accSum3;\n\ +\n\ + for(int i = 0; i < width; i += 512)\n\ + {\n\ + int4 data0 = vload4(0, index_ptr + i);\n\ + int4 data1 = vload4(1, index_ptr + 
i);\n\ + int4 data2 = vload4(2, index_ptr + i);\n\ + int4 data3 = vload4(3, index_ptr + i);\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ + int prevSum = last_round[0];\n\ +\n\ + VXC_DP4x4(accSum0, data0, data0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniIntegralHorAcc_4x4);\n\ + VXC_DP4x4(accSum1, data1, data1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniIntegralHorAcc_4x4);\n\ + VXC_DP4x4(accSum2, data2, data2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniIntegralHorAcc_4x4);\n\ + VXC_DP4x4(accSum3, data3, data3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniIntegralHorAcc_4x4);\n\ + accSum1 += accSum0.w;\n\ + accSum2 += accSum1.w;\n\ + accSum3 += accSum2.w;\n\ +\n\ + lcl_sum[lidx] = accSum3.w;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + for(int j = 0; j < lidx; j++)\n\ + {\n\ + prevSum += lcl_sum[j];\n\ + }\n\ + accSum0 += prevSum;\n\ + accSum1 += prevSum;\n\ + accSum2 += prevSum;\n\ + accSum3 += prevSum;\n\ + if(lidx == 31)\n\ + {\n\ + last_round[0] = accSum3.w;\n\ + }\n\ + vstore4(accSum0, 0, output_ptr + i);\n\ + vstore4(accSum1, 1, output_ptr + i);\n\ + vstore4(accSum2, 2, output_ptr + i);\n\ + vstore4(accSum3, 3, output_ptr + i);\n\ + }\n\ +}\n\ +\n\ +__kernel void repeat_I16_axis0(\n\ + image2d_array_t input0, image2d_t input1, image2d_t input2,\n\ + image2d_array_t output, int axis)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + vxc_short8 src0;\n\ + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + Image img1 = create_image_from_image2d(input1, 4);\n\ + Image img2 = create_image_from_image2d(input2, 4);\n\ + __global int* len_ptr = (__global int*)img1.ptr;\n\ + __global int* index_ptr = (__global int*)img2.ptr;\n\ +\n\ + int len = len_ptr[get_global_id(1)];\n\ + int start = index_ptr[get_global_id(1)];\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr);\n\ + int end = len + start;\n\ +\n\ + for(coord.y = start; coord.y < end; coord.y++)\n\ + {\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, src0, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel void repeat_I16_axis2(\n\ + image2d_array_t input0, image2d_t input1, image2d_t input2,\n\ + image2d_array_t output, int axis)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + vxc_short8 src0;\n\ + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + Image img1 = create_image_from_image2d(input1, 4);\n\ + Image img2 = create_image_from_image2d(input2, 4);\n\ + __global int* len_ptr = (__global int*)img1.ptr;\n\ + __global int* index_ptr = (__global int*)img2.ptr;\n\ +\n\ + int len = len_ptr[get_global_id(2)];\n\ + int start = index_ptr[get_global_id(2)];\n\ + int end = len + start;\n\ +\n\ + for(coord.z = start; coord.z < end; coord.z++)\n\ + {\n\ + VXC_WriteImage2DArray(output, coord, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +#define REPEAT_1D(src0_type_name, data_type) \\\n\ +__kernel void repeat_##src0_type_name##_1D( \\\n\ + image2d_t input0, image2d_t input1, image2d_t input2, \\\n\ + image2d_t output, int axis) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), 0); \\\n\ + data_type src0; \\\n\ + VXC_ReadImage(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + 
VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + Image img1 = create_image_from_image2d(input1, 4); \\\n\ + Image img2 = create_image_from_image2d(input2, 4); \\\n\ + __global int* len_ptr = (__global int*)img1.ptr; \\\n\ + __global int* index_ptr = (__global int*)img2.ptr; \\\n\ + int len = len_ptr[get_global_id(0)]; \\\n\ + int start = index_ptr[get_global_id(0)]; \\\n\ + \\\n\ + int iter = len >> 3; \\\n\ + int res = len & 7; \\\n\ + int end = start + iter * 8; \\\n\ + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8); \\\n\ + for(coord.x = start; coord.x < end; coord.x+=8) \\\n\ + { \\\n\ + VXC_WriteImage(output, coord, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + \\\n\ + if(res == 7) \\\n\ + { \\\n\ + VXC_WriteImage(output, coord, src0, VXC_MODIFIER(0, 6, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + else if(res == 6) \\\n\ + { \\\n\ + VXC_WriteImage(output, coord, src0, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + else if(res == 5) \\\n\ + { \\\n\ + VXC_WriteImage(output, coord, src0, VXC_MODIFIER(0, 4, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + else if(res == 4) \\\n\ + { \\\n\ + VXC_WriteImage(output, coord, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + else if(res == 3) \\\n\ + { \\\n\ + VXC_WriteImage(output, coord, src0, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + else if(res == 2) \\\n\ + { \\\n\ + VXC_WriteImage(output, coord, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + else if(res == 1) \\\n\ + { \\\n\ + VXC_WriteImage(output, coord, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +REPEAT_1D(U8, vxc_uchar16)\n\ +REPEAT_1D(I16, vxc_short8)\n\ +\n\ +__kernel void repeat_U8_axis0(\n\ + image2d_array_t input0, image2d_t input1, image2d_t input2,\n\ + image2d_array_t output, int axis)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + vxc_uchar16 src0;\n\ + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + Image img1 = create_image_from_image2d(input1, 4);\n\ + Image img2 = create_image_from_image2d(input2, 4);\n\ + __global int* len_ptr = (__global int*)img1.ptr;\n\ + __global int* index_ptr = (__global int*)img2.ptr;\n\ +\n\ + int len = len_ptr[get_global_id(1)];\n\ + int start = index_ptr[get_global_id(1)];\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr);\n\ + int end = len + start;\n\ +\n\ + for(coord.y = start; coord.y < end; coord.y++)\n\ + {\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, src0, \\\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel void repeat_U8_axis2(\n\ + image2d_array_t input0, image2d_t input1, image2d_t input2,\n\ + image2d_array_t output, int axis)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + vxc_uchar16 src0;\n\ + VXC_ReadImage2DArray(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + Image img1 = create_image_from_image2d(input1, 4);\n\ + Image img2 = create_image_from_image2d(input2, 4);\n\ + __global int* len_ptr = (__global int*)img1.ptr;\n\ + __global int* index_ptr = (__global int*)img2.ptr;\n\ +\n\ + int len = len_ptr[get_global_id(2)];\n\ + int start = 
index_ptr[get_global_id(2)];\n\ + int end = len + start;\n\ +\n\ + for(coord.z = start; coord.z < end; coord.z++)\n\ + {\n\ + VXC_WriteImage2DArray(output, coord, src0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}"; /* end of repeat_vx*/ + +static const char repeat_axis1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniExtract1to8Short_2x8;\n\ +\n\ +#define REPEAT_RES(end_pos) \\\n\ +coord.y = gidy; \\\n\ +VXC_OP4_NoDest(img_store_3d, output, coord, src0, VXC_MODIFIER(0, end_pos, 0,VXC_RM_TowardZero, 0)); \\\n\ +coord.y++; \\\n\ +VXC_OP4_NoDest(img_store_3d, output, coord, src1, VXC_MODIFIER(0, end_pos, 0,VXC_RM_TowardZero, 0)); \\\n\ +coord.y++; \\\n\ +VXC_OP4_NoDest(img_store_3d, output, coord, src2, VXC_MODIFIER(0, end_pos, 0,VXC_RM_TowardZero, 0)); \\\n\ +coord.y++; \\\n\ +VXC_OP4_NoDest(img_store_3d, output, coord, src3, VXC_MODIFIER(0, end_pos, 0,VXC_RM_TowardZero, 0)); \\\n\ +coord.y++; \\\n\ +VXC_OP4_NoDest(img_store_3d, output, coord, src4, VXC_MODIFIER(0, end_pos, 0,VXC_RM_TowardZero, 0)); \\\n\ +coord.y++; \\\n\ +VXC_OP4_NoDest(img_store_3d, output, coord, src5, VXC_MODIFIER(0, end_pos, 0,VXC_RM_TowardZero, 0)); \\\n\ +coord.y++; \\\n\ +VXC_OP4_NoDest(img_store_3d, output, coord, src6, VXC_MODIFIER(0, end_pos, 0,VXC_RM_TowardZero, 0)); \\\n\ +coord.y++; \\\n\ +VXC_OP4_NoDest(img_store_3d, output, coord, src7, VXC_MODIFIER(0, end_pos, 0,VXC_RM_TowardZero, 0));\n\ +\n\ +__kernel void repeat_I16_axis1(\n\ + image2d_array_t input0, image2d_t input1, image2d_t input2,\n\ + image2d_array_t output, int axis)\n\ +{\n\ + int gidy = get_global_id(1);\n\ + int4 coord = (int4)(get_global_id(0), gidy, get_global_id(2), 0);\n\ + vxc_short8 src0, src1, src2, src3, src4, src5, src6, src7;\n\ +\n\ + int8 input_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ +\n\ + VXC_OP4(img_load_3d, src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input0, coord, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src2, input0, coord, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src3, input0, coord, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src4, input0, coord, VXC_5BITOFFSET_XY(0, 4),\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src5, input0, coord, VXC_5BITOFFSET_XY(0, 5),\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src6, input0, coord, VXC_5BITOFFSET_XY(0, 6),\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src7, input0, coord, VXC_5BITOFFSET_XY(0, 7),\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + Image img1 = create_image_from_image2d(input1, 4);\n\ + Image img2 = create_image_from_image2d(input2, 4);\n\ + __global int* len_ptr = (__global int*)img1.ptr;\n\ + __global int* index_ptr = (__global int*)img2.ptr;\n\ +\n\ + int len = len_ptr[get_global_id(0)];\n\ + int start = index_ptr[get_global_id(0)];\n\ +\n\ + _viv_asm(MOV, coord.z, baseAddr);\n\ + int iter = len >> 3;\n\ + int res = len & 7;\n\ + coord.x = start;\n\ +\n\ + 
VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8);\n\ + VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8);\n\ + VXC_DP2x8(src2, src2, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8);\n\ + VXC_DP2x8(src3, src3, src3, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8);\n\ + VXC_DP2x8(src4, src4, src4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8);\n\ + VXC_DP2x8(src5, src5, src5, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8);\n\ + VXC_DP2x8(src6, src6, src6, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8);\n\ + VXC_DP2x8(src7, src7, src7, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8);\n\ +\n\ + for(int i = 0; i < iter; i++)\n\ + {\n\ + coord.y = gidy;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, src0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, src1, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, src2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, src3, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, src4, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, src5, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, src6, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, src7, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord.x += 8;\n\ + }\n\ +\n\ + if(res == 7)\n\ + {\n\ + REPEAT_RES(6)\n\ + }\n\ + else if(res == 6)\n\ + {\n\ + REPEAT_RES(5)\n\ + }\n\ + else if(res == 5)\n\ + {\n\ + REPEAT_RES(4)\n\ + }\n\ + else if(res == 4)\n\ + {\n\ + REPEAT_RES(3)\n\ + }\n\ + else if(res == 3)\n\ + {\n\ + REPEAT_RES(2)\n\ + }\n\ + else if(res == 2)\n\ + {\n\ + REPEAT_RES(1)\n\ + }\n\ + else if(res == 1)\n\ + {\n\ + REPEAT_RES(0)\n\ + }\n\ +}\n\ +\n\ +__kernel void repeat_U8_axis1(\n\ + image2d_array_t input0, image2d_t input1, image2d_t input2,\n\ + image2d_array_t output, int axis)\n\ +{\n\ + int gidy = get_global_id(1);\n\ + int4 coord = (int4)(get_global_id(0), gidy, get_global_id(2), 0);\n\ + vxc_uchar16 src0, src1, src2, src3, src4, src5, src6, src7;\n\ +\n\ + int8 input_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input0, sizeof(input_desc));\n\ + int baseAddr_a = (int)get_global_id(2) * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr_a);\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)get_global_id(2) * output_desc.s4 + output_desc.s0;\n\ +\n\ + VXC_OP4(img_load_3d, src0, input0, coord, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input0, coord, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src2, input0, coord, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src3, input0, coord, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src4, input0, coord, VXC_5BITOFFSET_XY(0, 4),\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + 
VXC_OP4(img_load_3d, src5, input0, coord, VXC_5BITOFFSET_XY(0, 5),\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src6, input0, coord, VXC_5BITOFFSET_XY(0, 6),\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src7, input0, coord, VXC_5BITOFFSET_XY(0, 7),\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + Image img1 = create_image_from_image2d(input1, 4);\n\ + Image img2 = create_image_from_image2d(input2, 4);\n\ + __global int* len_ptr = (__global int*)img1.ptr;\n\ + __global int* index_ptr = (__global int*)img2.ptr;\n\ +\n\ + int len = len_ptr[get_global_id(0)];\n\ + int start = index_ptr[get_global_id(0)];\n\ +\n\ + _viv_asm(MOV, coord.z, baseAddr);\n\ + int iter = len >> 3;\n\ + int res = len & 7;\n\ + coord.x = start;\n\ +\n\ + VXC_DP2x8(src0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8);\n\ + VXC_DP2x8(src1, src1, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8);\n\ + VXC_DP2x8(src2, src2, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8);\n\ + VXC_DP2x8(src3, src3, src3, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8);\n\ + VXC_DP2x8(src4, src4, src4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8);\n\ + VXC_DP2x8(src5, src5, src5, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8);\n\ + VXC_DP2x8(src6, src6, src6, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8);\n\ + VXC_DP2x8(src7, src7, src7, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract1to8Short_2x8);\n\ +\n\ + for(int i = 0; i < iter; i++)\n\ + {\n\ + coord.y = gidy;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, src0, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, src1, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, src2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, src3, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, src4, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, src5, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, src6, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, src7, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord.x += 8;\n\ + }\n\ +\n\ + if(res == 7)\n\ + {\n\ + REPEAT_RES(6)\n\ + }\n\ + else if(res == 6)\n\ + {\n\ + REPEAT_RES(5)\n\ + }\n\ + else if(res == 5)\n\ + {\n\ + REPEAT_RES(4)\n\ + }\n\ + else if(res == 4)\n\ + {\n\ + REPEAT_RES(3)\n\ + }\n\ + else if(res == 3)\n\ + {\n\ + REPEAT_RES(2)\n\ + }\n\ + else if(res == 2)\n\ + {\n\ + REPEAT_RES(1)\n\ + }\n\ + else if(res == 1)\n\ + {\n\ + REPEAT_RES(0)\n\ + }\n\ +}\n\ +\n\ +"; /* end of repeat_axis1_vx*/ + static const char resize_1d_bilinear_BF16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform float scale_x;\n\ @@ -31997,13 +36723,13 @@ __kernel void resize_bilinear_U8toU8_DOWN\n\ }\n\ "; /* end of resize_bilinear_U8_vx*/ -static const char resize_bilinear_U8_UP_2X_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +static const char resize_bilinear_U8_half_pixel_centers_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ -_viv_uniform VXC_512Bits uniResize2xUp_4x8;\n\ 
-_viv_uniform VXC_512Bits uniResize2xUpRound_2x8;\n\ +_viv_uniform VXC_512Bits uniResize2xUp_0_4x8;\n\ +_viv_uniform VXC_512Bits uniResize2xUp_1_4x8;\n\ _viv_uniform int out_height;\n\ \n\ -__kernel void resize_bilinear_U8toU8_UP_2X_half\n\ +__kernel void resize_bilinear_U8toU8_SAME_2x_upsample_half_pixel_centers\n\ (\n\ __read_only image2d_array_t input,\n\ __write_only image2d_array_t output,\n\ @@ -32017,7 +36743,6 @@ __kernel void resize_bilinear_U8toU8_UP_2X_half\n\ coord_in.x = coord_out.x == 0 ? -1 : coord_in.x;\n\ \n\ vxc_uchar16 in0, in1, tmp, result;\n\ - vxc_ushort8 result_s, round_s = 8;\n\ \n\ int8 input_desc;\n\ _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ @@ -32035,34 +36760,199 @@ __kernel void resize_bilinear_U8toU8_UP_2X_half\n\ \n\ while (coord_out.y < out_height)\n\ {\n\ - VXC_DP4x8(result_s, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniResize2xUp_4x8);\n\ - VXC_DP2x8(result, result_s, round_s, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniResize2xUpRound_2x8);\n\ + VXC_DP4x8(result, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8);\n\ + VXC_DP4x8(result, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8);\n\ VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result,\n\ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ coord_out.y++;\n\ VXC_OP4(img_load_3d, in0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 2),\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_DP4x8(result_s, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniResize2xUp_4x8);\n\ - VXC_DP2x8(result, result_s, round_s, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniResize2xUpRound_2x8);\n\ + VXC_DP4x8(result, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8);\n\ + VXC_DP4x8(result, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8);\n\ VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result,\n\ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ coord_out.y++;\n\ - VXC_DP4x8(result_s, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniResize2xUp_4x8);\n\ - VXC_DP2x8(result, result_s, round_s, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniResize2xUpRound_2x8);\n\ + VXC_DP4x8(result, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8);\n\ + VXC_DP4x8(result, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8);\n\ VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result,\n\ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ VXC_OP4(img_load_3d, in1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 3),\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ coord_out.y++;\n\ - VXC_DP4x8(result_s, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniResize2xUp_4x8);\n\ - VXC_DP2x8(result, result_s, round_s, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniResize2xUpRound_2x8);\n\ + VXC_DP4x8(result, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_0_4x8);\n\ + VXC_DP4x8(result, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize2xUp_1_4x8);\n\ VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, result,\n\ - VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ coord_in.y += 2;\n\ coord_out.y++;\n\ }\n\ }\n\ -"; /* end of resize_bilinear_U8_UP_2X_vx*/ +\n\ 
+_viv_uniform VXC_512Bits uniResize4xUp_l00_4x8;\n\ +_viv_uniform VXC_512Bits uniResize4xUp_l01_4x8;\n\ +_viv_uniform VXC_512Bits uniResize4xUp_l10_4x8;\n\ +_viv_uniform VXC_512Bits uniResize4xUp_l11_4x8;\n\ +__kernel void resize_bilinear_U8toU8_SAME_4x_upsample_half_pixel_centers\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), 0, get_global_id(1), 0);\n\ + int4 coord_in = (int4)(get_global_id(0), -1, get_global_id(1), 0);\n\ + coord_in.x = (coord_out.x * 2 - 3) >> 3;\n\ + coord_in.x = coord_out.x == 0 ? -1 : coord_in.x;\n\ +\n\ + vxc_uchar16 in0, in1, tmp, dst0, dst1;\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ + VXC_OP4(img_load_3d, in0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, in1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + while (coord_out.y < out_height)\n\ + {\n\ + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8);\n\ + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8);\n\ + VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8);\n\ + VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8);\n\ + VXC_OP4(img_load_3d, in0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst1,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8);\n\ + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8);\n\ + VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8);\n\ + VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst1,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8);\n\ + VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8);\n\ + VXC_DP4x8(dst1, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8);\n\ + VXC_DP4x8(dst1, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8);\n\ + VXC_OP4(img_load_3d, in1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0,\n\ + 
VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst1,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l10_4x8);\n\ + VXC_DP4x8(dst0, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l11_4x8);\n\ + VXC_DP4x8(dst1, in1, in0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l00_4x8);\n\ + VXC_DP4x8(dst1, in1, in0, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize4xUp_l01_4x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst1,\n\ + VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_in.y += 2;\n\ + coord_out.y++;\n\ + }\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniResize3xUp_l00_2x8;\n\ +_viv_uniform VXC_512Bits uniResize3xUp_l01_2x8;\n\ +_viv_uniform VXC_512Bits uniResize3xUp_l10_4x4;\n\ +_viv_uniform VXC_512Bits uniResize3xUp_l11_4x4;\n\ +_viv_uniform VXC_512Bits uniResize3xUp_l12_4x4;\n\ +_viv_uniform VXC_512Bits uniResize3xUp_l13_4x4;\n\ +__kernel void resize_bilinear_U8toU8_SAME_3x_upsample_half_pixel_centers\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + coord_in.x = (short)(coord_out.x * 2 - 1) / (short)6;\n\ + coord_in.x = coord_out.x == 0 ? -1 : coord_in.x;\n\ + coord_in.y = (short)(coord_out.y * 2 - 1) / (short)6;\n\ + coord_in.y = coord_out.y == 0 ? 
-1 : coord_in.y;\n\ +\n\ + vxc_uchar16 in0, in1, in2, in3, tmp, dst0, dst1, dst2;\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ + VXC_OP4(img_load_3d, in0, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, in1, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, in2, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, in3, input, coord_in.xyww, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + VXC_DP4x4(dst2, in1, in0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4);\n\ + VXC_DP4x4(dst2, in1, in0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4);\n\ + VXC_DP4x4(dst2, in1, in0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4);\n\ + VXC_DP4x4(dst2, in1, in0, VXC_MODIFIER(12, 14, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst2,\n\ + VXC_MODIFIER(0, 14, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ +\n\ + VXC_DP2x8(dst0, in1, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l00_2x8);\n\ + VXC_DP2x8(dst0, in1, in1, VXC_MODIFIER(8, 14, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l01_2x8);\n\ + VXC_DP4x4(dst1, in1, in2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4);\n\ + VXC_DP4x4(dst1, in1, in2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4);\n\ + VXC_DP4x4(dst1, in1, in2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4);\n\ + VXC_DP4x4(dst1, in1, in2, VXC_MODIFIER(12, 14, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4);\n\ + VXC_DP4x4(dst2, in2, in1, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4);\n\ + VXC_DP4x4(dst2, in2, in1, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4);\n\ + VXC_DP4x4(dst2, in2, in1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4);\n\ + VXC_DP4x4(dst2, in2, in1, VXC_MODIFIER(12, 14, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0,\n\ + VXC_MODIFIER(0, 14, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst1,\n\ + VXC_MODIFIER(0, 14, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst2,\n\ + VXC_MODIFIER(0, 14, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ +\n\ + VXC_DP2x8(dst0, in2, in2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l00_2x8);\n\ + VXC_DP2x8(dst0, in2, in2, VXC_MODIFIER(8, 14, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l01_2x8);\n\ + VXC_DP4x4(dst1, in2, in3, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l10_4x4);\n\ + VXC_DP4x4(dst1, in2, in3, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l11_4x4);\n\ + VXC_DP4x4(dst1, in2, in3, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l12_4x4);\n\ + VXC_DP4x4(dst1, in2, 
in3, VXC_MODIFIER(12, 14, 0, VXC_RM_ToNearestEven, 1), uniResize3xUp_l13_4x4);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst0,\n\ + VXC_MODIFIER(0, 14, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xyww, dst1,\n\ + VXC_MODIFIER(0, 14, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of resize_bilinear_U8_half_pixel_centers_vx*/ static const char resize_bilinear_U8_opt_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -32832,6 +37722,398 @@ __kernel void select_I8_U8_U8toU8_2D(\n\ }\n\ "; /* end of select_vx*/ +static const char sequence_mask_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform int inputZP;\n\ +_viv_uniform int output_ZP;\n\ +_viv_uniform float outputVal1;\n\ +\n\ +#define SEQUENCE_MASK_QINT_TO_QINT_2D(src0_type_name, src1_type_name, read_type, write_type) \\\n\ +__kernel void sequence_mask_##src0_type_name##to##src1_type_name##_2D( \\\n\ + image2d_t input, image2d_t output, int maxLen) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int2 coord = (int2)(gidx, get_global_id(1)); \\\n\ + read_type src0; \\\n\ + VXC_ReadImage(src0, input, coord.yy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + int4 outIdx = (int4)(gidx, gidx + 1, gidx + 2, gidx + 3); \\\n\ + float4 tmpData; \\\n\ + short zp = inputZP; \\\n\ + VXC_DP4x4(tmpData, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvert1stUint8SubZpToFp32_4x4); \\\n\ + int index = convert_int_rte(tmpData.s0 * input_scale); \\\n\ + int4 data; \\\n\ + data = outIdx < index? convert_int_rte(outputVal1) : output_ZP; \\\n\ + write_type dst; \\\n\ + VXC_DP2x8(dst, data, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \\\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +SEQUENCE_MASK_QINT_TO_QINT_2D(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +SEQUENCE_MASK_QINT_TO_QINT_2D(I8, I8, vxc_char16, vxc_char16)\n\ +SEQUENCE_MASK_QINT_TO_QINT_2D(I16, I16, vxc_short8, vxc_short8)\n\ +SEQUENCE_MASK_QINT_TO_QINT_2D(I8, U8, vxc_char16, vxc_uchar16)\n\ +SEQUENCE_MASK_QINT_TO_QINT_2D(I16, U8, vxc_short8, vxc_uchar16)\n\ +\n\ +#define SEQUENCE_MASK_QINT_TO_QINT(src0_type_name, src1_type_name, read_type, write_type) \\\n\ +__kernel void sequence_mask_##src0_type_name##to##src1_type_name( \\\n\ + image2d_t input, image2d_array_t output, int maxLen) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int4 coord = (int4)(gidx, get_global_id(1), get_global_id(2), 0); \\\n\ + read_type src0; \\\n\ + VXC_ReadImage(src0, input, coord.yz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + int4 outIdx = (int4)(gidx, gidx + 1, gidx + 2, gidx + 3); \\\n\ + float4 tmpData; \\\n\ + short zp = inputZP; \\\n\ + VXC_DP4x4(tmpData, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvert1stUint8SubZpToFp32_4x4); \\\n\ + int index = convert_int_rte(tmpData.s0 * input_scale); \\\n\ + int4 data; \\\n\ + data = outIdx < index? 
convert_int_rte(outputVal1) : output_ZP; \\\n\ + write_type dst; \\\n\ + VXC_DP2x8(dst, data, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +SEQUENCE_MASK_QINT_TO_QINT(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +SEQUENCE_MASK_QINT_TO_QINT(I8, I8, vxc_char16, vxc_char16)\n\ +SEQUENCE_MASK_QINT_TO_QINT(I16, I16, vxc_short8, vxc_short8)\n\ +SEQUENCE_MASK_QINT_TO_QINT(I16, U8, vxc_short8, vxc_uchar16)\n\ +SEQUENCE_MASK_QINT_TO_QINT(I8, U8, vxc_char16, vxc_uchar16)\n\ +\n\ +__kernel void sequence_mask_F16toF16_2D(\n\ + image2d_t input, image2d_t output, int maxLen)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int2 coord = (int2)(gidx, get_global_id(1));\n\ + vxc_short8 src0;\n\ + vxc_half8 in_h;\n\ + VXC_ReadImage(src0, input, coord.yy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + int4 outIdx = (int4)(gidx, gidx + 1, gidx + 2, gidx + 3);\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + float4 tmpData;\n\ + VXC_DP4x4(tmpData, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + int index = convert_int_rte(tmpData.x);\n\ + float4 data;\n\ + data = outIdx < index? outputVal1 : convert_float(output_ZP);\n\ + vxc_short8 dst;\n\ + half4 tmpVal;\n\ + _viv_asm(CONV, tmpVal, data);\n\ + _viv_asm(COPY, dst, tmpVal, 16);\n\ + VXC_WriteImage(output, coord, dst.s0246, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void sequence_mask_F16toF16(\n\ + image2d_t input, image2d_t output, int maxLen)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int4 coord = (int4)(gidx, get_global_id(1), get_global_id(2), 0);\n\ + vxc_short8 src0;\n\ + vxc_half8 in_h;\n\ + VXC_ReadImage(src0, input, coord.yz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + int4 outIdx = (int4)(gidx, gidx + 1, gidx + 2, gidx + 3);\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + float4 tmpData;\n\ + VXC_DP4x4(tmpData, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + int index = convert_int_rte(tmpData.x);\n\ + float4 data;\n\ + data = outIdx < index? outputVal1 : convert_float(output_ZP);\n\ + vxc_short8 dst;\n\ + half4 tmpVal;\n\ + _viv_asm(CONV, tmpVal, data);\n\ + _viv_asm(COPY, dst, tmpVal, 16);\n\ + VXC_WriteImage2DArray(output, coord, dst.s0246, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void sequence_mask_F16toU8_2D(\n\ + image2d_t input, image2d_t output, int maxLen)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int2 coord = (int2)(gidx, get_global_id(1));\n\ + vxc_short8 src0;\n\ + vxc_half8 in_h;\n\ + VXC_ReadImage(src0, input, coord.yy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + int4 outIdx = (int4)(gidx, gidx + 1, gidx + 2, gidx + 3);\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + float4 tmpData;\n\ + VXC_DP4x4(tmpData, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + int index = convert_int_rte(tmpData.x);\n\ + int4 data;\n\ + data = outIdx < index? 
convert_int_rte(outputVal1) : output_ZP;\n\ + vxc_uchar16 dst;\n\ + VXC_DP2x8(dst, data, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void sequence_mask_F16toU8(\n\ + image2d_t input, image2d_t output, int maxLen)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int4 coord = (int4)(gidx, get_global_id(1), get_global_id(2), 0);\n\ + vxc_short8 src0;\n\ + vxc_half8 in_h;\n\ + VXC_ReadImage(src0, input, coord.yz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + int4 outIdx = (int4)(gidx, gidx + 1, gidx + 2, gidx + 3);\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + float4 tmpData;\n\ + VXC_DP4x4(tmpData, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + int index = convert_int_rte(tmpData.x);\n\ + int4 data;\n\ + data = outIdx < index? convert_int_rte(outputVal1) : output_ZP;\n\ + vxc_uchar16 dst;\n\ + VXC_DP2x8(dst, data, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +"; /* end of sequence_mask_vx*/ + +static const char slice_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +#define SLICE_SAMLEFL_SH_IMPL(name, data_type, end_bin) \\\n\ +__kernel void slice_##name##_I32to##name##_SAMEFL \\\n\ + ( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_array_t output, \\\n\ + int is_samefl \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + int4 coord_in; \\\n\ + Image begin_img = create_image_from_image2d(input1, 4); \\\n\ + uchar* begin_ptr = begin_img.ptr; \\\n\ + int4 begin = ((int4 *)begin_ptr)[0]; \\\n\ + \\\n\ + coord_in = coord + begin; \\\n\ + data_type src; \\\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, end_bin, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, end_bin, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +SLICE_SAMLEFL_SH_IMPL(U8, vxc_uchar16, 15)\n\ +SLICE_SAMLEFL_SH_IMPL(I16, vxc_short8, 7)\n\ +\n\ +\n\ +#define SLICE_SAMLEFL_2D_SH_IMPL(name, data_type, end_bin) \\\n\ +__kernel void slice_##name##_I32to##name##_SAMEFL_2D \\\n\ + ( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_array_t output, \\\n\ + int is_samefl \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + int2 coord_in; \\\n\ + Image begin_img = create_image_from_image2d(input1, 4); \\\n\ + uchar* begin_ptr = begin_img.ptr; \\\n\ + int2 begin = ((int2 *)begin_ptr)[0]; \\\n\ + \\\n\ + coord_in = coord + begin; \\\n\ + data_type src; \\\n\ + VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, end_bin, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(output, coord, src, VXC_MODIFIER(0, end_bin, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +SLICE_SAMLEFL_2D_SH_IMPL(U8, vxc_uchar16, 15)\n\ +SLICE_SAMLEFL_2D_SH_IMPL(I16, vxc_short8, 7)\n\ +\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift_Lo_2x8;\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift_Hi_2x8;\n\ +_viv_uniform int2 multAndoutZP;//[0:15] multiplier, [31:63] output zp\n\ +#define SLICE_8BITSTO16BITS(name0, name1, src_type, dst_type, save_type) \\\n\ +__kernel void slice_##name0##_I32to##name1 \\\n\ + ( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only 
image2d_t input1, \\\n\ + __write_only image2d_array_t output, \\\n\ + int is_samefl \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + src_type src; \\\n\ + dst_type dst0; \\\n\ + int4 coord_in; \\\n\ + Image begin_img = create_image_from_image2d(input1, 4); \\\n\ + uchar* begin_ptr = begin_img.ptr; \\\n\ + int4 begin = ((int4 *)begin_ptr)[0]; \\\n\ + \\\n\ + coord_in = coord + begin; \\\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + vxc_ushort8 multiplier; \\\n\ + _viv_asm(COPY, multiplier, multAndoutZP, 16); \\\n\ + VXC_DP2x8(dst0, src, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_Lo_2x8); \\\n\ + save_type dst; \\\n\ + _viv_asm(COPY, dst, dst0, 16); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +SLICE_8BITSTO16BITS(I8, F16, vxc_char16, vxc_half8, vxc_short8)\n\ +SLICE_8BITSTO16BITS(U8, F16, vxc_uchar16, vxc_half8, vxc_short8)\n\ +\n\ +#define SLICE_8BITSTO16BITS_2D(name0, name1, src_type, dst_type, save_type) \\\n\ +__kernel void slice_##name0##_I32to##name1##_2D \\\n\ + ( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_array_t output, \\\n\ + int is_samefl \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + src_type src; \\\n\ + dst_type dst0; \\\n\ + int2 coord_in; \\\n\ + Image begin_img = create_image_from_image2d(input1, 4); \\\n\ + uchar* begin_ptr = begin_img.ptr; \\\n\ + int2 begin = ((int2 *)begin_ptr)[0]; \\\n\ + \\\n\ + coord_in = coord + begin; \\\n\ + VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + vxc_ushort8 multiplier; \\\n\ + _viv_asm(COPY, multiplier, multAndoutZP, 16); \\\n\ + VXC_DP2x8(dst0, src, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_Lo_2x8); \\\n\ + save_type dst; \\\n\ + _viv_asm(COPY, dst, dst0, 16); \\\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +SLICE_8BITSTO16BITS_2D(I8, F16, vxc_char16, vxc_half8, vxc_short8)\n\ +SLICE_8BITSTO16BITS_2D(U8, F16, vxc_uchar16, vxc_half8, vxc_short8)\n\ +\n\ +#define SLICE_8BITSTO8BITS(name0, name1, src_type, dst_type) \\\n\ +__kernel void slice_##name0##_I32to##name1 \\\n\ + ( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_array_t output, \\\n\ + int is_samefl \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + src_type src; \\\n\ + dst_type dst; \\\n\ + int4 coord_in; \\\n\ + Image begin_img = create_image_from_image2d(input1, 4); \\\n\ + uchar* begin_ptr = begin_img.ptr; \\\n\ + int4 begin = ((int4 *)begin_ptr)[0]; \\\n\ + \\\n\ + coord_in = coord + begin; \\\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + vxc_ushort8 multiplier; \\\n\ + _viv_asm(COPY, multiplier, multAndoutZP, 16); \\\n\ + VXC_DP2x8(dst, src, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_Lo_2x8); \\\n\ + VXC_DP2x8(dst, src, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_Hi_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \\\n\ +}\n\ 
+SLICE_8BITSTO8BITS(I8, I8, vxc_char16, vxc_char16)\n\ +SLICE_8BITSTO8BITS(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +\n\ +#define SLICE_8BITSTO8BITS_2D(name0, name1, src_type, dst_type) \\\n\ +__kernel void slice_##name0##_I32to##name1##_2D \\\n\ + ( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_array_t output, \\\n\ + int is_samefl \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + src_type src; \\\n\ + dst_type dst; \\\n\ + int2 coord_in; \\\n\ + Image begin_img = create_image_from_image2d(input1, 4); \\\n\ + uchar* begin_ptr = begin_img.ptr; \\\n\ + int2 begin = ((int2 *)begin_ptr)[0]; \\\n\ + \\\n\ + coord_in = coord + begin; \\\n\ + VXC_ReadImage(src, input0, coord_in, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + vxc_ushort8 multiplier; \\\n\ + _viv_asm(COPY, multiplier, multAndoutZP, 16); \\\n\ + VXC_DP2x8(dst, src, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_Lo_2x8); \\\n\ + VXC_DP2x8(dst, src, multiplier, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_Hi_2x8); \\\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +SLICE_8BITSTO8BITS_2D(I8, I8, vxc_char16, vxc_char16)\n\ +SLICE_8BITSTO8BITS_2D(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +\n\ +#define SLICE_16BITS_TO(name0, name1, src_type, copy_type, dst_type) \\\n\ +__kernel void slice_##name0##_I32to##name1 \\\n\ + ( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_array_t output, \\\n\ + int is_samefl \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + src_type src; \\\n\ + copy_type src0; \\\n\ + dst_type dst; \\\n\ + int4 coord_in; \\\n\ + Image begin_img = create_image_from_image2d(input1, 4); \\\n\ + uchar* begin_ptr = begin_img.ptr; \\\n\ + int4 begin = ((int4 *)begin_ptr)[0]; \\\n\ + \\\n\ + coord_in = coord + begin; \\\n\ + VXC_ReadImage2DArray(src0, input0, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src, src0, 16); \\\n\ + \\\n\ + vxc_ushort8 multiplier; \\\n\ + _viv_asm(COPY, multiplier, multAndoutZP, 16); \\\n\ + VXC_DP2x8(dst, src, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_Lo_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +SLICE_16BITS_TO(F16, I8, vxc_half8, vxc_short8, vxc_char16)\n\ +SLICE_16BITS_TO(F16, U8, vxc_half8, vxc_short8, vxc_uchar16)\n\ +SLICE_16BITS_TO(F16, I16, vxc_half8, vxc_short8, vxc_short8)\n\ +\n\ +#define SLICE_16BITS_TO_2D(name0, name1, src_type, copy_type, dst_type) \\\n\ +__kernel void slice_##name0##_I32to##name1##_2D \\\n\ + ( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_array_t output, \\\n\ + int is_samefl \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + src_type src; \\\n\ + copy_type src0; \\\n\ + dst_type dst; \\\n\ + int2 coord_in; \\\n\ + Image begin_img = create_image_from_image2d(input1, 4); \\\n\ + uchar* begin_ptr = begin_img.ptr; \\\n\ + int2 begin = ((int2 *)begin_ptr)[0]; \\\n\ + \\\n\ + coord_in = coord + begin; \\\n\ + VXC_ReadImage(src0, input0, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src, src0, 16); \\\n\ + \\\n\ + 
vxc_ushort8 multiplier; \\\n\ + _viv_asm(COPY, multiplier, multAndoutZP, 16); \\\n\ + VXC_DP2x8(dst, src, multiplier, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_Lo_2x8); \\\n\ + VXC_WriteImage(output, coord.xy, dst, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +SLICE_16BITS_TO_2D(F16, I8, vxc_half8, vxc_short8, vxc_char16)\n\ +SLICE_16BITS_TO_2D(F16, U8, vxc_half8, vxc_short8, vxc_uchar16)\n\ +SLICE_16BITS_TO_2D(F16, I16, vxc_half8, vxc_short8, vxc_short8)"; /* end of slice_vx*/ + static const char space2depth_internal_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniExtractEvenUint8Stride2_2x8;\n\ @@ -33245,6 +38527,43 @@ TILE_2D(I16, I16, 6, 5, vxc_short8)\n\ TILE_2D(I16, I16, 7, 6, vxc_short8)\n\ TILE_2D(I16, I16, 0, 7, vxc_short8)\n\ \n\ +#define TILE_2D_1TON(name0, name1, type) \\\n\ +__kernel void tile_1toN_##name0##to##name1##_2D( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int batchIn, \\\n\ + int depthIn, \\\n\ + int depthOut, \\\n\ + int multiples_0, \\\n\ + int multiples_1, \\\n\ + int multiples_2, \\\n\ + int multiples_3 \\\n\ +) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + int width = get_image_width(input); \\\n\ + int height = get_image_height(input); \\\n\ + int output_width = get_image_width(output); \\\n\ + int output_height = get_image_height(output); \\\n\ + type src; \\\n\ + VXC_ReadImage(src, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + do \\\n\ + { \\\n\ + do \\\n\ + { \\\n\ + VXC_WriteImage(output, coord, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x += 8; \\\n\ + } while (coord.x < output_width); \\\n\ + coord.x = 0; \\\n\ + coord.y += height; \\\n\ + } while (coord.y < output_height); \\\n\ +}\n\ +TILE_2D_1TON(U8, U8, vxc_uchar8)\n\ +TILE_2D_1TON(I16, I16, vxc_short8)\n\ +\n\ +\n\ \n\ "; /* end of tile_vx*/ @@ -34604,16 +39923,6 @@ UPSAMPLE_SCALETO16B_FUN(I16, F16, vxc_short8, vxc_short8, vxc_half8, vxc_sho UPSAMPLE_SCALETO16B_FUN(I16, I16, vxc_short8, vxc_short8, vxc_short8, vxc_short8)\n\ "; /* end of upsamplescale_k2_vx*/ -static const char vsi_nn_kernel_axis_aligned_bbox_transform_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -__kernel void vxcAxis_aligned_bbox_transform(\n\ - __read_only image2d_array_t input,\n\ - __write_only image2d_array_t output)\n\ -{\n\ -\n\ -}\n\ -"; /* end of vsi_nn_kernel_axis_aligned_bbox_transform_vx*/ - static const char vsi_nn_kernel_box_with_nms_limit_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ __kernel void vxcBox_with_nms_limit(\n\ @@ -34679,16 +39988,6 @@ __kernel void vxcExtra_ending_u8(\n\ }\n\ "; /* end of vsi_nn_kernel_extra_ending_vx*/ -static const char vsi_nn_kernel_generate_proposals_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -__kernel void vxcGenerate_proposals(\n\ - __read_only image2d_array_t input,\n\ - __write_only image2d_array_t output)\n\ -{\n\ -\n\ -}\n\ -"; /* end of vsi_nn_kernel_generate_proposals_vx*/ - static const char vsi_nn_kernel_header_vx[] = "/*\n\ ============================================================================\n\ Name : libNNExt.vx\n\ @@ -34700,6 +39999,62 @@ static const char vsi_nn_kernel_header_vx[] = "/*\n\ */\n\ #include \"cl_viv_vx_ext.h\"\n\ \n\ +typedef struct Image\n\ +{\n\ + __global uchar *ptr;\n\ + int stride_x;\n\ + int stride_y;\n\ +} Image;\n\ +\n\ +inline uchar* get_image_ptr_from_coord(Image img, int2 coord)\n\ +{\n\ + return 
img.ptr + coord.x * img.stride_x + coord.y * img.stride_y;\n\ +}\n\ +\n\ +inline Image create_image_from_image2d(image2d_t input, int stride_x)\n\ +{\n\ + int8 desc;\n\ + _viv_asm(COPY, desc, input, sizeof(desc));\n\ +\n\ + Image img =\n\ + {\n\ + .ptr = (uchar*)desc.s0,\n\ + .stride_x = stride_x,\n\ + .stride_y = desc.s1\n\ + };\n\ +\n\ + return img;\n\ +}\n\ +\n\ +typedef struct Tensor\n\ +{\n\ + __global uchar *ptr;\n\ + int stride_x;\n\ + int stride_y;\n\ + int stride_z;\n\ +} Tensor;\n\ +\n\ +inline uchar* create_tensor_ptr_from_coord(Tensor t, int4 coord)\n\ +{\n\ + return t.ptr + coord.x * t.stride_x + coord.y * t.stride_y + coord.z * t.stride_z;\n\ +}\n\ +\n\ +inline Tensor create_tensor_from_image2d_array(image2d_array_t input, int stride_x)\n\ +{\n\ + int8 desc;\n\ + _viv_asm(COPY, desc, input, sizeof(desc));\n\ +\n\ + Tensor t =\n\ + {\n\ + .ptr = (uchar*)desc.s0,\n\ + .stride_x = stride_x,\n\ + .stride_y = desc.s1,\n\ + .stride_z = desc.s4\n\ + };\n\ +\n\ + return t;\n\ +}\n\ +\n\ #if (VX_VERSION==1)\n\ #define VXC_DP2x8_b_(dst, src0, src1, src2, info, uniform)\\\n\ do\\\n\ @@ -36666,16 +42021,6 @@ __kernel void vxcTensorStackConcat8Bits(\n\ VXC_WriteImage2DArray(output, coord, src1, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ }"; /* end of vsi_nn_kernel_tensorstackconcat_vx*/ -static const char vsi_nn_kernel_topk_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -__kernel void vxcTopk(\n\ - __read_only image2d_array_t input,\n\ - __write_only image2d_array_t output)\n\ -{\n\ -\n\ -}\n\ -"; /* end of vsi_nn_kernel_topk_vx*/ - static const char vsi_nn_kernel_transform_gemm_vx[] = "/*\n\ ============================================================================\n\ Name : gemm.vx\n\ @@ -38334,6 +43679,62 @@ __kernel void detect_post_box_U8_U8toF32(\n\ static const char eltwise_ops_helper_cl[] = "#pragma OPENCL EXTENSION CL_VIV_asm : enable\n\ #pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ \n\ +typedef struct Image\n\ +{\n\ + __global uchar *ptr;\n\ + int stride_x;\n\ + int stride_y;\n\ +} Image;\n\ +\n\ +inline uchar* get_image_ptr_from_coord(Image img, int2 coord)\n\ +{\n\ + return img.ptr + coord.x * img.stride_x + coord.y * img.stride_y;\n\ +}\n\ +\n\ +inline Image create_image_from_image2d(image2d_t input, int stride_x)\n\ +{\n\ + int8 desc;\n\ + _viv_asm(COPY, desc, input, sizeof(desc));\n\ +\n\ + Image img =\n\ + {\n\ + .ptr = (uchar*)desc.s0,\n\ + .stride_x = stride_x,\n\ + .stride_y = desc.s1\n\ + };\n\ +\n\ + return img;\n\ +}\n\ +\n\ +typedef struct Tensor\n\ +{\n\ + __global uchar *ptr;\n\ + int stride_x;\n\ + int stride_y;\n\ + int stride_z;\n\ +} Tensor;\n\ +\n\ +inline uchar* create_tensor_ptr_from_coord(Tensor t, int4 coord)\n\ +{\n\ + return t.ptr + coord.x * t.stride_x + coord.y * t.stride_y + coord.z * t.stride_z;\n\ +}\n\ +\n\ +inline Tensor create_tensor_from_image2d_array(image2d_array_t input, int stride_x)\n\ +{\n\ + int8 desc;\n\ + _viv_asm(COPY, desc, input, sizeof(desc));\n\ +\n\ + Tensor t =\n\ + {\n\ + .ptr = (uchar*)desc.s0,\n\ + .stride_x = stride_x,\n\ + .stride_y = desc.s1,\n\ + .stride_z = desc.s4\n\ + };\n\ +\n\ + return t;\n\ +}\n\ +\n\ #define readImage2DArray(Dest, Image, Coord) \\\n\ do { \\\n\ int8 desc; \\\n\ @@ -38431,6 +43832,11 @@ float4 eltwise_unary_mish(float4 x, float alpha)\n\ return x;\n\ }\n\ \n\ +float4 eltwise_unary_round(float4 x, float alpha)\n\ +{\n\ + return convert_float4(convert_int4_rte(x));\n\ +}\n\ +\n\ #define ELTWISE_UNARY_F32(func_name) \\\n\ __kernel void func_name##_F32toF32 \\\n\ ( \\\n\ @@ -38458,6 
+43864,7 @@ ELTWISE_UNARY_F32(elu)\n\ ELTWISE_UNARY_F32(neg)\n\ ELTWISE_UNARY_F32(mish)\n\ ELTWISE_UNARY_F32(hard_sigmoid)\n\ +ELTWISE_UNARY_F32(round)\n\ \n\ #define ELTWISE_UNARY_F32_2D(func_name) \\\n\ __kernel void func_name##_F32toF32_2D \\\n\ @@ -38486,6 +43893,7 @@ ELTWISE_UNARY_F32_2D(elu)\n\ ELTWISE_UNARY_F32_2D(neg)\n\ ELTWISE_UNARY_F32_2D(mish)\n\ ELTWISE_UNARY_F32_2D(hard_sigmoid)\n\ +ELTWISE_UNARY_F32_2D(round)\n\ \n\ #define ELTWISE_UNARY_U8(func_name) \\\n\ __kernel void func_name##_U8toU8 \\\n\ @@ -38516,6 +43924,7 @@ ELTWISE_UNARY_U8(elu)\n\ ELTWISE_UNARY_U8(neg)\n\ ELTWISE_UNARY_U8(mish)\n\ ELTWISE_UNARY_U8(hard_sigmoid)\n\ +ELTWISE_UNARY_U8(round)\n\ \n\ #define ELTWISE_UNARY_U8_2D(func_name) \\\n\ __kernel void func_name##_U8toU8_2D \\\n\ @@ -38546,7 +43955,7 @@ ELTWISE_UNARY_U8_2D(elu)\n\ ELTWISE_UNARY_U8_2D(neg)\n\ ELTWISE_UNARY_U8_2D(mish)\n\ ELTWISE_UNARY_U8_2D(hard_sigmoid)\n\ -\n\ +ELTWISE_UNARY_U8_2D(round)\n\ \n\ __kernel void neg_I32toI32\n\ (\n\ @@ -38587,6 +43996,121 @@ __kernel void neg_I32toI32_2D\n\ }\n\ "; /* end of eltwise_unary_cl*/ +static const char erf_cl[] = "#define MUL2_RSQRTPI (1.1283791670955126f)\n\ +float eltwise_unary_erf(float x)\n\ +{\n\ + float res = 0;\n\ + float tmp = x;\n\ + float factorial = 1;\n\ + float x_pow = x;\n\ + float one = 1.0f;\n\ + float n = 1;\n\ +\n\ + while (fabs(tmp) > 1e-5)\n\ + {\n\ + res += tmp;\n\ +\n\ + factorial *= n;\n\ + one *= -1;\n\ + x_pow *= x * x;\n\ + tmp = one / factorial * x_pow / ( 2 * n + 1);\n\ +\n\ + n += 1.0f;\n\ + }\n\ + return res * MUL2_RSQRTPI;\n\ +}\n\ +\n\ +#define ELTWISE_UNARY_F32(func_name) \\\n\ +__kernel void func_name##_F32toF32 \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + float inputScale, \\\n\ + float inputTail, \\\n\ + float outputScale, \\\n\ + float outputZP \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + float4 src = read_imagef(input, coord); \\\n\ + \\\n\ + float4 dst = 0; \\\n\ + dst.x = eltwise_unary_##func_name(src.x); \\\n\ + \\\n\ + write_imagef(output, coord, dst); \\\n\ +}\n\ +ELTWISE_UNARY_F32(erf)\n\ +\n\ +#define ELTWISE_UNARY_F32_2D(func_name) \\\n\ +__kernel void func_name##_F32toF32_2D \\\n\ + ( \\\n\ + __read_only image2d_t input, \\\n\ + __write_only image2d_t output, \\\n\ + float inputScale, \\\n\ + float inputTail, \\\n\ + float outputScale, \\\n\ + float outputZP \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + float4 src = read_imagef(input, coord); \\\n\ + \\\n\ + float4 dst = 0; \\\n\ + dst.x = eltwise_unary_##func_name(src.x); \\\n\ + \\\n\ + write_imagef(output, coord, dst); \\\n\ +}\n\ +ELTWISE_UNARY_F32_2D(erf)\n\ +\n\ +#define ELTWISE_UNARY_U8(func_name) \\\n\ +__kernel void func_name##_U8toU8 \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + float inputScale, \\\n\ + float inputTail, \\\n\ + float outputScale, \\\n\ + float outputZP \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + uint4 src = read_imageui(input, coord); \\\n\ + float4 data = convert_float4(src) * inputScale - inputTail; \\\n\ + \\\n\ + data.x = eltwise_unary_##func_name(data.x); \\\n\ + uint4 dst = convert_uint4(data * outputScale + outputZP); \\\n\ + \\\n\ + write_imageui(output, coord, dst); \\\n\ +}\n\ +ELTWISE_UNARY_U8(erf)\n\ +\n\ +#define ELTWISE_UNARY_U8_2D(func_name) 
\\\n\ +__kernel void func_name##_U8toU8_2D \\\n\ + ( \\\n\ + __read_only image2d_t input, \\\n\ + __write_only image2d_t output, \\\n\ + float inputScale, \\\n\ + float inputTail, \\\n\ + float outputScale, \\\n\ + float outputZP \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + uint4 src = read_imageui(input, coord); \\\n\ + float4 data = convert_float4(src) * inputScale - inputTail; \\\n\ + \\\n\ + data.x = eltwise_unary_##func_name(data.x); \\\n\ + uint4 dst = convert_uint4(data * outputScale + outputZP); \\\n\ + \\\n\ + write_imageui(output, coord, dst); \\\n\ +}\n\ +ELTWISE_UNARY_U8_2D(erf)\n\ +"; /* end of erf_cl*/ + static const char floordiv_cl[] = "__kernel void floordiv_F32F32toF32(\n\ __read_only image2d_array_t input,\n\ __read_only image2d_array_t input1,\n\ @@ -38639,6 +44163,44 @@ __kernel void floordiv_I32I32toI32_2D(\n\ write_imagei(output, coord, dst);\n\ }\n\ \n\ +__kernel void floordiv_I32I32toU8(\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputTail )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 src0;\n\ + int4 src1;\n\ + readImage2DArray(src0, input, coord);\n\ + readImage2DArray(src1, input1, coord);\n\ + uint4 dst = convert_uint4(floor(convert_float4(src0) / convert_float4(src1)) * outputScale + outputTail);\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void floordiv_I32I32toU8_2D(\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputTail )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int4 src0 = read_imagei(input, coord);\n\ + int4 src1 = read_imagei(input1, coord);\n\ + uint4 dst = convert_uint4(floor(convert_float4(src0) / convert_float4(src1)) * outputScale + outputTail);\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +\n\ __kernel void floordiv_U8U8toU8(\n\ __read_only image2d_array_t input,\n\ __read_only image2d_array_t input1,\n\ @@ -38683,6 +44245,52 @@ __kernel void floordiv_U8U8toU8_2D(\n\ uint4 dst = convert_uint4(out);\n\ write_imageui(output, coord, dst);\n\ }\n\ +\n\ +__kernel void floordiv_U8I32toU8(\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputTail )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + uint4 src0;\n\ + int4 src1;\n\ + float4 in0, in1, out;\n\ + readImage2DArray(src0, input, coord);\n\ + readImage2DArray(src1, input1, coord);\n\ + in0 = convert_float4(src0) * input0Scale + input0Tail;\n\ + in1 = convert_float4(src1);\n\ + out = floor(in0 / in1) * outputScale + outputTail;\n\ + uint4 dst = convert_uint4(out);\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void floordiv_U8I32toU8_2D(\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + float input0Scale,\n\ + float input0Tail,\n\ + float input1Scale,\n\ + float input1Tail,\n\ + float outputScale,\n\ + float outputTail )\n\ 
+{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + uint4 src0 = read_imageui(input, coord);\n\ + int4 src1 = read_imagei(input1, coord);\n\ + float4 in0, in1, out;\n\ + in0 = convert_float4(src0) * input0Scale + input0Tail;\n\ + in1 = convert_float4(src1);\n\ + out = floor(in0 / in1) * outputScale + outputTail;\n\ + uint4 dst = convert_uint4(out);\n\ + write_imageui(output, coord, dst);\n\ +}\n\ "; /* end of floordiv_cl*/ static const char gather_cl[] = "__kernel void gather_U8toU8(\n\ @@ -39036,6 +44644,825 @@ __kernel void gather_nd_F32toF32_3D(\n\ }\n\ "; /* end of gather_nd_3d_cl*/ +static const char group_normalization_f32_cl[] = "__kernel void group_norm_sumsqr_F32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int is2d,\n\ + float input_zp,\n\ + float input_scale,\n\ + int width,\n\ + int height\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidz = get_global_id(1);\n\ + int lidx = get_local_id(0);\n\ +\n\ + int4 coord = (int4)(gidx, 0, gidz, 0);\n\ + float4 data;\n\ + float sum = 0, sqr = 0;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + if(gidx < width)\n\ + {\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + data = read_imagef(input, coord);\n\ + coord.y++;\n\ + sum += data.x;\n\ + sqr += data.x * data.x;\n\ + }\n\ + }\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + sum = 0; sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 dst = (float4)(0);\n\ + dst.x = sum;\n\ + write_imagef(output, coord_out.xy, dst);\n\ + coord_out.x++;\n\ + dst.x = sqr;\n\ + write_imagef(output, coord_out.xy, dst);\n\ + }\n\ +}\n\ +\n\ +__kernel void group_norm_sumsqr_F32_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int is2d,\n\ + float input_zp,\n\ + float input_scale,\n\ + int width,\n\ + int height\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidz = get_global_id(1);\n\ + int lidx = get_local_id(0);\n\ +\n\ + int2 coord = (int2)(gidx, gidz);\n\ + float4 data;\n\ + float sum = 0, sqr = 0;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + if(gidx < width)\n\ + {\n\ + data = read_imagef(input, coord);\n\ + sum = data.x;\n\ + sqr = data.x * data.x;\n\ + }\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + sum = 0; sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 dst = (float4)(0);\n\ + dst.x = sum;\n\ + write_imagef(output, coord_out.xy, dst);\n\ + coord_out.x++;\n\ + dst.x = sqr;\n\ + write_imagef(output, coord_out.xy, dst);\n\ + }\n\ +}\n\ +\n\ +__kernel void group_norm_meanvari(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + float group_ratio,\n\ + int group_stride\n\ + )\n\ +{\n\ + int gidx = 
get_global_id(0);\n\ + int lidx = get_local_id(0);\n\ + int2 coord = (int2)(gidx, get_global_id(1));\n\ +\n\ + float2 sum_sqr = (float2)(0);\n\ + float4 mean_vari = (float4)(0);\n\ +\n\ + __local float2 lcl_data[16];\n\ + __local float2 lcl_sum[4];\n\ +\n\ + for(; coord.x < group_stride;)\n\ + {\n\ + mean_vari.x += read_imagef(input, coord).x;\n\ + coord.x++;\n\ + mean_vari.y += read_imagef(input, coord).x;\n\ + coord.x+=63;\n\ + }\n\ + lcl_data[lidx] = mean_vari.xy;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ + if(lidx < 4)\n\ + {\n\ + float2 tmpSum = (float2)(0);\n\ + for(int i = lidx; i < 16; i+=4)\n\ + {\n\ + tmpSum += lcl_data[i];\n\ + }\n\ + lcl_sum[lidx] = tmpSum;\n\ + }\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ + if(lidx == 0)\n\ + {\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum_sqr += lcl_sum[i];\n\ + }\n\ + mean_vari.xy = sum_sqr * group_ratio;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ +\n\ + coord.x = 0;\n\ + write_imagef(output, coord, mean_vari);\n\ + coord.x++;\n\ + float4 data;\n\ + data.x = mean_vari.y;\n\ + write_imagef(output, coord, data);\n\ + }\n\ +}\n\ +\n\ +__kernel void group_norm_F32toF32(\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __read_only image2d_t meanVari,\n\ + __write_only image2d_array_t output,\n\ + float eps,\n\ + int is2d,\n\ + float input_zp,\n\ + float input_scale,\n\ + float output_zp,\n\ + float output_scale,\n\ + float rSpaceOrg,\n\ + int width,\n\ + int height,\n\ + int pStride\n\ + )\n\ +{\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 1);\n\ +\n\ + float4 gamma = read_imagef(scale, coord_para.xy);\n\ + float4 beta = read_imagef(bias, coord_para.xy);\n\ + float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + mean_vari.y = read_imagef(meanVari, coord_para.wy).x;\n\ + float4 data = read_imagef(input, coord);\n\ +\n\ + float scale_vari, bias_val;\n\ +\n\ + scale_vari = gamma.s0 * mean_vari.s1;\n\ + bias_val = (beta.s0 - scale_vari * mean_vari.s0);\n\ +\n\ + float4 dst;\n\ +\n\ + dst.x = data.x * scale_vari + bias_val;\n\ + write_imagef(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void group_norm_F32toF32_2D(\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __read_only image2d_t meanVari,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int is2d,\n\ + float input_zp,\n\ + float input_scale,\n\ + float output_zp,\n\ + float output_scale,\n\ + float rSpaceOrg,\n\ + int width,\n\ + int height,\n\ + int pStride\n\ + )\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int2 coord = (int2)(get_global_id(0), gidz);\n\ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 1);\n\ +\n\ + float4 gamma = read_imagef(scale, coord_para.xy);\n\ + float4 beta = read_imagef(bias, coord_para.xy);\n\ + float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + mean_vari.y = read_imagef(meanVari, coord_para.wy).x;\n\ + float4 data = read_imagef(input, coord);\n\ +\n\ + float scale_vari, bias_val;\n\ +\n\ + scale_vari = gamma.s0 * mean_vari.s1;\n\ + bias_val = beta.s0 - scale_vari * mean_vari.s0;\n\ +\n\ + float4 dst;\n\ +\n\ + dst.x = data.x * scale_vari + bias_val;\n\ + write_imagef(output, coord, dst);\n\ +}\n\ +"; /* end of group_normalization_f32_cl*/ + +static 
const char group_normalization_i32_cl[] = "__kernel void group_norm_sumsqr_I32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int is2d,\n\ + float input_zp,\n\ + float input_scale,\n\ + int width,\n\ + int height\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidz = get_global_id(1);\n\ + int lidx = get_local_id(0);\n\ +\n\ + int4 coord = (int4)(gidx, 0, gidz, 0);\n\ + float4 data;\n\ + float sum = 0, sqr = 0;\n\ + float tmpSum = 0;\n\ + float e2InScale = input_scale * input_scale;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + if(gidx < width)\n\ + {\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + data = convert_float4(read_imagei(input, coord));\n\ + coord.y++;\n\ + tmpSum += data.x;\n\ + sqr += (data.x * data.x * e2InScale);\n\ + }\n\ + sum = tmpSum * input_scale;\n\ + }\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + sum = 0; sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 dst = (float4)(0);\n\ + dst.x = sum;\n\ + write_imagef(output, coord_out.xy, dst);\n\ + coord_out.x++;\n\ + dst.x = sqr;\n\ + write_imagef(output, coord_out.xy, dst);\n\ + }\n\ +}\n\ +\n\ +__kernel void group_norm_sumsqr_I32_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int is2d,\n\ + float input_zp,\n\ + float input_scale,\n\ + int width,\n\ + int height\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidz = get_global_id(1);\n\ + int lidx = get_local_id(0);\n\ +\n\ + int2 coord = (int2)(gidx, gidz);\n\ + float4 data;\n\ + float sum = 0, sqr = 0;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + if(gidx < width)\n\ + {\n\ + data = convert_float4(read_imagei(input, coord));\n\ + sum = data.x * input_scale;\n\ + sqr = sum * sum;\n\ + }\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + sum = 0; sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 dst = (float4)(0);\n\ + dst.x = sum;\n\ + write_imagef(output, coord_out.xy, dst);\n\ + coord_out.x++;\n\ + dst.x = sqr;\n\ + write_imagef(output, coord_out.xy, dst);\n\ + }\n\ +}\n\ +\n\ +__kernel void group_norm_I32toI32(\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __read_only image2d_t meanVari,\n\ + __write_only image2d_array_t output,\n\ + float eps,\n\ + int is2d,\n\ + float input_zp,\n\ + float input_scale,\n\ + float output_zp,\n\ + float output_scale,\n\ + float rSpaceOrg,\n\ + int width,\n\ + int height,\n\ + int pStride\n\ + )\n\ +{\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 
1);\n\ +\n\ + float4 gamma = read_imagef(scale, coord_para.xy);\n\ + float4 beta = read_imagef(bias, coord_para.xy);\n\ + float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + mean_vari.y = read_imagef(meanVari, coord_para.wy).x;\n\ + float4 data = convert_float4(read_imagei(input, coord));\n\ +\n\ + float scale_vari, bias_val;\n\ +\n\ + scale_vari = gamma.s0 * mean_vari.s1;\n\ + float alpha = input_scale * output_scale * scale_vari;\n\ + bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale;\n\ +\n\ + int4 dst;\n\ + float4 norm;\n\ + norm.x = data.x * alpha + bias_val;\n\ + dst = convert_int4_rte(norm);\n\ + write_imagei(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void group_norm_I32toI32_2D(\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __read_only image2d_t meanVari,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int is2d,\n\ + float input_zp,\n\ + float input_scale,\n\ + float output_zp,\n\ + float output_scale,\n\ + float rSpaceOrg,\n\ + int width,\n\ + int height,\n\ + int pStride\n\ + )\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int2 coord = (int2)(get_global_id(0), gidz);\n\ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 1);\n\ +\n\ + float4 gamma = read_imagef(scale, coord_para.xy);\n\ + float4 beta = read_imagef(bias, coord_para.xy);\n\ + float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + mean_vari.y = read_imagef(meanVari, coord_para.wy).x;\n\ + float4 data = convert_float4(read_imagei(input, coord));\n\ +\n\ + float scale_vari, bias_val;\n\ +\n\ + scale_vari = gamma.s0 * mean_vari.s1;\n\ + float alpha = input_scale * output_scale * scale_vari;\n\ + bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale;\n\ +\n\ + int4 dst;\n\ + float4 norm;\n\ + norm.x = data.x * alpha + bias_val;\n\ + dst = convert_int4_rte(norm);\n\ + write_imagei(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void group_norm_I32toF32(\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __read_only image2d_t meanVari,\n\ + __write_only image2d_array_t output,\n\ + float eps,\n\ + int is2d,\n\ + float input_zp,\n\ + float input_scale,\n\ + float output_zp,\n\ + float output_scale,\n\ + float rSpaceOrg,\n\ + int width,\n\ + int height,\n\ + int pStride\n\ + )\n\ +{\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 1);\n\ +\n\ + float4 gamma = read_imagef(scale, coord_para.xy);\n\ + float4 beta = read_imagef(bias, coord_para.xy);\n\ + float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + mean_vari.y = read_imagef(meanVari, coord_para.wy).x;\n\ + float4 data = convert_float4(read_imagei(input, coord));\n\ +\n\ + float scale_vari, bias_val;\n\ +\n\ + scale_vari = gamma.s0 * mean_vari.s1;\n\ + float alpha = input_scale * scale_vari;\n\ + bias_val = (beta.s0 - scale_vari * mean_vari.s0);\n\ +\n\ + float4 norm;\n\ + norm.x = data.x * alpha + bias_val;\n\ + write_imagef(output, coord, norm);\n\ +}\n\ +\n\ +__kernel void group_norm_I32toF32_2D(\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __read_only image2d_t meanVari,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int is2d,\n\ + float input_zp,\n\ + float input_scale,\n\ + float output_zp,\n\ + float output_scale,\n\ 
+ float rSpaceOrg,\n\ + int width,\n\ + int height,\n\ + int pStride\n\ + )\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int2 coord = (int2)(get_global_id(0), gidz);\n\ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 1);\n\ +\n\ + float4 gamma = read_imagef(scale, coord_para.xy);\n\ + float4 beta = read_imagef(bias, coord_para.xy);\n\ + float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + mean_vari.y = read_imagef(meanVari, coord_para.wy).x;\n\ + float4 data = convert_float4(read_imagei(input, coord));\n\ +\n\ + float scale_vari, bias_val;\n\ +\n\ + scale_vari = gamma.s0 * mean_vari.s1;\n\ + float alpha = input_scale * scale_vari;\n\ + bias_val = beta.s0 - scale_vari * mean_vari.s0;\n\ +\n\ + float4 norm;\n\ + norm.x = data.x * alpha + bias_val;\n\ + write_imagef(output, coord, norm);\n\ +}\n\ +"; /* end of group_normalization_i32_cl*/ + +static const char group_normalization_u8_cl[] = "__kernel void group_norm_sumsqr_U8(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int is2d,\n\ + float input_zp,\n\ + float input_scale,\n\ + int width,\n\ + int height\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidz = get_global_id(1);\n\ + int lidx = get_local_id(0);\n\ +\n\ + int4 coord = (int4)(gidx, 0, gidz, 0);\n\ + float4 data;\n\ + float sum = 0, sqr = 0;\n\ + float tmpSum = 0, tmpSqr = 0;\n\ + float e2InScale = input_scale * input_scale;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + if(gidx < width)\n\ + {\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + data = convert_float4(read_imageui(input, coord));\n\ + coord.y++;\n\ + tmpSum += data.x;\n\ + tmpSqr += data.x * data.x;\n\ + }\n\ + sqr = (tmpSqr - 2 * input_zp * tmpSum + height * input_zp * input_zp) * e2InScale;\n\ + sum = (tmpSum - height * input_zp) * input_scale;\n\ + }\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + sum = 0; sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 dst = (float4)(0);\n\ + dst.x = sum;\n\ + write_imagef(output, coord_out.xy, dst);\n\ + coord_out.x++;\n\ + dst.x = sqr;\n\ + write_imagef(output, coord_out.xy, dst);\n\ + }\n\ +}\n\ +\n\ +__kernel void group_norm_sumsqr_U8_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int is2d,\n\ + float input_zp,\n\ + float input_scale,\n\ + int width,\n\ + int height\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidz = get_global_id(1);\n\ + int lidx = get_local_id(0);\n\ +\n\ + int2 coord = (int2)(gidx, gidz);\n\ + float4 data;\n\ + float sum = 0, sqr = 0;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + if(gidx < width)\n\ + {\n\ + data = convert_float4(read_imageui(input, coord));\n\ + sum = (data.x - input_zp) * input_scale;\n\ + sqr = sum * sum;\n\ + }\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int4 coord_out = (int4)(get_group_id(0) << 2, gidz, 0, 0);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local 
float4*)lcl_sqr;\n\ +\n\ + sum = 0; sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 dst = (float4)(0);\n\ + dst.x = sum;\n\ + write_imagef(output, coord_out.xy, dst);\n\ + coord_out.x++;\n\ + dst.x = sqr;\n\ + write_imagef(output, coord_out.xy, dst);\n\ + }\n\ +}\n\ +\n\ +__kernel void group_norm_U8toU8(\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __read_only image2d_t meanVari,\n\ + __write_only image2d_array_t output,\n\ + float eps,\n\ + int is2d,\n\ + float input_zp,\n\ + float input_scale,\n\ + float output_zp,\n\ + float output_scale,\n\ + float rSpaceOrg,\n\ + int width,\n\ + int height,\n\ + int pStride\n\ + )\n\ +{\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 1);\n\ +\n\ + float4 gamma = read_imagef(scale, coord_para.xy);\n\ + float4 beta = read_imagef(bias, coord_para.xy);\n\ + float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + mean_vari.y = read_imagef(meanVari, coord_para.wy).x;\n\ + float4 data = convert_float4(read_imageui(input, coord));\n\ +\n\ + float scale_vari, bias_val;\n\ + float scale_inOut = input_scale * output_scale;\n\ +\n\ + scale_vari = gamma.s0 * mean_vari.s1;\n\ + float alpha = scale_inOut * scale_vari;\n\ + bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp;\n\ +\n\ + uint4 dst;\n\ + data.x -= input_zp;\n\ + float4 norm;\n\ + norm.x = data.x * alpha + bias_val;\n\ + dst = convert_uint4_rte(norm);\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void group_norm_U8toU8_2D(\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __read_only image2d_t meanVari,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int is2d,\n\ + float input_zp,\n\ + float input_scale,\n\ + float output_zp,\n\ + float output_scale,\n\ + float rSpaceOrg,\n\ + int width,\n\ + int height,\n\ + int pStride\n\ + )\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int2 coord = (int2)(get_global_id(0), gidz);\n\ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 1);\n\ +\n\ + float4 gamma = read_imagef(scale, coord_para.xy);\n\ + float4 beta = read_imagef(bias, coord_para.xy);\n\ + float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + mean_vari.y = read_imagef(meanVari, coord_para.wy).x;\n\ + float4 data = convert_float4(read_imageui(input, coord));\n\ +\n\ + float scale_vari, bias_val;\n\ + float scale_inOut = input_scale * output_scale;\n\ +\n\ + scale_vari = gamma.s0 * mean_vari.s1;\n\ + float alpha = scale_inOut * scale_vari;\n\ + bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp;\n\ +\n\ + uint4 dst;\n\ + data.x -= input_zp;\n\ + float4 norm;\n\ + norm.x = data.x * alpha + bias_val;\n\ + dst = convert_uint4_rte(norm);\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void group_norm_U8toF32(\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __read_only image2d_t meanVari,\n\ + __write_only image2d_array_t output,\n\ + float eps,\n\ + int is2d,\n\ + float input_zp,\n\ + float input_scale,\n\ + float output_zp,\n\ + float output_scale,\n\ + float rSpaceOrg,\n\ + int width,\n\ + int height,\n\ + int pStride\n\ + )\n\ 
+{\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 1);\n\ +\n\ + float4 gamma = read_imagef(scale, coord_para.xy);\n\ + float4 beta = read_imagef(bias, coord_para.xy);\n\ + float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + mean_vari.y = read_imagef(meanVari, coord_para.wy).x;\n\ + float4 data = convert_float4(read_imageui(input, coord));\n\ +\n\ + float scale_vari, bias_val;\n\ + float scale_inOut = input_scale * output_scale;\n\ +\n\ + scale_vari = gamma.s0 * mean_vari.s1;\n\ + float alpha = scale_inOut * scale_vari;\n\ + bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp;\n\ +\n\ + data.x -= input_zp;\n\ + float4 norm;\n\ + norm.x = data.x * alpha + bias_val;\n\ + write_imagef(output, coord, norm);\n\ +}\n\ +\n\ +__kernel void group_norm_U8toF32_2D(\n\ + __read_only image2d_t input,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __read_only image2d_t meanVari,\n\ + __write_only image2d_t output,\n\ + float eps,\n\ + int is2d,\n\ + float input_zp,\n\ + float input_scale,\n\ + float output_zp,\n\ + float output_scale,\n\ + float rSpaceOrg,\n\ + int width,\n\ + int height,\n\ + int pStride\n\ + )\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int2 coord = (int2)(get_global_id(0), gidz);\n\ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 1);\n\ +\n\ + float4 gamma = read_imagef(scale, coord_para.xy);\n\ + float4 beta = read_imagef(bias, coord_para.xy);\n\ + float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + mean_vari.y = read_imagef(meanVari, coord_para.wy).x;\n\ + float4 data = convert_float4(read_imageui(input, coord));\n\ +\n\ + float scale_vari, bias_val;\n\ + float scale_inOut = input_scale * output_scale;\n\ +\n\ + scale_vari = gamma.s0 * mean_vari.s1;\n\ + float alpha = scale_inOut * scale_vari;\n\ + bias_val = (beta.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp;\n\ +\n\ + data.x -= input_zp;\n\ + float4 norm;\n\ + norm.x = data.x * alpha + bias_val;\n\ + write_imagef(output, coord, norm);\n\ +}\n\ +"; /* end of group_normalization_u8_cl*/ + static const char grucell_activation_cl[] = "__kernel void grucell_activation(\n\ __read_only image2d_array_t input,\n\ __write_only image2d_array_t output)\n\ @@ -44329,7 +50756,7 @@ __kernel void gemm_transb_F32F32toF32_3D(\n\ \n\ coord_a.x = get_global_id(0);\n\ coord_a.z = get_global_id(2);\n\ - write_imagef(output, coord_b, sum);\n\ + write_imagef(output, coord_a, sum);\n\ }\n\ \n\ __kernel void gemm_transb_F32I8toF32_2D(\n\ @@ -44405,7 +50832,7 @@ __kernel void gemm_transb_F32I8toF32_3D(\n\ \n\ coord_a.x = get_global_id(0);\n\ coord_a.z = get_global_id(2);\n\ - write_imagef(output, coord_b, sum);\n\ + write_imagef(output, coord_a, sum);\n\ }\n\ "; /* end of matrixmul_cl*/ @@ -45510,6 +51937,138 @@ __kernel void moments_axis2_I32toF32(\n\ write_imagef(output_vari, coord_out, vari);\n\ }"; /* end of moments_axis2_cl*/ +static const char one_hot_cl[] = "__kernel void one_hot_F32toF32\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_array_t output,\n\ + int depth,\n\ + float on_value,\n\ + float off_value,\n\ + float inputScale,\n\ + float inputTail\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ +\n\ + float4 val = read_imagef(input, coord.xy);\n\ +\n\ + do\n\ + {\n\ + float4 dst;\n\ + dst.x = 
convert_int(val.x) == coord.z ? on_value : off_value;\n\ +\n\ + write_imagef(output, coord.xzyw, dst.xxxx);\n\ +\n\ + coord.z ++;\n\ + } while (coord.z < depth);\n\ +}\n\ +\n\ +__kernel void one_hot_I32toI32\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_array_t output,\n\ + int depth,\n\ + int on_value,\n\ + int off_value,\n\ + float inputScale,\n\ + float inputTail\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ +\n\ + int4 val = read_imagei(input, coord.xy);\n\ +\n\ + do\n\ + {\n\ + int4 dst;\n\ + dst.x = val.x == coord.z ? on_value : off_value;\n\ +\n\ + write_imagei(output, coord.xzyw, dst.xxxx);\n\ +\n\ + coord.z ++;\n\ + } while (coord.z < depth);\n\ +}\n\ +\n\ +__kernel void one_hot_I32toU8\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_array_t output,\n\ + int depth,\n\ + uint on_value,\n\ + uint off_value,\n\ + float inputScale,\n\ + float inputTail\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ +\n\ + int4 val = read_imagei(input, coord.xy);\n\ + do\n\ + {\n\ + uint4 dst;\n\ + dst.x = val.x == coord.z ? on_value : off_value;\n\ +\n\ + write_imageui(output, coord.xzyw, dst.xxxx);\n\ +\n\ + coord.z ++;\n\ + } while (coord.z < depth);\n\ +}\n\ +\n\ +__kernel void one_hot_I32toF32\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_array_t output,\n\ + int depth,\n\ + float on_value,\n\ + float off_value,\n\ + float inputScale,\n\ + float inputTail\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ +\n\ + int4 val = read_imagei(input, coord.xy);\n\ +\n\ + do\n\ + {\n\ + float4 dst;\n\ + dst.x = val.x == coord.z ? on_value : off_value;\n\ +\n\ + write_imagef(output, coord.xzyw, dst.xxxx);\n\ +\n\ + coord.z ++;\n\ + } while (coord.z < depth);\n\ +}\n\ +\n\ +__kernel void one_hot_U8toU8\n\ + (\n\ + __read_only image2d_t input,\n\ + __write_only image2d_array_t output,\n\ + int depth,\n\ + uint on_value,\n\ + uint off_value,\n\ + float inputScale,\n\ + float inputTail\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ +\n\ + uint4 src = read_imageui(input, coord.xy);\n\ +\n\ + int val = convert_int(convert_float(src.x) * inputScale - inputTail);\n\ +\n\ + do\n\ + {\n\ + uint4 dst;\n\ + dst.x = val == coord.z ? 
on_value : off_value;\n\ +\n\ + write_imageui(output, coord.xzyw, dst.xxxx);\n\ +\n\ + coord.z ++;\n\ + } while (coord.z < depth);\n\ +}\n\ +"; /* end of one_hot_cl*/ + static const char poolwithargmax_cl[] = "\n\ #define POOLWITHARGMAX_PROCESS(data_type, read_fun, write_fun0, write_fun1) \\\n\ data_type src = 0; \\\n\ @@ -47876,6 +54435,184 @@ __kernel void relu_keras_U8toF32_2D(\n\ write_imagef(output, coord, dst);\n\ }"; /* end of relu_keras_cl*/ +static const char repeat_cl[] = "__kernel void repeat_I32_axis0(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int width, int height, int channel, int axis)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0);\n\ + int4 coord_out = coord;\n\ +\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + int4 data = read_imagei(input0, coord);\n\ + int4 len = read_imagei(input1, coord.yw);\n\ + coord.y++;\n\ + for(int i = 0; i < len.x; i++)\n\ + {\n\ + write_imagei(output, coord_out, data);\n\ + coord_out.y++;\n\ + }\n\ + }\n\ +}\n\ +\n\ +__kernel void repeat_I32_axis1(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int width, int height, int channel, int axis)\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = coord;\n\ +\n\ + for(coord.x = 0; coord.x < width;)\n\ + {\n\ + int4 data = read_imagei(input0, coord);\n\ + int4 len = read_imagei(input1, coord.xw);\n\ + coord.x++;\n\ + for(int i = 0; i < len.x; i++)\n\ + {\n\ + write_imagei(output, coord_out, data);\n\ + coord_out.x++;\n\ + }\n\ + }\n\ +}\n\ +\n\ +__kernel void repeat_I32_axis2(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int width, int height, int channel, int axis)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_out = coord;\n\ +\n\ + for(coord.z = 0; coord.z < channel;)\n\ + {\n\ + int4 data = read_imagei(input0, coord);\n\ + int4 len = read_imagei(input1, coord.zw);\n\ + coord.z++;\n\ + for(int i = 0; i < len.x; i++)\n\ + {\n\ + write_imagei(output, coord_out, data);\n\ + coord_out.z++;\n\ + }\n\ + }\n\ +}\n\ +\n\ +__kernel void repeat_I32_1D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int width, int height, int channel, int axis)\n\ +{\n\ + int2 coord = (int2)(0, 0);\n\ + int2 coord_out = coord;\n\ +\n\ + for(coord.x = 0; coord.x < width;)\n\ + {\n\ + int4 data = read_imagei(input0, coord);\n\ + int4 len = read_imagei(input1, coord.xy);\n\ + coord.x++;\n\ + for(int i = 0; i < len.x; i++)\n\ + {\n\ + write_imagei(output, coord_out, data);\n\ + coord_out.x++;\n\ + }\n\ + }\n\ +}\n\ +\n\ +__kernel void repeat_F32_axis0(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int width, int height, int channel, int axis)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0);\n\ + int4 coord_out = coord;\n\ +\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + float4 data = read_imagef(input0, coord);\n\ + int4 len = read_imagei(input1, coord.yw);\n\ + coord.y++;\n\ + for(int i = 0; i < len.x; i++)\n\ + {\n\ + write_imagef(output, coord_out, data);\n\ + coord_out.y++;\n\ + }\n\ + }\n\ +}\n\ +\n\ +__kernel void repeat_F32_axis1(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ 
+ __write_only image2d_array_t output,\n\ + int width, int height, int channel, int axis)\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = coord;\n\ +\n\ + for(coord.x = 0; coord.x < width;)\n\ + {\n\ + float4 data = read_imagef(input0, coord);\n\ + int4 len = read_imagei(input1, coord.xw);\n\ + coord.x++;\n\ + for(int i = 0; i < len.x; i++)\n\ + {\n\ + write_imagef(output, coord_out, data);\n\ + coord_out.x++;\n\ + }\n\ + }\n\ +}\n\ +\n\ +__kernel void repeat_F32_axis2(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + int width, int height, int channel, int axis)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + int4 coord_out = coord;\n\ +\n\ + for(coord.z = 0; coord.z < channel;)\n\ + {\n\ + float4 data = read_imagef(input0, coord);\n\ + int4 len = read_imagei(input1, coord.zw);\n\ + coord.z++;\n\ + for(int i = 0; i < len.x; i++)\n\ + {\n\ + write_imagef(output, coord_out, data);\n\ + coord_out.z++;\n\ + }\n\ + }\n\ +}\n\ +\n\ +__kernel void repeat_F32_1D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int width, int height, int channel, int axis)\n\ +{\n\ + int2 coord = (int2)(0, 0);\n\ + int2 coord_out = coord;\n\ +\n\ + for(coord.x = 0; coord.x < width;)\n\ + {\n\ + float4 data = read_imagef(input0, coord);\n\ + int4 len = read_imagei(input1, coord.xy);\n\ + coord.x++;\n\ + for(int i = 0; i < len.x; i++)\n\ + {\n\ + write_imagef(output, coord_out, data);\n\ + coord_out.x++;\n\ + }\n\ + }\n\ +}\n\ +\n\ +"; /* end of repeat_cl*/ + static const char resize_1d_bilinear_cl[] = "__kernel void resize_1d_bilinear_F32toF32(\n\ __read_only image2d_array_t input,\n\ __write_only image2d_array_t output,\n\ @@ -48572,6 +55309,225 @@ __kernel void select_I8_F32_F32toF32_2D(\n\ }\n\ "; /* end of select_cl*/ +static const char sequence_mask_cl[] = "\n\ +__kernel void sequence_mask_I32toU8(\n\ + image2d_t input, image2d_array_t output, int maxLen,\n\ + float input_scale, float input_zpScale, float outputVal1, int output_ZP)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int4 coord = (int4)(gidx, get_global_id(1), get_global_id(2), 0);\n\ + int4 index = read_imagei(input, coord.yz);\n\ + uint4 data;\n\ + data.x = gidx < index.x ? convert_uint_rte(outputVal1) : (uint)(output_ZP);\n\ + write_imageui(output, coord, data);\n\ +}\n\ +\n\ +__kernel void sequence_mask_I32toU8_2D(\n\ + image2d_t input, image2d_t output, int maxLen,\n\ + float input_scale, float input_zpScale, float outputVal1, int output_ZP)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int2 coord = (int2)(gidx, get_global_id(1));\n\ + int4 index = read_imagei(input, coord.yy);\n\ + uint4 data;\n\ + data.x = gidx < index.x ? convert_uint_rte(outputVal1) : (uint)(output_ZP);\n\ + write_imageui(output, coord, data);\n\ +}\n\ +\n\ +__kernel void sequence_mask_I32toI32(\n\ + image2d_t input, image2d_array_t output, int maxLen,\n\ + float input_scale, float input_zpScale, float outputVal1, int output_ZP)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int4 coord = (int4)(gidx, get_global_id(1), get_global_id(2), 0);\n\ + int4 index = read_imagei(input, coord.yz);\n\ + int4 data;\n\ + data = gidx < index.x ? 
(int4)(1) : (int4)(0);\n\ + write_imagei(output, coord, data);\n\ +}\n\ +\n\ +__kernel void sequence_mask_I32toI32_2D(\n\ + image2d_t input, image2d_t output, int maxLen,\n\ + float input_scale, float input_zpScale, float outputVal1, int output_ZP)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int2 coord = (int2)(gidx, get_global_id(1));\n\ + int4 index = read_imagei(input, coord.yy);\n\ + int4 data;\n\ + data = gidx < index.x ? (int4)(1) : (int4)(0);\n\ + write_imagei(output, coord, data);\n\ +}\n\ +\n\ +__kernel void sequence_mask_I32toF32(\n\ + image2d_t input, image2d_array_t output, int maxLen,\n\ + float input_scale, float input_zpScale, float outputVal1, int output_ZP)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int4 coord = (int4)(gidx, get_global_id(1), get_global_id(2), 0);\n\ + int4 index = read_imagei(input, coord.yz);\n\ + float4 data;\n\ + data = gidx < index.x ? (float4)(1.0f) : (float4)(0.0f);\n\ + write_imagef(output, coord, data);\n\ +}\n\ +\n\ +__kernel void sequence_mask_I32toF32_2D(\n\ + image2d_t input, image2d_t output, int maxLen,\n\ + float input_scale, float input_zpScale, float outputVal1, int output_ZP)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int2 coord = (int2)(gidx, get_global_id(1));\n\ + int4 index = read_imagei(input, coord.yy);\n\ + float4 data;\n\ + data = gidx < index.x ? (float4)(1.0f) : (float4)(0.0f);\n\ + write_imagef(output, coord, data);\n\ +}"; /* end of sequence_mask_cl*/ + +static const char slice_cl[] = "__kernel void slice_F32_I32toF32\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputZP\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_in;\n\ + Image begin_img = create_image_from_image2d(input1, 4);\n\ + uchar* begin_ptr = begin_img.ptr;\n\ + int4 begin = ((int4 *)begin_ptr)[0];\n\ +\n\ + coord_in = coord + begin;\n\ + float4 src = read_imagef(input0, coord_in);\n\ +\n\ + write_imagef(output, coord, src);\n\ +}\n\ +\n\ +__kernel void slice_F32_I32toF32_2D\n\ + (\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputZP\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int2 coord_in;\n\ + Image begin_img = create_image_from_image2d(input1, 4);\n\ + uchar* begin_ptr = begin_img.ptr;\n\ + int2 begin = ((int2 *)begin_ptr)[0];\n\ +\n\ + coord_in = coord + begin;\n\ + float4 src = read_imagef(input0, coord_in);\n\ +\n\ + write_imagef(output, coord, src);\n\ +}\n\ +\n\ +__kernel void slice_U8_I32toU8\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputZP\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_in;\n\ + Image begin_img = create_image_from_image2d(input1, 4);\n\ + uchar* begin_ptr = begin_img.ptr;\n\ + int4 begin = ((int4 *)begin_ptr)[0];\n\ +\n\ + coord_in = coord + begin;\n\ + uint4 src = read_imageui(input0, coord_in);\n\ +\n\ + float4 data = convert_float4(src) * inputScale - inputTail;\n\ + uint4 dst = convert_uint4(data * outputScale + outputZP);\n\ +\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void 
slice_U8_I32toU8_2D\n\ + (\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputZP\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int2 coord_in;\n\ + Image begin_img = create_image_from_image2d(input1, 4);\n\ + uchar* begin_ptr = begin_img.ptr;\n\ + int2 begin = ((int2 *)begin_ptr)[0];\n\ +\n\ + coord_in = coord + begin;\n\ + uint4 src = read_imageui(input0, coord_in);\n\ +\n\ + float4 data = convert_float4(src) * inputScale - inputTail;\n\ + uint4 dst = convert_uint4(data * outputScale + outputZP);\n\ +\n\ + write_imageui(output, coord, dst);\n\ +}\n\ +\n\ +__kernel void slice_I32_I32toI32\n\ + (\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_array_t output,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputZP\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_in;\n\ + Image begin_img = create_image_from_image2d(input1, 4);\n\ + uchar* begin_ptr = begin_img.ptr;\n\ + int4 begin = ((int4 *)begin_ptr)[0];\n\ +\n\ + coord_in = coord + begin;\n\ + int4 src = read_imagei(input0, coord_in);\n\ +\n\ + write_imagei(output, coord, src);\n\ +}\n\ +\n\ +__kernel void slice_I32_I32toI32_2D\n\ + (\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputZP\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ + int2 coord_in;\n\ + Image begin_img = create_image_from_image2d(input1, 4);\n\ + uchar* begin_ptr = begin_img.ptr;\n\ + int2 begin = ((int2 *)begin_ptr)[0];\n\ +\n\ + coord_in = coord + begin;\n\ + int4 src = read_imagei(input0, coord_in);\n\ +\n\ + write_imagei(output, coord, src);\n\ +}\n\ +\n\ +"; /* end of slice_cl*/ + static const char space2depth_internal_cl[] = "\n\ __kernel void space2depth_internal_F32toF32 (\n\ image2d_array_t input,\n\ @@ -49102,11 +56058,14 @@ static const source_map_t evis_resource[] = {"argmin_axis1_vx", argmin_axis1_vx}, {"argmin_axis2_vx", argmin_axis2_vx}, {"batchnorm_single_vx", batchnorm_single_vx}, + {"batchnorm_single_f32_vx", batchnorm_single_f32_vx}, {"cast_vx", cast_vx}, {"clip_F16_vx", clip_F16_vx}, {"clip_I16_vx", clip_I16_vx}, {"clip_I8_vx", clip_I8_vx}, {"clip_U8_vx", clip_U8_vx}, + {"conv1d_ovxlib_vx", conv1d_ovxlib_vx}, + {"conv1d_ovxlib_k1024_vx", conv1d_ovxlib_k1024_vx}, {"depth2space_crd_vx", depth2space_crd_vx}, {"depthwise_conv1d_src0_vx", depthwise_conv1d_src0_vx}, {"depthwise_conv1d_src1_vx", depthwise_conv1d_src1_vx}, @@ -49115,8 +56074,10 @@ static const source_map_t evis_resource[] = {"detect_post_box_vx", detect_post_box_vx}, {"eltwise_unary_2d_vx", eltwise_unary_2d_vx}, {"eltwise_unary_3d_vx", eltwise_unary_3d_vx}, + {"erf_vx", erf_vx}, {"floordiv_vx", floordiv_vx}, {"gather_vx", gather_vx}, + {"gather_array_vx", gather_array_vx}, {"gather_mix_vx", gather_mix_vx}, {"gather_nd_vx", gather_nd_vx}, {"gather_nd_2d_vx", gather_nd_2d_vx}, @@ -49124,6 +56085,11 @@ static const source_map_t evis_resource[] = {"gather_nd_3d_vx", gather_nd_3d_vx}, {"gather_nd_3d_mix_vx", gather_nd_3d_mix_vx}, {"gather_nd_mix_vx", gather_nd_mix_vx}, + {"group_normalization_f16_vx", group_normalization_f16_vx}, + {"group_normalization_i16_vx", group_normalization_i16_vx}, + 
{"group_normalization_i8_vx", group_normalization_i8_vx}, + {"group_normalization_u8_vx", group_normalization_u8_vx}, + {"group_normalization_u8_f16_vx", group_normalization_u8_f16_vx}, {"grucell_activation_vx", grucell_activation_vx}, {"grucell_activation_sma_vx", grucell_activation_sma_vx}, {"grucell_cdnn_activation_vx", grucell_cdnn_activation_vx}, @@ -49132,12 +56098,19 @@ static const source_map_t evis_resource[] = {"instance_normalization_f16_vx", instance_normalization_f16_vx}, {"instance_normalization_i16_vx", instance_normalization_i16_vx}, {"instance_normalization_i8_vx", instance_normalization_i8_vx}, + {"instance_normalization_scale_f32_vx", instance_normalization_scale_f32_vx}, + {"instance_normalization_scale_f32_bf16_vx", instance_normalization_scale_f32_bf16_vx}, + {"instance_normalization_scale_f32_f16_vx", instance_normalization_scale_f32_f16_vx}, {"instance_normalization_u8_vx", instance_normalization_u8_vx}, + {"instance_normalization_u8_f16_vx", instance_normalization_u8_f16_vx}, {"l2normalizescale_axis0_vx", l2normalizescale_axis0_vx}, {"l2normalizescale_axis1_vx", l2normalizescale_axis1_vx}, {"layer_normalization_vx", layer_normalization_vx}, {"layer_normalization_2d_vx", layer_normalization_2d_vx}, {"layer_normalization_i16_vx", layer_normalization_i16_vx}, + {"layer_normalization_scale_f32_vx", layer_normalization_scale_f32_vx}, + {"layer_normalization_scale_f32_2d_vx", layer_normalization_scale_f32_2d_vx}, + {"layer_normalization_scale_f32_bf16_vx", layer_normalization_scale_f32_bf16_vx}, {"layer_normalization_u8_f16_vx", layer_normalization_u8_f16_vx}, {"layer_normalization_wh_f16_vx", layer_normalization_wh_f16_vx}, {"layer_normalization_wh_i16_vx", layer_normalization_wh_i16_vx}, @@ -49194,6 +56167,7 @@ static const source_map_t evis_resource[] = {"moments_axis012_vx", moments_axis012_vx}, {"moments_axis1_vx", moments_axis1_vx}, {"moments_axis2_vx", moments_axis2_vx}, + {"one_hot_vx", one_hot_vx}, {"poolwithargmax_F16_vx", poolwithargmax_F16_vx}, {"poolwithargmax_I16_vx", poolwithargmax_I16_vx}, {"poolwithargmax_I8_vx", poolwithargmax_I8_vx}, @@ -49241,6 +56215,8 @@ static const source_map_t evis_resource[] = {"relational_ops_2d_vx", relational_ops_2d_vx}, {"relational_ops_3d_vx", relational_ops_3d_vx}, {"relu_keras_vx", relu_keras_vx}, + {"repeat_vx", repeat_vx}, + {"repeat_axis1_vx", repeat_axis1_vx}, {"resize_1d_bilinear_BF16_vx", resize_1d_bilinear_BF16_vx}, {"resize_1d_bilinear_DOWN_NX_vx", resize_1d_bilinear_DOWN_NX_vx}, {"resize_1d_bilinear_F16_vx", resize_1d_bilinear_F16_vx}, @@ -49255,12 +56231,14 @@ static const source_map_t evis_resource[] = {"resize_bilinear_I16_vx", resize_bilinear_I16_vx}, {"resize_bilinear_I8_vx", resize_bilinear_I8_vx}, {"resize_bilinear_U8_vx", resize_bilinear_U8_vx}, - {"resize_bilinear_U8_UP_2X_vx", resize_bilinear_U8_UP_2X_vx}, + {"resize_bilinear_U8_half_pixel_centers_vx", resize_bilinear_U8_half_pixel_centers_vx}, {"resize_bilinear_U8_opt_vx", resize_bilinear_U8_opt_vx}, {"resize_nearest_vx", resize_nearest_vx}, {"scatter_nd_vx", scatter_nd_vx}, {"scatter_nd_big_vx", scatter_nd_big_vx}, {"select_vx", select_vx}, + {"sequence_mask_vx", sequence_mask_vx}, + {"slice_vx", slice_vx}, {"space2depth_internal_vx", space2depth_internal_vx}, {"swish_vx", swish_vx}, {"tile_vx", tile_vx}, @@ -49271,11 +56249,9 @@ static const source_map_t evis_resource[] = {"upsample_U8_vx", upsample_U8_vx}, {"upsamplescale_vx", upsamplescale_vx}, {"upsamplescale_k2_vx", upsamplescale_k2_vx}, - {"vsi_nn_kernel_axis_aligned_bbox_transform_vx", 
vsi_nn_kernel_axis_aligned_bbox_transform_vx}, {"vsi_nn_kernel_box_with_nms_limit_vx", vsi_nn_kernel_box_with_nms_limit_vx}, {"vsi_nn_kernel_detection_postprocess_vx", vsi_nn_kernel_detection_postprocess_vx}, {"vsi_nn_kernel_extra_ending_vx", vsi_nn_kernel_extra_ending_vx}, - {"vsi_nn_kernel_generate_proposals_vx", vsi_nn_kernel_generate_proposals_vx}, {"vsi_nn_kernel_header_vx", vsi_nn_kernel_header_vx}, {"vsi_nn_kernel_heatmap_max_keypoint_vx", vsi_nn_kernel_heatmap_max_keypoint_vx}, {"vsi_nn_kernel_imageprocess_vx", vsi_nn_kernel_imageprocess_vx}, @@ -49286,7 +56262,6 @@ static const source_map_t evis_resource[] = {"vsi_nn_kernel_roi_align_vx", vsi_nn_kernel_roi_align_vx}, {"vsi_nn_kernel_signalframe_vx", vsi_nn_kernel_signalframe_vx}, {"vsi_nn_kernel_tensorstackconcat_vx", vsi_nn_kernel_tensorstackconcat_vx}, - {"vsi_nn_kernel_topk_vx", vsi_nn_kernel_topk_vx}, {"vsi_nn_kernel_transform_gemm_vx", vsi_nn_kernel_transform_gemm_vx}, {"vsi_nn_kernel_transform_interp_vx", vsi_nn_kernel_transform_interp_vx}, {"vsi_nn_kernel_transform_setupThres_vx", vsi_nn_kernel_transform_setupThres_vx}, @@ -49308,10 +56283,14 @@ static const source_map_t cl_resource[] = {"detect_post_box_cl", detect_post_box_cl}, {"eltwise_ops_helper_cl", eltwise_ops_helper_cl}, {"eltwise_unary_cl", eltwise_unary_cl}, + {"erf_cl", erf_cl}, {"floordiv_cl", floordiv_cl}, {"gather_cl", gather_cl}, {"gather_nd_cl", gather_nd_cl}, {"gather_nd_3d_cl", gather_nd_3d_cl}, + {"group_normalization_f32_cl", group_normalization_f32_cl}, + {"group_normalization_i32_cl", group_normalization_i32_cl}, + {"group_normalization_u8_cl", group_normalization_u8_cl}, {"grucell_activation_cl", grucell_activation_cl}, {"grucell_activation_sma_cl", grucell_activation_sma_cl}, {"hswish_cl", hswish_cl}, @@ -49358,6 +56337,7 @@ static const source_map_t cl_resource[] = {"moments_axis012_cl", moments_axis012_cl}, {"moments_axis1_cl", moments_axis1_cl}, {"moments_axis2_cl", moments_axis2_cl}, + {"one_hot_cl", one_hot_cl}, {"poolwithargmax_cl", poolwithargmax_cl}, {"pow_cl", pow_cl}, {"prelu_cl", prelu_cl}, @@ -49379,6 +56359,7 @@ static const source_map_t cl_resource[] = {"reduceprod_internal_axis2_cl", reduceprod_internal_axis2_cl}, {"relational_ops_cl", relational_ops_cl}, {"relu_keras_cl", relu_keras_cl}, + {"repeat_cl", repeat_cl}, {"resize_1d_bilinear_cl", resize_1d_bilinear_cl}, {"resize_1d_nearest_cl", resize_1d_nearest_cl}, {"resize_bilinear_cl", resize_bilinear_cl}, @@ -49386,6 +56367,8 @@ static const source_map_t cl_resource[] = {"roi_align_cl", roi_align_cl}, {"scatter_nd_cl", scatter_nd_cl}, {"select_cl", select_cl}, + {"sequence_mask_cl", sequence_mask_cl}, + {"slice_cl", slice_cl}, {"space2depth_internal_cl", space2depth_internal_cl}, {"swish_cl", swish_cl}, {"tile_cl", tile_cl}, diff --git a/src/tim/vx/internal/src/client/vsi_nn_vxkernel.c b/src/tim/vx/internal/src/libnnext/vsi_nn_vxkernel.c similarity index 98% rename from src/tim/vx/internal/src/client/vsi_nn_vxkernel.c rename to src/tim/vx/internal/src/libnnext/vsi_nn_vxkernel.c index 60c05b6..f79b691 100644 --- a/src/tim/vx/internal/src/client/vsi_nn_vxkernel.c +++ b/src/tim/vx/internal/src/libnnext/vsi_nn_vxkernel.c @@ -30,7 +30,7 @@ #include "vsi_nn_platform.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel.h" #include "libnnext/vsi_nn_libnnext_resource.h" #if VSI_USE_VXC_BINARY @@ -308,7 +308,8 @@ static vsi_status vsi_nn_RegisterBinKernel context = graph->ctx; evis = 
context->config.evis.ver; - program_ptr = vsi_nn_VxBinResourceGetResource(kernel_info->resource_name[kernel_info->resource_num - 1], &program_len); + program_ptr = vsi_nn_VxBinResourceGetResource( + kernel_info->resource_name[kernel_info->resource_num - 1], &program_len); program = vxCreateProgramWithBinary(ctx, (const vx_uint8 *)program_ptr, program_len); status = vxGetStatus((vx_reference)program); diff --git a/src/tim/vx/internal/src/makefile.linux b/src/tim/vx/internal/src/makefile.linux index 799a920..45e11b8 100644 --- a/src/tim/vx/internal/src/makefile.linux +++ b/src/tim/vx/internal/src/makefile.linux @@ -60,9 +60,6 @@ OBJECTS = $(OBJ_DIR)/vsi_nn_context.o \ $(OBJ_DIR)/vsi_nn_graph_optimization.o \ $(OBJ_DIR)/vsi_nn_pre_post_process.o -vpath %.c client -OBJECTS += $(OBJ_DIR)/vsi_nn_vxkernel.o - vpath %.c utils OBJECTS += $(OBJ_DIR)/vsi_nn_code_generator.o \ $(OBJ_DIR)/vsi_nn_binary_tree.o \ @@ -92,7 +89,8 @@ OBJECTS += $(OBJ_DIR)/vsi_nn_post_fasterrcnn.o \ $(OBJ_DIR)/vsi_nn_post_cmupose.o vpath %.c libnnext -OBJECTS += $(OBJ_DIR)/vsi_nn_libnnext_resource.o +OBJECTS += $(OBJ_DIR)/vsi_nn_libnnext_resource.o \ + $(OBJ_DIR)/vsi_nn_vxkernel.o vpath %.c libnnext/ops/kernel SRCS += ${notdir ${wildcard libnnext/ops/kernel/*.c}} @@ -118,8 +116,14 @@ SRCS += ${notdir ${wildcard kernel/vx/*.c}} vpath %.c custom/ops SRCS += ${notdir ${wildcard custom/ops/*.c}} -vpath %.c custom/ops/kernel -SRCS += ${notdir ${wildcard custom/ops/kernel/*.c}} +vpath %.c custom/ops/kernel/evis +SRCS += ${notdir ${wildcard custom/ops/kernel/evis/*.c}} + +vpath %.c custom/ops/kernel/cl +SRCS += ${notdir ${wildcard custom/ops/kernel/cl/*.c}} + +vpath %.c custom/ops/kernel/cpu +SRCS += ${notdir ${wildcard custom/ops/kernel/cpu/*.c}} OBJECTS += ${patsubst %.c, $(OBJ_DIR)/%.o, $(SRCS)} diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_argmaxmin.c b/src/tim/vx/internal/src/ops/vsi_nn_op_argmaxmin.c index 0a260d2..c96a2c8 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_argmaxmin.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_argmaxmin.c @@ -217,6 +217,7 @@ static vsi_bool op_check IO_TYPE(D_I16|Q_DFP, D_U8) IO_TYPE(D_I16|Q_DFP, D_I16) IO_TYPE(D_F32, D_I32) + IO_TYPE(D_F32, D_I16) IO_TYPE(D_F16, D_I32) IO_TYPE(D_I32, D_I32) IO_TYPE(D_I8|Q_DFP, D_I32) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_axis_aligned_bbox_transform.c b/src/tim/vx/internal/src/ops/vsi_nn_op_axis_aligned_bbox_transform.c index 83b3664..2324875 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_axis_aligned_bbox_transform.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_axis_aligned_bbox_transform.c @@ -34,159 +34,11 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "kernel/vsi_nn_kernel.h" -#define _ARG_NUM (0) #define _INPUT_NUM (4) #define _OUTPUT_NUM (1) #define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) -#define _PARAM_NUM (_ARG_NUM + _IO_NUM) - -extern vx_kernel_description_t * vx_kernel_AXIS_ALIGNED_BBOX_TRANSFORM_list[]; - -static void _set_inputs_outputs - ( - vx_reference * params, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - uint32_t i; - uint32_t cnt; - - /* Set inputs */ - cnt = 0; - for( i = 0; i < _INPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)inputs[i]->t; - } - - /* Set outputs */ - for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)outputs[i]->t; - } -} /* _set_inputs_outputs() */ - -static vsi_status _create_params - ( - vsi_nn_node_t * node, - vx_reference * params, - uint32_t num - ) -{ - 
vsi_status status = VSI_SUCCESS; -#if 0 - vx_context ctx; - vsi_nn_axis_aligned_bbox_transform_param * p; - if( 0 == num ) - { - return VSI_SUCCESS; - } - memset( params, 0, sizeof( vx_reference * ) * num ); - p = &(node->nn_param.axis_aligned_bbox_transform); - ctx = vxGetContext( (vx_reference)node->graph->g ); - /* Init parameters */ - #define _SET_PARAM( i, type, arg ) do{ \ - params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \ - status = vxGetStatus( params[i] ); \ - if( VSI_SUCCESS != status ) { \ - goto set_param_error; \ - } \ - } while(0) - _SET_PARAM( 0, VX_TYPE_INT32, type ); - #undef _SET_PARAM -set_param_error: -#endif - return status; -} /* _create_params */ - -static void _release_params - ( - vx_reference * params, - uint32_t num - ) -{ - uint32_t i; - vx_scalar scalar; - for( i = 0; i < num; i ++ ) - { - scalar = (vx_scalar)params[i]; - vxReleaseScalar( &scalar ); - } -} /* _release_params() */ - -static vsi_status cpu_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_reference * args; - - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( params, inputs, outputs ); - - /* Init parameters. */ - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. */ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); - - return status; -} - -static vsi_status vx_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_reference * args; - - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( params, inputs, outputs ); - /*TODO: Add code if need to change your parameter*/ - - /* Init parameters. */ - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. 
*/ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); - - return status; -} - -static vsi_nn_op_compute_t op_compute_list[] = -{ - cpu_op_compute, - vx_op_compute, - NULL -}; static vsi_status op_compute ( @@ -195,46 +47,18 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status; - vsi_nn_kernel_info_t kernel_info; - char *path = NULL; + vsi_status status = VSI_FAILURE; - memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); - status = VSI_FAILURE; - kernel_info.type = VX_KERNEL_TYPE_CPU; - kernel_info.kernel = vx_kernel_AXIS_ALIGNED_BBOX_TRANSFORM_list; - kernel_info.resource_num = 1; - kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); - kernel_info.resource_name[0] = "vsi_nn_kernel_axis_aligned_bbox_transform"; - path = getenv("USER_VX_SOURCE_PATH"); - if(path) - vsi_nn_VxResourceSetPath(path); + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + "axis_aligned_bbox_transform", + inputs, _INPUT_NUM, + outputs, _OUTPUT_NUM, NULL ); - if( kernel_info.type == VX_KERNEL_TYPE_VX) + if ( self->n ) { - kernel_info.kernel_index = 1; - kernel_info.init_index = 1; - } - else /*kernel_info.type = VX_KERNEL_TYPE_CPU;*/ - { - kernel_info.kernel_index = 0; - kernel_info.init_index = 0; + status = VSI_SUCCESS; } - self->n = vsi_nn_RegisterClientKernelAndNewNode( - self->graph, &kernel_info); - if (kernel_info.resource_name) - { - free(kernel_info.resource_name); - } - if( NULL == self->n ) - { - return VSI_FAILURE; - } - if (NULL != op_compute_list[kernel_info.init_index]) - { - status = op_compute_list[kernel_info.init_index](self, inputs, outputs); - } return status; } /* op_compute() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c index 64b94ee..ed63df6 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c @@ -33,6 +33,8 @@ #include "vsi_nn_log.h" #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_constraint_check.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" static vsi_status _try_set_high_presision_tensor ( @@ -107,7 +109,21 @@ static vsi_bool _is_3d_batchnorm } } -static vsi_status op_compute +static vsi_bool _is_dynamic_batchnorm + ( + vsi_nn_tensor_t ** inputs + ) +{ + uint32_t i = 0; + for (i = 1; i < 5 ; i++) { + if (FALSE == inputs[i]->attr.is_const) { + return TRUE; + } + } + return FALSE; +} + +static vsi_status _static_batchnorm ( vsi_nn_node_t * self, vsi_nn_tensor_t ** inputs, @@ -150,6 +166,115 @@ static vsi_status op_compute status = VSI_FAILURE; } return status; +} + +static vsi_status _dynamic_batchnorm + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; + int32_t shapes[4][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; + int32_t* shapes_ptr[4] = {NULL}; + int32_t *shapes_in[3] = {NULL}; + size_t rank_in[3] = {0}; + uint32_t new_rank = 0; + vsi_nn_tensor_t* reshape_tensors[6] = { NULL }; + vsi_bool ret = TRUE; + uint32_t i = 0; + + param = vsi_nn_kernel_param_create(); + vsi_nn_kernel_param_add_float32( param, "eps", self->nn_param.batch_norm.eps ); + + rank_in[0] = (size_t)inputs[0]->attr.dim_num; + rank_in[1] = (size_t)inputs[1]->attr.dim_num; + rank_in[2] = (size_t)inputs[3]->attr.dim_num; + shapes_in[0] = (int32_t *)inputs[0]->attr.size; + shapes_in[1] = (int32_t 
*)inputs[1]->attr.size; + shapes_in[2] = (int32_t *)inputs[3]->attr.size; + for (i = 0; i < 4; i++) + { + shapes_ptr[i] = shapes[i]; + } + + ret = vsi_nn_kernel_optimize_broadcast_shape( + (const int32_t**)shapes_in, (const size_t*)rank_in, 3, + (int32_t *)outputs[0]->attr.size, outputs[0]->attr.dim_num, + shapes_ptr, shapes[3], &new_rank); + + if( ret ) + { + reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, + inputs[0], (uint32_t*)shapes[0], new_rank ); + reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, + inputs[1], (uint32_t*)shapes[1], new_rank ); + reshape_tensors[2] = vsi_nn_reshape_tensor( self->graph, + inputs[2], (uint32_t*)shapes[1], new_rank ); + reshape_tensors[3] = vsi_nn_reshape_tensor( self->graph, + inputs[3], (uint32_t*)shapes[2], new_rank ); + reshape_tensors[4] = vsi_nn_reshape_tensor( self->graph, + inputs[4], (uint32_t*)shapes[2], new_rank ); + + reshape_tensors[5] = vsi_nn_reshape_tensor( self->graph, + outputs[0], (uint32_t*)shapes[3], new_rank ); + } + else + { + reshape_tensors[0] = inputs[0]; + reshape_tensors[1] = inputs[1]; + reshape_tensors[2] = inputs[2]; + reshape_tensors[3] = inputs[3]; + reshape_tensors[4] = inputs[4]; + + reshape_tensors[5] = outputs[0]; + } + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + "batchnorm_single", + reshape_tensors, 5, + &reshape_tensors[5], 1, param ); + + if( self->n ) + { + status = VSI_SUCCESS; + } + + if (ret) + { + for ( i = 0; i < 6; i++) + { + if (reshape_tensors[i]) + { + vsi_nn_ReleaseTensor( &reshape_tensors[i] ); + } + } + } + + vsi_nn_kernel_param_release( ¶m ); + + return status; +} + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status; + if (_is_dynamic_batchnorm(inputs)) + { + status = _dynamic_batchnorm(self, inputs, outputs); + } + else + { + status = _static_batchnorm(self, inputs, outputs); + } + return status; } /* op_compute() */ static vsi_status op_optimize @@ -204,7 +329,62 @@ static vsi_status op_optimize return VSI_SUCCESS; } /* op_optimize() */ -static vsi_bool op_check +static vsi_bool _dynamic_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check tensor shapes. 
*/ + uint32_t i = 0; + uint32_t j = 0; + uint32_t rank = inputs[0]->attr.dim_num; + + /* check inputs outputs data type */ + BEGIN_IO_TYPE_DECL(BATCHNORM_SINGLE, 5, 1) + IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16, D_F16, D_F32, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16, D_F16, D_F32, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16, D_F16, D_F32, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16, D_F16, D_F32, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16, D_F16, D_F32, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16, D_F16, D_F32, D_F16) + IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32, D_F32) + IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F32, D_F32, D_U8|Q_ASYM) + END_IO_TYPE_DECL(BATCHNORM_SINGLE) + if(!VALIDATE_OP_IO_TYPES(BATCHNORM_SINGLE, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + for(i = 0; i < rank; i++) + { + vx_int32 shape0 = inputs[0]->attr.size[i]; + + for ( j = 1; j < self->input.num; j++) + { + uint32_t rank1 = inputs[j]->attr.dim_num; + vx_int32 shape1 = rank1 > i ? inputs[j]->attr.size[i] : 1; + + if(shape0 != shape1 && shape1 != 1) + { + VSILOGE("Invalid broadcast for inputs[%d] size[%u]", j, shape1); + return FALSE; + } + } + } + return TRUE; +} + +static vsi_bool _static_check ( vsi_nn_node_t * self, vsi_nn_tensor_t ** inputs, @@ -240,8 +420,26 @@ static vsi_bool op_check return FALSE; } return TRUE; +} + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + if (_is_dynamic_batchnorm(inputs)) + { + return _dynamic_check(self, inputs, outputs); + } + else + { + return _static_check(self, inputs, outputs); + } } /* op_check() */ + static vsi_bool op_setup ( vsi_nn_node_t * self, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_batchnorm_single.c b/src/tim/vx/internal/src/ops/vsi_nn_op_batchnorm_single.c index a945e61..a773a5b 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_batchnorm_single.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_batchnorm_single.c @@ -148,18 +148,28 @@ static vsi_bool op_check /* check inputs outputs data type */ BEGIN_IO_TYPE_DECL(BATCHNORM_SINGLE, 5, 1) - IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_I16|Q_DFP) - IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_I8|Q_DFP) - IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_F16) + IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_F16) IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16, D_F16, D_F32, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16, D_F16, D_F32, D_F16) - IO_TYPE(D_I8|Q_DFP, D_F16, D_F16, D_F16, D_F32, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16, D_F16, D_F16, D_F32, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16, D_F16, D_F32, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16, D_F16, D_F32, D_F16) IO_TYPE(D_I16|Q_DFP, D_F16, D_F16, D_F16, D_F32, D_I16|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_F16, D_F16, D_F16, D_F32, D_F16) - IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32, 
D_F32) - IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_F16) + IO_TYPE(D_F16, D_F16, D_F16, D_F32, D_F32, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F16, D_F16, D_F32, D_F32, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F16, D_F16, D_F32, D_F32, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16, D_F16, D_F32, D_F32, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16, D_F32, D_F32, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16, D_F32, D_F32, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16, D_F32, D_F32, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16, D_F32, D_F32, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16, D_F32, D_F32, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16, D_F32, D_F32, D_F16) + IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32, D_F32) + IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_F16) IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F32, D_F32, D_U8|Q_ASYM) END_IO_TYPE_DECL(BATCHNORM_SINGLE) if(!VALIDATE_OP_IO_TYPES(BATCHNORM_SINGLE, self, inputs, self->input.num, outputs, self->output.num)) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_lstm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_lstm.c index b490e0e..fdc508b 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_lstm.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_lstm.c @@ -35,7 +35,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "vsi_nn_internal_node.h" #include "vsi_nn_rnn_helper.h" @@ -380,20 +380,22 @@ static vsi_bool op_setup vsi_nn_tensor_t* lstmcell_out2 = NULL; /* lstmcell output */ + /* if merge_outputs is true, there will be only 1 output, so use the attr + of the fw for the bw, since they are always same as each other.*/ vsi_nn_internal_init_tensor_attr(&attr, - &outputs[BI_LSTM_BW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); + &outputs[BI_LSTM_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); lstmcell_out0 = output_tensor->t; /* lstmcell output h_state */ vsi_nn_internal_init_tensor_attr(&attr, - &outputs[BI_LSTM_BW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); + &outputs[BI_LSTM_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); lstmcell_out1 = output_tensor->t; /* lstmcell output c_state */ vsi_nn_internal_init_tensor_attr(&attr, - &outputs[BI_LSTM_BW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); + &outputs[BI_LSTM_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); lstmcell_out2 = output_tensor->t; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_rnn.c b/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_rnn.c index 710acf9..cb85606 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_rnn.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_rnn.c @@ -35,7 +35,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "vsi_nn_internal_node.h" #include "vsi_nn_rnn_helper.h" diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_box_with_nms_limit.c b/src/tim/vx/internal/src/ops/vsi_nn_op_box_with_nms_limit.c index fb6b0e1..d376212 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_box_with_nms_limit.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_box_with_nms_limit.c @@ -34,7 +34,7 
@@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #define _ARG_NUM (6) #define _INPUT_NUM (3) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_cast.c b/src/tim/vx/internal/src/ops/vsi_nn_op_cast.c index 25a8787..34eb9cf 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_cast.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_cast.c @@ -128,56 +128,77 @@ static vsi_bool op_check ) { BEGIN_IO_TYPE_DECL(CAST, 1, 1) - IO_TYPE(D_F32, D_F32) - IO_TYPE(D_F32, D_I32) - IO_TYPE(D_F32, D_U32) - IO_TYPE(D_F32, D_F16) - IO_TYPE(D_F32, D_BOOL8) - IO_TYPE(D_I32, D_F32) - IO_TYPE(D_I32, D_I32) - IO_TYPE(D_I32, D_U32) - IO_TYPE(D_I32, D_F16) - IO_TYPE(D_I32, D_BOOL8) - IO_TYPE(D_U32, D_F32) - IO_TYPE(D_U32, D_I32) - IO_TYPE(D_U32, D_U32) - IO_TYPE(D_U32, D_BOOL8) - IO_TYPE(D_F16, D_I16|Q_DFP) - IO_TYPE(D_F16, D_I8|Q_DFP) - IO_TYPE(D_F16, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_BOOL8) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_I32) + IO_TYPE(D_F32, D_U32) + IO_TYPE(D_F32, D_F16) + IO_TYPE(D_F32, D_BOOL8) + IO_TYPE(D_I32, D_F32) + IO_TYPE(D_I32, D_I32) + IO_TYPE(D_I32, D_U32) + IO_TYPE(D_I32, D_F16) + IO_TYPE(D_I32, D_BOOL8) + IO_TYPE(D_BOOL8, D_F32) + IO_TYPE(D_BOOL8, D_I32) + IO_TYPE(D_BOOL8, D_U32) + IO_TYPE(D_U32, D_F32) + IO_TYPE(D_U32, D_I32) + IO_TYPE(D_U32, D_U32) + IO_TYPE(D_U32, D_BOOL8) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_BOOL8) IO_TYPE(D_I16|Q_DFP, D_F16) IO_TYPE(D_I16|Q_DFP, D_I8|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_I16, D_BOOL8) - IO_TYPE(D_I8|Q_DFP, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_I8, D_BOOL8) + IO_TYPE(D_I16|Q_DFP, D_BOOL8) + IO_TYPE(D_I16, D_F16) + IO_TYPE(D_I16, D_I8|Q_DFP) + IO_TYPE(D_I16, D_U8|Q_ASYM) + IO_TYPE(D_I16, D_BOOL8) + IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_BOOL8) + IO_TYPE(D_I8, D_F16) + IO_TYPE(D_I8, D_I16|Q_DFP) + IO_TYPE(D_I8, D_U8|Q_ASYM) + IO_TYPE(D_I8, D_BOOL8) IO_TYPE(D_U8|Q_ASYM, D_F16) IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP) - IO_TYPE(D_U8, D_BOOL8) - IO_TYPE(D_F32, D_I16|Q_DFP) - IO_TYPE(D_F32, D_I8|Q_DFP) - IO_TYPE(D_F32, D_U8|Q_ASYM) - IO_TYPE(D_I32, D_I16|Q_DFP) - IO_TYPE(D_I32, D_I8|Q_DFP) - IO_TYPE(D_I32, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_F32) - IO_TYPE(D_F16, D_I32) - IO_TYPE(D_F16, D_U8) - IO_TYPE(D_F16, D_I8) - IO_TYPE(D_F16, D_F16) - IO_TYPE(D_BOOL8, D_F16) - IO_TYPE(D_BOOL8, D_BOOL8) + IO_TYPE(D_U8|Q_ASYM, D_BOOL8) + IO_TYPE(D_U8, D_F16) + IO_TYPE(D_U8, D_I16|Q_DFP) + IO_TYPE(D_U8, D_I8|Q_DFP) + IO_TYPE(D_U8, D_BOOL8) + IO_TYPE(D_F32, D_I16|Q_DFP) + IO_TYPE(D_F32, D_I8|Q_DFP) + IO_TYPE(D_F32, D_U8|Q_ASYM) + IO_TYPE(D_I32, D_I16|Q_DFP) + IO_TYPE(D_I32, D_I8|Q_DFP) + IO_TYPE(D_I32, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F32) + IO_TYPE(D_F16, D_I32) + IO_TYPE(D_F16, D_I16) + IO_TYPE(D_F16, D_U8) + IO_TYPE(D_F16, D_I8) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_BOOL8, D_F16) + IO_TYPE(D_BOOL8, D_I16|Q_DFP) + IO_TYPE(D_BOOL8, D_I8|Q_DFP) + IO_TYPE(D_BOOL8, D_U8|Q_ASYM) + IO_TYPE(D_BOOL8, D_BOOL8) + IO_TYPE(D_BOOL8, D_I16) + IO_TYPE(D_BOOL8, D_I8) + IO_TYPE(D_BOOL8, D_U8) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_F32) IO_TYPE(D_U8|Q_ASYM, D_I32) - IO_TYPE(D_BF16, D_BF16) - IO_TYPE(D_U8, D_F16) + IO_TYPE(D_BF16, D_BF16) END_IO_TYPE_DECL(CAST) 
if(!VALIDATE_OP_IO_TYPES(CAST, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_ceil.c b/src/tim/vx/internal/src/ops/vsi_nn_op_ceil.c new file mode 100644 index 0000000..69dbfd5 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_ceil.c @@ -0,0 +1,110 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_constraint_check.h" + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(CEIL, 1, 1) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + END_IO_TYPE_DECL(CEIL) + if (!VALIDATE_OP_IO_TYPES(CEIL, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_SUCCESS; + vx_nn_rounding_params_t p; + + memset(&p, 0, sizeof(p)); + p.mode = VX_NN_DS_SIZE_ROUNDING_CEILING; + self->n = vxTensorRoundingNode(self->graph->g, inputs[0]->t, &p, sizeof(p), outputs[0]->t); + if ( !self->n ) + { + status = VSI_FAILURE; + } + + return status; +} /* op_compute() */ + +#ifdef __cpluplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ CEIL, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ vsi_nn_op_common_setup, + /* optimize 
*/ NULL, + /* input_num */ 1, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c b/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c index 06898c1..9c151f6 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c @@ -35,7 +35,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel.h" #include "vsi_nn_internal_node.h" #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" @@ -127,6 +127,7 @@ static vsi_bool op_check IO_TYPE(D_I8|Q_DFP, D_F16) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_F16) + IO_TYPE(D_BF16, D_BF16) END_IO_TYPE_DECL(CLIP) if(!VALIDATE_OP_IO_TYPES(CLIP, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv1d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv1d.c index 2d6f510..f902832 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv1d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv1d.c @@ -33,6 +33,12 @@ #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" +typedef struct _conv1d_local_data_t { + vsi_bool use_ext_pad; + vsi_bool use_ovxlib_kernel; + vsi_nn_internal_tensor_t* pad_output; +} conv1d_local_data_t; + static vsi_status op_compute ( vsi_nn_node_t * self, @@ -45,21 +51,110 @@ static vsi_status op_compute param = vsi_nn_kernel_param_create(); - vsi_nn_kernel_param_add_int32( param, "stride", self->nn_param.conv1d.stride ); - vsi_nn_kernel_param_add_int32( param, "pad_front", self->nn_param.conv1d.pad[0] ); - vsi_nn_kernel_param_add_int32( param, "pad_end", self->nn_param.conv1d.pad[1] ); - vsi_nn_kernel_param_add_int32( param, "dilation", self->nn_param.conv1d.dilation); - vsi_nn_kernel_param_add_int32( param, "overflow_policy", self->vx_param.overflow_policy ); - vsi_nn_kernel_param_add_int32( param, "rounding_policy", self->vx_param.rounding_policy ); - vsi_nn_kernel_param_add_int32( param, - "down_scale_size_rounding", self->vx_param.down_scale_size_rounding ); - self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "conv1d", - inputs, 3, outputs, 1, param ); + if(self->nn_param.conv1d.local->use_ovxlib_kernel) + { + vsi_nn_tensor_t* new_inputs[3] = { NULL }; + vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; + uint32_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + int32_t new_rank = 0; + int32_t pad_front = self->nn_param.conv1d.pad[0]; + int32_t pad_end = self->nn_param.conv1d.pad[1]; + + if (1 == inputs[0]->attr.dim_num) + { + shape[0] = inputs[0]->attr.size[0]; + shape[1] = 1; + new_rank = 2; + reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, + inputs[0], (uint32_t*)shape, new_rank ); + new_inputs[0] = reshape_tensors[0]; + } + else + { + new_inputs[0] = inputs[0]; + } + + if (1 == inputs[1]->attr.dim_num) + { + shape[0] = inputs[1]->attr.size[0]; + shape[1] = 1; + new_rank = 2; + reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, + inputs[1], (uint32_t*)shape, new_rank ); + new_inputs[1] = reshape_tensors[1]; + } + else + { + new_inputs[1] = inputs[1]; + } + + if (1 == inputs[2]->attr.dim_num) + { + shape[0] = inputs[2]->attr.size[0]; + shape[1] = 1; + new_rank = 2; + reshape_tensors[2] = vsi_nn_reshape_tensor( self->graph, + inputs[2], (uint32_t*)shape, new_rank ); + new_inputs[2] = reshape_tensors[2]; + } + else + { + new_inputs[2] = inputs[2]; + } + + /* 
overwrite input[0] with padded tensor */ + if(self->nn_param.conv1d.local->use_ext_pad) + { + vsi_nn_internal_compute_node( self ); + new_inputs[0] = self->nn_param.conv1d.local->pad_output->t; + pad_front = 0; + pad_end = 0; + } + + vsi_nn_kernel_param_add_int32( param, "stride", self->nn_param.conv1d.stride ); + vsi_nn_kernel_param_add_int32( param, "pad_front", pad_front ); + vsi_nn_kernel_param_add_int32( param, "pad_end", pad_end ); + vsi_nn_kernel_param_add_int32( param, "dilation", self->nn_param.conv1d.dilation ); + vsi_nn_kernel_param_add_int32( param, "overflow_policy", self->vx_param.overflow_policy ); + vsi_nn_kernel_param_add_int32( param, "rounding_policy", self->vx_param.rounding_policy ); + vsi_nn_kernel_param_add_int32( param, + "down_scale_size_rounding", self->vx_param.down_scale_size_rounding ); + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "conv1d_ovxlib", + new_inputs, 3, outputs, 1, param ); + + if (reshape_tensors[0]) vsi_nn_ReleaseTensor( &reshape_tensors[0] ); + if (reshape_tensors[1]) vsi_nn_ReleaseTensor( &reshape_tensors[1] ); + if (reshape_tensors[2]) vsi_nn_ReleaseTensor( &reshape_tensors[2] ); + } + else + { + vsi_nn_kernel_param_add_int32( param, "stride", self->nn_param.conv1d.stride ); + vsi_nn_kernel_param_add_int32( param, "pad_front", self->nn_param.conv1d.pad[0] ); + vsi_nn_kernel_param_add_int32( param, "pad_end", self->nn_param.conv1d.pad[1] ); + vsi_nn_kernel_param_add_int32( param, "dilation", self->nn_param.conv1d.dilation ); + vsi_nn_kernel_param_add_int32( param, "overflow_policy", self->vx_param.overflow_policy ); + vsi_nn_kernel_param_add_int32( param, "rounding_policy", self->vx_param.rounding_policy ); + vsi_nn_kernel_param_add_int32( param, + "down_scale_size_rounding", self->vx_param.down_scale_size_rounding ); + if( self->nn_param.conv1d.multiplier > 0 ) + { + vsi_nn_kernel_param_add_int32( param, "multiplier", + self->nn_param.conv1d.multiplier ); + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "depthwise_conv1d", + inputs, 3, outputs, 1, param ); + } + else + { + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "conv1d", + inputs, 3, outputs, 1, param ); + } + } if( self->n ) { status = VSI_SUCCESS; } + vsi_nn_kernel_param_release( ¶m ); return status; } /* op_compute() */ @@ -151,6 +246,51 @@ static vsi_bool op_setup outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; outputs[0]->attr.dim_num = 3; } + + if ( (self->nn_param.conv1d.ksize == 1024 && self->nn_param.conv1d.dilation == 1) + || (self->nn_param.conv1d.ksize == 3 && self->nn_param.conv1d.dilation > 7) ) + { + if (self->nn_param.conv1d.stride == 1 && self->nn_param.conv1d.multiplier == 0) + { + self->nn_param.conv1d.local->use_ovxlib_kernel = TRUE; + if ((p->pad[0] || p->pad[1]) && (inputs[0]->attr.size[0] >= 65535)) + { + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_node_t* curr = NULL; + vsi_nn_internal_tensor_t* tensor = NULL; + uint32_t *front_data = NULL; + uint32_t *back_data = NULL; + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + vsi_nn_internal_init_tensor_attr(&attr, &inputs[0]->attr.dtype, TRUE); + tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_PAD, 0, 0); + front_data = (uint32_t*)\ + vsi_nn_internal_new_node_param(curr, sizeof(uint32_t) * inputs[0]->attr.dim_num); + back_data = (uint32_t*)\ + vsi_nn_internal_new_node_param(curr, sizeof(uint32_t) * inputs[0]->attr.dim_num); + + front_data[0] = p->pad[0]; + front_data[1] = 0; + front_data[2] = 0; + back_data[0] = 
p->pad[1]; + back_data[1] = 0; + back_data[2] = 0; + curr->node->nn_param.pad.front_size = front_data; + curr->node->nn_param.pad.back_size = back_data; + curr->node->nn_param.pad.dim_num = 3; + curr->node->nn_param.pad.const_val = 0; + curr->node->nn_param.pad.mode = VSI_NN_PAD_MODE_CONSTANT; + curr->inputs[0] = inputs[0]; + curr->outputs[0] = tensor->t; + vsi_nn_internal_setup_node(self, curr); + + self->nn_param.conv1d.local->use_ext_pad = TRUE; + self->nn_param.conv1d.local->pad_output = tensor; + } + } + } + return TRUE; } /* op_setup() */ @@ -159,9 +299,30 @@ static vsi_status op_deinit vsi_nn_node_t * self ) { + vsi_nn_internal_deinit_node_wksp(self); + + vsi_nn_safe_free(self->nn_param.gru_ovxlib.local); + return vsi_nn_op_common_deinit(self); } +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + vsi_nn_internal_init_node_wksp(self); + + self->nn_param.conv1d.local = (conv1d_local_data_t *)malloc(sizeof(conv1d_local_data_t)); + memset(self->nn_param.conv1d.local, 0x00, sizeof(conv1d_local_data_t)); + self->nn_param.conv1d.local->use_ext_pad = FALSE; + self->nn_param.conv1d.local->use_ovxlib_kernel = FALSE; + + return status; +} /* op_init() */ + #ifdef __cplusplus extern "C" { #endif @@ -169,7 +330,7 @@ extern "C" { DEF_OP_REG ( /* op_name */ CONV1D, - /* init */ NULL, + /* init */ op_init, /* compute */ op_compute, /* deinit */ op_deinit, /* check */ op_check, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d.c index fe9c4a3..bc8540d 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d.c @@ -118,6 +118,7 @@ static vsi_bool op_check BEGIN_IO_TYPE_DECL(CONV2D, 3, 1) /* IO_TYPE(INPUT, WEIGHT, BIAS, OUTPUT) */ IO_TYPE(D_F32, D_F32, D_F32, D_F32) + IO_TYPE(D_F32, D_F32, D_F32, D_F16) IO_TYPE(D_F16, D_F16, D_F16, D_F16) IO_TYPE(D_F16, D_F16, D_F32, D_F16) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_crop.c b/src/tim/vx/internal/src/ops/vsi_nn_op_crop.c index cb1cda3..f99aa44 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_crop.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_crop.c @@ -35,7 +35,7 @@ #include "vsi_nn_tensor_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "utils/vsi_nn_constraint_check.h" #define _ARG_NUM (3) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c index e86fe3d..79631e3 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c @@ -176,54 +176,117 @@ static vsi_bool op_check ) { BEGIN_IO_TYPE_DECL(DATACONVERT, 1, 1) - IO_TYPE(D_F16, D_F16) - IO_TYPE(D_F16, D_F32) - IO_TYPE(D_F16, D_I32) - IO_TYPE(D_F16, D_BF16) - IO_TYPE(D_F16, D_I16|Q_DFP) - IO_TYPE(D_F16, D_I8|Q_DFP) - IO_TYPE(D_F16, D_U8|Q_ASYM) - IO_TYPE(D_F32, D_F32) - IO_TYPE(D_F32, D_I32) - IO_TYPE(D_F32, D_F16) - IO_TYPE(D_F32, D_BF16) - IO_TYPE(D_F32, D_I16|Q_DFP) - IO_TYPE(D_F32, D_I8|Q_DFP) - IO_TYPE(D_F32, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F16, D_F32) + IO_TYPE(D_F16, D_I32) + IO_TYPE(D_F16, D_U32) + IO_TYPE(D_F16, D_BF16) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I16) + IO_TYPE(D_F16, D_I8) + IO_TYPE(D_F16, D_U8) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_I32) + IO_TYPE(D_F32, D_U32) + IO_TYPE(D_F32, D_F16) + IO_TYPE(D_F32, D_BF16) 
+ IO_TYPE(D_F32, D_I16|Q_DFP) + IO_TYPE(D_F32, D_I8|Q_DFP) + IO_TYPE(D_F32, D_U8|Q_ASYM) + IO_TYPE(D_F32, D_I16) + IO_TYPE(D_F32, D_I8) + IO_TYPE(D_F32, D_U8) + IO_TYPE(D_I16|Q_DFP, D_F32) IO_TYPE(D_I16|Q_DFP, D_I32) + IO_TYPE(D_I16|Q_DFP, D_U32) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_I8|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM) IO_TYPE(D_I16|Q_DFP, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16) + IO_TYPE(D_I16|Q_DFP, D_I8) + IO_TYPE(D_I16|Q_DFP, D_U8) + IO_TYPE(D_I16, D_F32) + IO_TYPE(D_I16, D_I32) + IO_TYPE(D_I16, D_U32) + IO_TYPE(D_I16, D_I16|Q_DFP) + IO_TYPE(D_I16, D_I8|Q_DFP) + IO_TYPE(D_I16, D_U8|Q_ASYM) + IO_TYPE(D_I16, D_F16) + IO_TYPE(D_I16, D_I16) + IO_TYPE(D_I16, D_I8) + IO_TYPE(D_I16, D_U8) + IO_TYPE(D_I8|Q_DFP, D_F32) IO_TYPE(D_I8|Q_DFP, D_F16) IO_TYPE(D_I8|Q_DFP, D_I32) + IO_TYPE(D_I8|Q_DFP, D_U32) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I8) + IO_TYPE(D_I8|Q_DFP, D_I8) + IO_TYPE(D_I8|Q_DFP, D_I16) + IO_TYPE(D_I8|Q_DFP, D_U8) + IO_TYPE(D_I8, D_F32) + IO_TYPE(D_I8, D_F16) + IO_TYPE(D_I8, D_I32) + IO_TYPE(D_I8, D_U32) + IO_TYPE(D_I8, D_I8|Q_DFP) + IO_TYPE(D_I8, D_I8|Q_ASYM) + IO_TYPE(D_I8, D_I16|Q_DFP) + IO_TYPE(D_I8, D_U8|Q_ASYM) + IO_TYPE(D_I8, D_I8) + IO_TYPE(D_I8, D_I16) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_I8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_F16) IO_TYPE(D_U8|Q_ASYM, D_I32) + IO_TYPE(D_U8|Q_ASYM, D_U32) IO_TYPE(D_U8|Q_ASYM, D_F32) - IO_TYPE(D_BOOL8, D_BOOL8) - IO_TYPE(D_BOOL8, D_U8|Q_ASYM) - IO_TYPE(D_BOOL8, D_I8|Q_ASYM) - IO_TYPE(D_BOOL8, D_I8|Q_DFP) - IO_TYPE(D_BOOL8, D_I16|Q_DFP) - IO_TYPE(D_BF16, D_BF16) - IO_TYPE(D_BF16, D_F16) - IO_TYPE(D_BF16, D_F32) - IO_TYPE(D_I32, D_I32) - IO_TYPE(D_I32, D_I16|Q_DFP) - IO_TYPE(D_I16, D_I16|Q_DFP) - IO_TYPE(D_I8, D_I8|Q_DFP) - IO_TYPE(D_U8, D_U8|Q_ASYM) + IO_TYPE(D_U8, D_U8|Q_ASYM) + IO_TYPE(D_U8, D_I16|Q_DFP) + IO_TYPE(D_U8, D_F16) + IO_TYPE(D_U8, D_I32) + IO_TYPE(D_U8, D_U32) + IO_TYPE(D_U8, D_F32) + IO_TYPE(D_BOOL8, D_BOOL8) + IO_TYPE(D_BOOL8, D_U8|Q_ASYM) + IO_TYPE(D_BOOL8, D_I8|Q_ASYM) + IO_TYPE(D_BOOL8, D_I8|Q_DFP) + IO_TYPE(D_BOOL8, D_I16|Q_DFP) + IO_TYPE(D_BOOL8, D_U8) + IO_TYPE(D_BOOL8, D_I8) + IO_TYPE(D_BOOL8, D_I8) + IO_TYPE(D_BOOL8, D_I16) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_BF16, D_F16) + IO_TYPE(D_BF16, D_F32) + IO_TYPE(D_I32, D_I32) + IO_TYPE(D_I32, D_I16|Q_DFP) + IO_TYPE(D_I32, D_I16) + IO_TYPE(D_I32, D_I8|Q_DFP) + IO_TYPE(D_I32, D_I8) + IO_TYPE(D_I32, D_U32) + IO_TYPE(D_I32, D_U16) + IO_TYPE(D_I32, D_U8|Q_ASYM) + IO_TYPE(D_I32, D_U8) + IO_TYPE(D_U32, D_U32) + IO_TYPE(D_U32, D_I16|Q_DFP) + IO_TYPE(D_U32, D_I16) + IO_TYPE(D_U32, D_I8|Q_DFP) + IO_TYPE(D_U32, D_I8) + IO_TYPE(D_U32, D_I32) + IO_TYPE(D_U32, D_U16) + IO_TYPE(D_U32, D_U8|Q_ASYM) + IO_TYPE(D_U32, D_U8) END_IO_TYPE_DECL(DATACONVERT) if (!VALIDATE_OP_IO_TYPES(DATACONVERT, self, inputs, self->input.num, outputs, self->output.num)) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c index 4e33da4..133141d 100644 --- 
a/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c @@ -33,6 +33,7 @@ #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "vsi_nn_log.h" +#include "utils/vsi_nn_dtype_util.h" #include "utils/vsi_nn_constraint_check.h" #define COMPUTE_DECONV_SZ( in, ksize, pad_1, pad_2, stride, output_padding )\ @@ -181,6 +182,8 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + vsi_bool ret = FALSE; + BEGIN_IO_TYPE_DECL(DECONVOLUTION, 3, 1) IO_TYPE(D_F16, D_F16, D_NONE, D_F16) IO_TYPE(D_F16, D_F16, D_F32, D_F16) @@ -197,6 +200,8 @@ static vsi_bool op_check IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) IO_TYPE(D_I8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_I8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_DFP) @@ -225,7 +230,10 @@ static vsi_bool op_check return FALSE; } - return TRUE; + /* Check fl and scale*/ + ret = vsi_nn_QuantCheck(inputs[0], inputs[1], inputs[2]); + + return ret; } /* op_check() */ static vsi_bool op_setup diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution1d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution1d.c index cf8b2a7..3e971e7 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution1d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution1d.c @@ -33,6 +33,7 @@ #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "vsi_nn_log.h" +#include "utils/vsi_nn_dtype_util.h" #include "kernel/vsi_nn_kernel.h" #define COMPUTE_DECONV_SZ( in, ksize, pad_1, pad_2, stride, output_padding )\ @@ -57,8 +58,24 @@ static vsi_status op_compute weight_attr.size[2] = weight_attr.size[1]; weight_attr.size[1] = 1; weight_attr.dim_num = 4; - weight_tensor = vsi_nn_CreateTensor( self->graph, &weight_attr ); - vsi_nn_ReshapeTensor( self->graph, inputs[1], weight_tensor, weight_attr.size, 4 ); + if (inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC) + { + weight_tensor = vsi_nn_reshape_tensor( self->graph, inputs[1], weight_attr.size, 4 ); + } + else + { + uint8_t * data = NULL; + data = vsi_nn_ConvertTensorToData( self->graph, inputs[1] ); + if (NULL == data) + { + VSILOGE("Convert data fail.\n"); + status = VSI_FAILURE; + return status; + } + weight_attr.dtype.channel_dim = inputs[1]->attr.dtype.channel_dim + 1; + weight_tensor = vsi_nn_CreateTensorFromData(self->graph, data, &weight_attr); + vsi_nn_safe_free( data ); + } #ifdef VX_DECONVOLUTION_WEIGHT_LAYOUT_COMPATIBLE_KHRONOS if ( vsi_nn_compareVersion(self->graph, 1, 1, 21) == -1 && TRUE == weight_tensor->attr.is_const ) @@ -118,8 +135,11 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { - //TODO: Check tensor shapes. 
- return TRUE; + vsi_bool ret = FALSE; + + ret = vsi_nn_OpCheck(VSI_NN_OP_DECONVOLUTION, self, inputs, outputs); + + return ret; } /* op_check() */ static vsi_bool op_setup diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_dropout.c b/src/tim/vx/internal/src/ops/vsi_nn_op_dropout.c index ea76c4b..f63db97 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_dropout.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_dropout.c @@ -34,7 +34,7 @@ #include "vsi_nn_tensor_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "vsi_nn_internal_node.h" #include "utils/vsi_nn_constraint_check.h" diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c index ea69316..528a72f 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c @@ -195,19 +195,29 @@ static vsi_bool op_check_minimum /* check inputs outputs data type */ BEGIN_IO_TYPE_DECL(MINIMUM, 2, 1) IO_TYPE(D_F16, D_F16, D_F16) - IO_TYPE(D_BF16, D_BF16, D_BF16) - IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) - IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) + + IO_TYPE(D_F16, D_I16|Q_DFP, D_F16) + IO_TYPE(D_F16, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP, D_F16) + IO_TYPE(D_F16, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) + + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) + + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + + IO_TYPE(D_BF16, D_BF16, D_BF16) IO_TYPE(D_F32, D_F32, D_F32) IO_TYPE(D_I32, D_I32, D_I32) END_IO_TYPE_DECL(MINIMUM) @@ -232,19 +242,29 @@ static vsi_bool op_check_maximum /* check inputs outputs data type */ BEGIN_IO_TYPE_DECL(MAXIMUM, 2, 1) IO_TYPE(D_F16, D_F16, D_F16) - IO_TYPE(D_BF16, D_BF16, D_BF16) - IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) - IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) + + IO_TYPE(D_F16, D_I16|Q_DFP, D_F16) + IO_TYPE(D_F16, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP, D_F16) + IO_TYPE(D_F16, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) + + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F16, 
D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) + + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + + IO_TYPE(D_BF16, D_BF16, D_BF16) IO_TYPE(D_F32, D_F32, D_F32) IO_TYPE(D_I32, D_I32, D_I32) END_IO_TYPE_DECL(MAXIMUM) @@ -338,6 +358,8 @@ static vsi_bool op_check_add IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_U8|Q_ASYM) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16, D_I16|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_F16) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) @@ -359,6 +381,8 @@ static vsi_bool op_check_add IO_TYPE(D_F16, D_F32, D_F16) IO_TYPE(D_F16, D_F16, D_F32) IO_TYPE(D_I32, D_I32, D_I32) + IO_TYPE(D_I16, D_I32, D_I32) + IO_TYPE(D_I32, D_I16, D_I32) IO_TYPE(D_I32, D_I32, D_U8|Q_ASYM) IO_TYPE(D_I32, D_I32, D_I16|Q_DFP) IO_TYPE(D_I32, D_I32, D_I8|Q_DFP) @@ -409,6 +433,8 @@ static vsi_bool op_check_sub IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_U8|Q_ASYM) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16, D_I16|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_F16) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) @@ -422,6 +448,8 @@ static vsi_bool op_check_sub IO_TYPE(D_F16, D_F32, D_F16) IO_TYPE(D_F16, D_F16, D_F32) IO_TYPE(D_I32, D_I32, D_I32) + IO_TYPE(D_I16, D_I32, D_I32) + IO_TYPE(D_I32, D_I16, D_I32) END_IO_TYPE_DECL(SUBTRACT) if(!VALIDATE_OP_IO_TYPES(SUBTRACT, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, @@ -446,27 +474,33 @@ static vsi_bool op_check_div { /* check inputs outputs data type */ BEGIN_IO_TYPE_DECL(DIVIDE, 2, 1) - IO_TYPE(D_BF16, D_BF16, D_BF16) - IO_TYPE(D_F16, D_F16, D_F16) - IO_TYPE(D_F16, D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_F16, D_I8|Q_DFP, D_F16) - IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) - IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) - IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_BF16, D_BF16, D_BF16) + IO_TYPE(D_F16, D_F16, D_F16) + IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP, D_F16) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_F16, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I16|Q_DFP, D_F16) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP, D_I16|Q_DFP) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_F16) - IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16, D_I16|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_F16) - IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) IO_TYPE(D_F32, D_F32, 
D_F32) IO_TYPE(D_F32, D_F32, D_F16) IO_TYPE(D_F32, D_F16, D_F32) @@ -475,6 +509,8 @@ static vsi_bool op_check_div IO_TYPE(D_F16, D_F32, D_F16) IO_TYPE(D_F16, D_F16, D_F32) IO_TYPE(D_I32, D_I32, D_I32) + IO_TYPE(D_I16, D_I32, D_I32) + IO_TYPE(D_I32, D_I16, D_I32) END_IO_TYPE_DECL(DIVIDE) if(!VALIDATE_OP_IO_TYPES(DIVIDE, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, @@ -521,6 +557,8 @@ static vsi_bool op_check_mul IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_F16) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16, D_I16|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_F16) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) @@ -538,6 +576,8 @@ static vsi_bool op_check_mul IO_TYPE(D_F16, D_F32, D_F16) IO_TYPE(D_F16, D_F16, D_F32) IO_TYPE(D_I32, D_I32, D_I32) + IO_TYPE(D_I16, D_I32, D_I32) + IO_TYPE(D_I32, D_I16, D_I32) END_IO_TYPE_DECL(MULTIPLY) if(!VALIDATE_OP_IO_TYPES(MULTIPLY, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c index c74da7a..d55ac92 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c @@ -196,6 +196,7 @@ DEF_ELEMENT_WISE_UNARY_OP( ELU, elu ); DEF_ELEMENT_WISE_UNARY_OP( NEG, neg ); DEF_ELEMENT_WISE_UNARY_OP( HARD_SIGMOID, hard_sigmoid ); DEF_ELEMENT_WISE_UNARY_OP( MISH, mish ); +DEF_ELEMENT_WISE_UNARY_OP( ROUND, round ); #undef DEF_ELEMENT_UNARY_WISE_OP diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_erf.c b/src/tim/vx/internal/src/ops/vsi_nn_op_erf.c new file mode 100644 index 0000000..5da991f --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_erf.c @@ -0,0 +1,128 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_constraint_check.h" + +/* + Declare number of input and output. + */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "erf", + inputs, 1, outputs, 1, NULL ); + + if( self->n ) + { + status = VSI_SUCCESS; + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(ERF, 1, 1) + /* IO_TYPE(INPUT, OUTPUT) */ + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_F16) + IO_TYPE(D_F32, D_BF16) + + IO_TYPE(D_F16, D_F32) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I16|Q_DFP) + + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_BF16, D_F32) + + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16) + + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_F16) + + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16) + + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F16) + END_IO_TYPE_DECL(ERF) + if (!VALIDATE_OP_IO_TYPES(ERF, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ ERF, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ vsi_nn_op_common_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_extra_ending.c b/src/tim/vx/internal/src/ops/vsi_nn_op_extra_ending.c index a3b7fb7..009e75d 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_extra_ending.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_extra_ending.c @@ -34,7 +34,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "utils/vsi_nn_constraint_check.h" #define _INPUT_NUM (1) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_floordiv.c b/src/tim/vx/internal/src/ops/vsi_nn_op_floordiv.c index 9cd0bd2..4c19f01 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_floordiv.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_floordiv.c @@ -34,7 +34,7 @@ #include "vsi_nn_tensor_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel_eltwise.h" #include "utils/vsi_nn_constraint_check.h" @@ -68,7 +68,6 @@ static vsi_status op_compute if( ret ) { - reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, inputs[0], (uint32_t*)shapes[0], new_rank ); reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, @@ -105,6 +104,7 @@ static vsi_bool op_check IO_TYPE(D_F16, D_F16, 
D_F16) IO_TYPE(D_F32, D_F32, D_F32) IO_TYPE(D_I32, D_I32, D_I32) + IO_TYPE(D_I32, D_I32, D_U8|Q_ASYM) IO_TYPE(D_BF16, D_BF16, D_BF16) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) @@ -112,6 +112,7 @@ static vsi_bool op_check IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_F16) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I32, D_U8|Q_ASYM) END_IO_TYPE_DECL(FLOORDIV) if(!VALIDATE_OP_IO_TYPES(FLOORDIV, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, @@ -189,4 +190,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect.c b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect.c index b2023b6..7d95b97 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect.c @@ -115,7 +115,12 @@ static vsi_bool op_check /* Check fl and scale*/ ret = vsi_nn_QuantCheck(inputs[0], inputs[1], inputs[2]); - ret = ret && vsi_nn_OpCheck(VSI_NN_OP_FCL_RELU, self, inputs, outputs); + if (!ret) + { + return ret; + } + + ret = vsi_nn_OpCheck(VSI_NN_OP_FCL_RELU, self, inputs, outputs); if(!ret) { /* check inputs outputs data type */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect2.c b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect2.c index d1af0cd..25e951c 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect2.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect2.c @@ -33,7 +33,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "utils/vsi_nn_util.h" #define _ARG_NUM (2) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c b/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c index 81af2ce..d373015 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c @@ -99,13 +99,15 @@ static vsi_bool op_check ) { BEGIN_IO_TYPE_DECL(GATHER, 2, 1) - IO_TYPE(D_I32, D_I32, D_I32) - IO_TYPE(D_F32, D_I32, D_F32) - IO_TYPE(D_F16, D_I32, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_I32, D_I16|Q_DFP) - IO_TYPE(D_F16, D_I32, D_I8|Q_DFP) - IO_TYPE(D_F16, D_I32, D_F16) + IO_TYPE(D_I32, D_I32, D_I32) + IO_TYPE(D_F32, D_I32, D_F32) + IO_TYPE(D_F16, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I32, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I32, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I32, D_F16) + IO_TYPE(D_BF16, D_I32, D_BF16) IO_TYPE(D_U8|Q_ASYM, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_U8, D_I32, D_U8) IO_TYPE(D_U8|Q_ASYM, D_I32, D_F16) IO_TYPE(D_I8|Q_DFP, D_I32, D_I8|Q_DFP) IO_TYPE(D_I8|Q_DFP, D_I32, D_F16) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_gather_nd.c b/src/tim/vx/internal/src/ops/vsi_nn_op_gather_nd.c index 9d5341a..6d55fad 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_gather_nd.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_gather_nd.c @@ -98,10 +98,11 @@ static vsi_bool op_check BEGIN_IO_TYPE_DECL(GATHER_ND, 2, 1) IO_TYPE(D_I32, D_I32, D_I32) IO_TYPE(D_F32, D_I32, D_F32) - IO_TYPE(D_F16, D_I32, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_I32, D_I16|Q_DFP) - IO_TYPE(D_F16, D_I32, D_I8|Q_DFP) - IO_TYPE(D_F16, D_I32, D_F16) + IO_TYPE(D_F16, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I32, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I32, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I32, D_F16) + IO_TYPE(D_BF16, D_I32, D_BF16) IO_TYPE(D_U8|Q_ASYM, D_I32, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_I32, D_F16) IO_TYPE(D_I8|Q_DFP, D_I32, D_I8|Q_DFP) 
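Reading aid (not part of the patch): the ERF hunk above and the GENERATE_PROPOSALS and GROUP_NORM hunks below all move op_compute onto the vsi_nn_kernel_param / vsi_nn_kernel_selector flow that this update introduces. A minimal sketch of that calling pattern, assuming a hypothetical single-input op — the name "my_op", the "alpha" parameter, and my_op_compute itself are illustrative only and do not exist in this patch:

/* Sketch only: hypothetical op_compute using the kernel-selector flow shown in this patch. */
#include "vsi_nn_types.h"
#include "kernel/vsi_nn_kernel.h"

static vsi_status my_op_compute
    (
    vsi_nn_node_t * self,
    vsi_nn_tensor_t ** inputs,
    vsi_nn_tensor_t ** outputs
    )
{
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_param_t * param = NULL;

    /* Scalar attributes travel through a key/value parameter list. */
    param = vsi_nn_kernel_param_create();
    vsi_nn_kernel_param_add_float32( param, "alpha", 0.5f ); /* "alpha" is a hypothetical attribute */

    /* The selector is expected to dispatch to an evis/cl/cpu kernel registered under this name. */
    self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "my_op",
            inputs, 1, outputs, 1, param );
    if( self->n )
    {
        status = VSI_SUCCESS;
    }

    vsi_nn_kernel_param_release( &param );

    return status;
} /* my_op_compute() */

The same create / add / select / release sequence appears verbatim in the real hunks (e.g. "eps" and "group_num" for GROUP_NORM, the stride and NMS parameters for GENERATE_PROPOSALS); only the op name, parameter keys, and tensor counts differ per op.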
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_generate_proposals.c b/src/tim/vx/internal/src/ops/vsi_nn_op_generate_proposals.c index 927123b..cc42045 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_generate_proposals.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_generate_proposals.c @@ -34,163 +34,11 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "kernel/vsi_nn_kernel.h" -#define _ARG_NUM (6) #define _INPUT_NUM (4) #define _OUTPUT_NUM (3) #define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) -#define _PARAM_NUM (_ARG_NUM + _IO_NUM) - -extern vx_kernel_description_t * vx_kernel_GENERATE_PROPOSALS_list[]; - -static void _set_inputs_outputs - ( - vx_reference * params, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - uint32_t i; - uint32_t cnt; - - /* Set inputs */ - cnt = 0; - for( i = 0; i < _INPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)inputs[i]->t; - } - - /* Set outputs */ - for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)outputs[i]->t; - } -} /* _set_inputs_outputs() */ - -static vsi_status _create_params - ( - vsi_nn_node_t * node, - vx_reference * params, - uint32_t num - ) -{ - vsi_status status; - vx_context ctx; - vsi_nn_generate_proposals_param * p; - if( 0 == num ) - { - return VSI_SUCCESS; - } - memset( params, 0, sizeof( vx_reference * ) * num ); - p = &(node->nn_param.generate_proposals); - ctx = vxGetContext( (vx_reference)node->graph->g ); - /* Init parameters */ - #define _SET_PARAM( i, type, arg ) do{ \ - params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \ - status = vxGetStatus( params[i] ); \ - if( VSI_SUCCESS != status ) { \ - goto set_param_error; \ - } \ - } while(0) - _SET_PARAM( 0, VX_TYPE_FLOAT32, height_stride ); - _SET_PARAM( 1, VX_TYPE_FLOAT32, width_stride ); - _SET_PARAM( 2, VX_TYPE_INT32, pre_nms_top_n ); - _SET_PARAM( 3, VX_TYPE_INT32, post_nms_top_n ); - _SET_PARAM( 4, VX_TYPE_FLOAT32, iou_threshold ); - _SET_PARAM( 5, VX_TYPE_FLOAT32, min_size ); - #undef _SET_PARAM -set_param_error: - - return status; -} /* _create_params */ - -static void _release_params - ( - vx_reference * params, - uint32_t num - ) -{ - uint32_t i; - vx_scalar scalar; - for( i = 0; i < num; i ++ ) - { - scalar = (vx_scalar)params[i]; - vxReleaseScalar( &scalar ); - } -} /* _release_params() */ - -static vsi_status cpu_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_reference * args; - - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( params, inputs, outputs ); - - /* Init parameters. */ - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. */ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); - - return status; -} - -static vsi_status vx_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_reference * args; - - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( params, inputs, outputs ); - /*TODO: Add code if need to change your parameter*/ - - /* Init parameters. 
*/ - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. */ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); - - return status; -} - -static vsi_nn_op_compute_t op_compute_list[] = -{ - cpu_op_compute, - vx_op_compute, - NULL -}; static vsi_status op_compute ( @@ -199,46 +47,27 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status; - vsi_nn_kernel_info_t kernel_info; - char *path = NULL; + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; - memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); - status = VSI_FAILURE; - kernel_info.type = VX_KERNEL_TYPE_CPU; - kernel_info.kernel = vx_kernel_GENERATE_PROPOSALS_list; - kernel_info.resource_num = 1; - kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); - kernel_info.resource_name[0] = "vsi_nn_kernel_generate_proposals"; - path = getenv("USER_VX_SOURCE_PATH"); - if(path) - vsi_nn_VxResourceSetPath(path); + param = vsi_nn_kernel_param_create(); - if( kernel_info.type == VX_KERNEL_TYPE_VX) + vsi_nn_kernel_param_add_float32( param, "height_stride", self->nn_param.generate_proposals.height_stride ); + vsi_nn_kernel_param_add_float32( param, "width_stride", self->nn_param.generate_proposals.width_stride ); + vsi_nn_kernel_param_add_int32( param, "pre_nms_top_n", self->nn_param.generate_proposals.pre_nms_top_n); + vsi_nn_kernel_param_add_int32( param, "post_nms_top_n", self->nn_param.generate_proposals.post_nms_top_n); + vsi_nn_kernel_param_add_float32( param, "iou_threshold", self->nn_param.generate_proposals.iou_threshold ); + vsi_nn_kernel_param_add_float32( param, "min_size", self->nn_param.generate_proposals.min_size ); + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "cpu beckend conv2d", + inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param ); + + if( self->n ) { - kernel_info.kernel_index = 1; - kernel_info.init_index = 1; - } - else /*kernel_info.type = VX_KERNEL_TYPE_CPU;*/ - { - kernel_info.kernel_index = 0; - kernel_info.init_index = 0; + status = VSI_SUCCESS; } - self->n = vsi_nn_RegisterClientKernelAndNewNode( - self->graph, &kernel_info); - if (kernel_info.resource_name) - { - free(kernel_info.resource_name); - } - if( NULL == self->n ) - { - return VSI_FAILURE; - } - if (NULL != op_compute_list[kernel_info.init_index]) - { - status = op_compute_list[kernel_info.init_index](self, inputs, outputs); - } + vsi_nn_kernel_param_release( ¶m ); return status; } /* op_compute() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv1d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv1d.c new file mode 100644 index 0000000..d979662 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv1d.c @@ -0,0 +1,207 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "vsi_nn_internal_node.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_constraint_check.h" + +#define _INPUT_NUM (3) +#define _OUTPUT_NUM (1) + +static vsi_nn_tensor_t * _expand_tensor_dim + ( vsi_nn_graph_t * graph, vsi_nn_tensor_t *tensor, uint32_t * shape, size_t rank, int32_t expand_dim ) +{ + uint32_t new_shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + uint32_t i, cnt; + if ( expand_dim < 0 ) + { + expand_dim = (int32_t)rank + expand_dim; + } + if ( expand_dim < 0 || (uint32_t)expand_dim > rank ) + { + VSILOGE("Run dim to expand %d, rank is %lu", expand_dim, rank); + return NULL; + } + for ( i = 0, cnt = 0; i < rank; i ++ ) + { + if ( i == (uint32_t)expand_dim ) + { + new_shape[cnt] = 1; + cnt ++; + } + new_shape[cnt] = shape[i]; + cnt ++; + } + if ( (uint32_t)expand_dim == rank ) + { + new_shape[cnt] = 1; + } + + return vsi_nn_reshape_tensor( graph, tensor, new_shape, (uint32_t)rank + 1 ); +} /* _expand_tensor_dim() */ + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + return vsi_nn_internal_compute_node( self ); +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = FALSE; + + ret = vsi_nn_OpCheck(VSI_NN_OP_CONV2D, self, inputs, outputs); + + return ret; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_internal_node_t* curr = NULL; + vsi_nn_grouped_conv1d_param* p = &self->nn_param.grouped_conv1d; + + vsi_nn_internal_init_node_wksp(self); + + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.size[0] = vsi_nn_ComputeFilterSize + ( + inputs[0]->attr.size[0], + inputs[1]->attr.size[0], + p->pad, + p->stride, + p->dilation, + VSI_NN_ROUND_FLOOR + ); + + outputs[0]->attr.size[1] = inputs[1]->attr.size[2]; + outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; + outputs[0]->attr.dim_num = 3; + } + + p->local->input = _expand_tensor_dim( self->graph, inputs[0], + inputs[0]->attr.size, inputs[0]->attr.dim_num, 0 ); + p->local->weight = _expand_tensor_dim( self->graph, inputs[1], + inputs[1]->attr.size, inputs[1]->attr.dim_num, 0 ); + p->local->output = _expand_tensor_dim( self->graph, outputs[0], + outputs[0]->attr.size, outputs[0]->attr.dim_num, 0 ); + + + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_GROUPED_CONV2D, 0, 0); + curr->inputs[0] = p->local->input; + curr->inputs[1] = p->local->weight; + curr->inputs[2] = inputs[2]; + curr->outputs[0] = p->local->output; + curr->node->nn_param.grouped_conv2d.ksize[0] = 1; + curr->node->nn_param.grouped_conv2d.ksize[1] = p->ksize; + 
curr->node->nn_param.grouped_conv2d.dilation[0] = 1; + curr->node->nn_param.grouped_conv2d.dilation[1] = p->dilation; + curr->node->nn_param.grouped_conv2d.pad[0] = 0; + curr->node->nn_param.grouped_conv2d.pad[1] = p->pad[0]; + curr->node->nn_param.grouped_conv2d.pad[2] = 0; + curr->node->nn_param.grouped_conv2d.pad[3] = p->pad[1]; + curr->node->nn_param.grouped_conv2d.stride[0] = 1; + curr->node->nn_param.grouped_conv2d.stride[1] = p->stride; + curr->node->nn_param.grouped_conv2d.group = p->group; + curr->node->nn_param.grouped_conv2d.multiplier = p->multiplier; + curr->node->nn_param.grouped_conv2d.weights = p->weights; + curr->node->nn_param.grouped_conv2d.pad_type = p->pad_type; + + vsi_nn_internal_setup_node(self, curr); + + return TRUE; +} /* op_setup() */ + +static vsi_status op_init + ( + vsi_nn_node_t* self + ) +{ + + self->nn_param.grouped_conv1d.local = (grouped_conv1d_local_data_t *)malloc(sizeof(grouped_conv1d_local_data_t)); + memset(self->nn_param.grouped_conv1d.local, 0x00, sizeof(grouped_conv1d_local_data_t)); + + return VSI_SUCCESS; +} /* op_init() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t* self + ) +{ + vsi_nn_grouped_conv1d_param* p = &self->nn_param.grouped_conv1d; + vsi_nn_internal_deinit_node_wksp(self); + + vsi_safe_release_tensor(p->local->input); + vsi_safe_release_tensor(p->local->weight); + vsi_safe_release_tensor(p->local->output); + vsi_nn_safe_free(p->local); + + return vsi_nn_op_common_deinit(self); +} /* op_deinit() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ GROUPED_CONV1D, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv2d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv2d.c index 30e6f94..32b62af 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv2d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv2d.c @@ -36,7 +36,7 @@ #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_dtype_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "utils/vsi_nn_constraint_check.h" #define _ARG_NUM (1) @@ -207,7 +207,11 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { - return vsi_nn_OpCheck(VSI_NN_OP_CONV2D, self, inputs, outputs); + vsi_bool ret = FALSE; + + ret = vsi_nn_OpCheck(VSI_NN_OP_CONV2D, self, inputs, outputs); + + return ret; } /* op_check() */ static vsi_bool op_setup diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c new file mode 100644 index 0000000..a217600 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c @@ -0,0 +1,297 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* 
all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "utils/vsi_nn_math.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "libnnext/vsi_nn_vxkernel.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_constraint_check.h" + +#define _INPUT_NUM (3) +#define _OUTPUT_NUM (1) + +static vsi_status _try_set_high_presision_tensor + ( + vsi_nn_tensor_t **inputs + ) +{ + vsi_status status; + vsi_nn_vxtensor_attr_t attr; + + status = VSI_SUCCESS; + attr = VSI_NN_TENSOR_ATTR_HIGH_PRECISION; + + if (VSI_NN_TYPE_FLOAT32 == inputs[1]->attr.dtype.vx_type) + { + status = vsi_nn_SetTensorAttr(inputs[1], attr); + if (VSI_SUCCESS != status) + { + return status; + } + } + if (VSI_NN_TYPE_FLOAT32 == inputs[2]->attr.dtype.vx_type) + { + status = vsi_nn_SetTensorAttr(inputs[2], attr); + if (VSI_SUCCESS != status) + { + return status; + } + } + + return status; +} + +static vsi_bool _is_3d_group_norm + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs + ) +{ + if ( 3 == inputs[0]->attr.dim_num ) + { + return TRUE; + } + return FALSE; +} /* _is_3d_group_norm() */ + +static vsi_status _op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; + vsi_nn_kernel_node_t n = NULL; + float eps = self->nn_param.groupnorm.eps; + int32_t group_num = self->nn_param.groupnorm.group_num; + vsi_nn_tensor_t * tmp_inputs[3] = {NULL, NULL, NULL}; + vsi_nn_tensor_t * tmp_outputs[1] = {NULL}; + vsi_nn_groupnorm_lcl_data *local = self->nn_param.groupnorm.lcl_data; + + status = _try_set_high_presision_tensor(inputs); + if (status != VSI_SUCCESS) + { + VSILOGE("Set tensor attr of high presision fail"); + return status; + } + + if (_is_3d_group_norm(self, inputs)) + { + tmp_inputs[0] = local->reshaped_input; + tmp_outputs[0] = local->reshaped_output; + tmp_inputs[1] = inputs[1]; + tmp_inputs[2] = inputs[2]; + } + else + { + tmp_inputs[0] = inputs[0]; + tmp_outputs[0] = outputs[0]; + tmp_inputs[1] = inputs[1]; + tmp_inputs[2] = inputs[2]; + } + + param = vsi_nn_kernel_param_create(); + vsi_nn_kernel_param_add_float32( param, "eps", eps ); + vsi_nn_kernel_param_add_int32( param, "group_num", group_num ); + n = vsi_nn_kernel_selector( self->graph, "group_norm", + tmp_inputs, _INPUT_NUM, tmp_outputs, _OUTPUT_NUM, param ); + if ( n != NULL ) + { + self->n = (vx_node)n; + status = VSI_SUCCESS; + } + + if (param != NULL) + { + vsi_nn_kernel_param_release( ¶m ); + } + + return status; +} /* op_compute() */ + +static vsi_status _op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + uint32_t dim = 0; + vsi_nn_groupnorm_lcl_data* local = 
NULL; + uint32_t shape[VSI_NN_MAX_DIM_NUM]; + char tensor_name[128]; + + if (_is_3d_group_norm(self, inputs) == FALSE) + { + return VSI_SUCCESS; + } + + VSILOGD("Optimize 3D %s, uid %u", vsi_nn_OpGetName(self->op), self->uid); + /* + insert a reshape node before and after 3D group_norm + */ + shape[0] = 1; + shape[1] = inputs[0]->attr.size[0]; + shape[2] = inputs[0]->attr.size[1]; + shape[3] = inputs[0]->attr.size[2]; + dim = 4; + local = self->nn_param.groupnorm.lcl_data; + if (VSI_NN_OPTIMIZE_FORWARD == direction) + { + /* reshape 3d input (xcn) --> 4d input (whcn) */ + local->reshaped_input = vsi_nn_reshape_tensor(self->graph, inputs[0], shape, dim); + } + else + { + /* reshape 3d output(xcn) --> 4d output(whcn) */ + local->reshaped_output = vsi_nn_reshape_tensor(self->graph, outputs[0], shape, dim); + if (local->reshaped_output && local->reshaped_output->t) + { + memset(tensor_name, 0, sizeof(tensor_name)); + snprintf(tensor_name, sizeof(tensor_name), "uid_%u_reshape_out_0", self->uid); + if (vxSetReferenceName((vx_reference)local->reshaped_output->t, tensor_name) == VSI_FAILURE) + { + VSILOGW("Set uid %u groupnorm reshaped output name fail", self->uid); + return VSI_FAILURE; + } + } + } + + return VSI_SUCCESS; +} /* op_optimize() */ + +static vsi_bool _op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(GROUP_NORM, 3, 1) + IO_TYPE(D_F16, D_F32, D_F16, D_F16) + IO_TYPE(D_F16, D_F32, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F32, D_F32, D_F16, D_F32) + IO_TYPE(D_F32, D_F32, D_F32, D_F32) + IO_TYPE(D_I32, D_F32, D_F16, D_I32) + IO_TYPE(D_I32, D_F32, D_F16, D_F32) + IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_I16|Q_DFP) + END_IO_TYPE_DECL(GROUP_NORM) + if (!VALIDATE_OP_IO_TYPES(GROUP_NORM, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_status _op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + self->nn_param.groupnorm.lcl_data = + (vsi_nn_groupnorm_lcl_data *)malloc(sizeof(vsi_nn_groupnorm_lcl_data)); + if (NULL == self->nn_param.groupnorm.lcl_data) + { + return VX_ERROR_NO_MEMORY; + } + + memset( self->nn_param.groupnorm.lcl_data, 0, sizeof(vsi_nn_groupnorm_lcl_data) ); + + self->nn_param.groupnorm.lcl_data->reshaped_input = NULL; + self->nn_param.groupnorm.lcl_data->reshaped_output = NULL; + + return status; +} /* op_init() */ + +static vsi_status _op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_nn_groupnormalize_param *p = &(self->nn_param.groupnorm); + if (p->lcl_data->reshaped_input) + { + vsi_nn_ReleaseTensor(&(p->lcl_data->reshaped_input)); + p->lcl_data->reshaped_input = NULL; + } + if (p->lcl_data->reshaped_output) + { + vsi_nn_ReleaseTensor(&(p->lcl_data->reshaped_output)); + p->lcl_data->reshaped_output = NULL; + } + if (self->nn_param.groupnorm.lcl_data) + { + free(self->nn_param.groupnorm.lcl_data); + self->nn_param.groupnorm.lcl_data = NULL; + } + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG 
+ ( + /* op_name */ GROUP_NORM, + /* init */ _op_init, + /* compute */ _op_compute, + /* deinit */ _op_deinit, + /* check */ _op_check, + /* setup */ vsi_nn_op_common_setup, + /* optimize */ _op_optimize, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_gru_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_gru_ovxlib.c index 82ad745..d4ac7a2 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_gru_ovxlib.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_gru_ovxlib.c @@ -35,7 +35,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "vsi_nn_internal_node.h" #include "vsi_nn_rnn_helper.h" @@ -265,6 +265,17 @@ static vsi_bool op_setup_default curr->node->nn_param.grucell_ovxlib.activation = curr_param->activation; curr->node->nn_param.grucell_ovxlib.recurrent_activation = curr_param->recurrent_activation; curr->node->nn_param.grucell_ovxlib.linear_before_reset = curr_param->linear_before_reset; + if ( reshape_output->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 ) + { + int32_t k = 0; + for (k = 0; k < sizeof( curr_param->internal_dtype ) / sizeof(curr_param->internal_dtype[0]); k++) + { + if (curr_param->internal_dtype[k].vx_type == VSI_NN_TYPE_NONE) + { + curr_param->internal_dtype[k].vx_type = VSI_NN_TYPE_BFLOAT16; + } + } + } memcpy( curr->node->nn_param.grucell_ovxlib.internal_dtype, curr_param->internal_dtype, sizeof( curr_param->internal_dtype ) ); curr->node->nn_param.grucell_ovxlib.use_cudnn_implementation = curr_param->use_cudnn_implementation; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_ovxlib.c index d5797d9..c1d60b6 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_ovxlib.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_ovxlib.c @@ -35,7 +35,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "ops/vsi_nn_op_grucell_ovxlib.h" #include "vsi_nn_internal_node.h" #include "vsi_nn_rnn_helper.h" @@ -236,7 +236,15 @@ static vsi_bool op_setup_float use_virtual_tensor, inputs[GRUCELL_INPUT_INPUT], inputs[GRUCELL_INPUT_H_STATE]); dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; - dtype.vx_type = VSI_NN_TYPE_FLOAT16; + if ( input_hstate->t->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 || + input_hstate->t->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 ) + { + dtype.vx_type = input_hstate->t->attr.dtype.vx_type; + } + else + { + dtype.vx_type = VSI_NN_TYPE_FLOAT16; + } tmp_tensor = vsi_nn_rnn_create_tp_fc(self, input_hstate->t, p->local->weights_z_r, p->local->bias_z_r, &dtype, use_virtual_tensor); @@ -261,7 +269,15 @@ static vsi_bool op_setup_float inputs[GRUCELL_INPUT_INPUT], tensor_rt->t); dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; - dtype.vx_type = VSI_NN_TYPE_FLOAT16; + if ( tmp_tensor->t->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 || + tmp_tensor->t->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 ) + { + dtype.vx_type = input_hstate->t->attr.dtype.vx_type; + } + else + { + dtype.vx_type = VSI_NN_TYPE_FLOAT16; + } /* W{c} x [x{t}, r{t}] */ tmp_tensor = vsi_nn_rnn_create_tp_fc(self, tmp_tensor->t, p->local->weights_c, p->local->bias_c, &dtype, use_virtual_tensor); @@ -270,7 +286,15 @@ static vsi_bool op_setup_float else { dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; - dtype.vx_type = VSI_NN_TYPE_FLOAT16; + if 
( inputs[GRUCELL_INPUT_H_STATE]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 || + inputs[GRUCELL_INPUT_H_STATE]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 ) + { + dtype.vx_type = inputs[GRUCELL_INPUT_H_STATE]->attr.dtype.vx_type; + } + else + { + dtype.vx_type = VSI_NN_TYPE_FLOAT16; + } /* r.(hstate*w_hc + b_hc) */ tmp_tensor = vsi_nn_rnn_create_tp_fc(self, inputs[GRUCELL_INPUT_H_STATE], inputs[GRUCELL_INPUT_WEIGHT_H2C], inputs[GRUCELL_INPUT_BIAS_H2C], &dtype, use_virtual_tensor); @@ -635,12 +659,28 @@ static vsi_bool op_setup_float_cudnn_v2 use_virtual_tensor, inputs[GRUCELL_INPUT_INPUT], inputs[GRUCELL_INPUT_H_STATE]); dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; - dtype.vx_type = VSI_NN_TYPE_FLOAT16; + if ( concated_input->t->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 || + concated_input->t->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 ) + { + dtype.vx_type = concated_input->t->attr.dtype.vx_type; + } + else + { + dtype.vx_type = VSI_NN_TYPE_FLOAT16; + } tmp_tensor = vsi_nn_rnn_create_tp_fc(self, concated_input->t, p->local->weights_input, p->local->bias_z_r, &dtype, use_virtual_tensor); dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; - dtype.vx_type = VSI_NN_TYPE_FLOAT16; + if ( splited_input_fc_output_tensors[0]->t->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 || + splited_input_fc_output_tensors[0]->t->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 ) + { + dtype.vx_type = splited_input_fc_output_tensors[0]->t->attr.dtype.vx_type; + } + else + { + dtype.vx_type = VSI_NN_TYPE_FLOAT16; + } { uint32_t _slices[] = { inputs[GRUCELL_INPUT_INPUT]->attr.size[0], inputs[GRUCELL_INPUT_H_STATE]->attr.size[0] }; @@ -651,7 +691,15 @@ static vsi_bool op_setup_float_cudnn_v2 inputs[GRUCELL_INPUT_WEIGHT_I2C], inputs[GRUCELL_INPUT_BIAS_I2C], &dtype, use_virtual_tensor); dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; - dtype.vx_type = VSI_NN_TYPE_FLOAT16; + if ( inputs[GRUCELL_INPUT_H_STATE]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 || + inputs[GRUCELL_INPUT_H_STATE]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 ) + { + dtype.vx_type = inputs[GRUCELL_INPUT_H_STATE]->attr.dtype.vx_type; + } + else + { + dtype.vx_type = VSI_NN_TYPE_FLOAT16; + } recurrent2cand_output = vsi_nn_rnn_create_tp_fc(self, inputs[GRUCELL_INPUT_H_STATE], inputs[GRUCELL_INPUT_WEIGHT_H2C], inputs[GRUCELL_INPUT_BIAS_H2C], &dtype, use_virtual_tensor); @@ -668,7 +716,15 @@ static vsi_bool op_setup_float_cudnn_v2 attr.vtl = use_virtual_tensor; attr.is_const = FALSE; attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; - attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + if ( splited_input_fc_output_tensors[0]->t->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 || + splited_input_fc_output_tensors[0]->t->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 ) + { + dtype.vx_type = splited_input_fc_output_tensors[0]->t->attr.dtype.vx_type; + } + else + { + dtype.vx_type = VSI_NN_TYPE_FLOAT16; + } tmp_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_A_TIMES_B_PLUS_C, 0, 0 ); @@ -969,7 +1025,15 @@ static vsi_bool op_setup_default memcpy(&attr, &(inputs[GRUCELL_INPUT_WEIGHT_H2C]->attr), sizeof(attr)); attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; - attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + if ( rh_mul_outputs->t->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 || + rh_mul_outputs->t->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 ) + { + attr.dtype.vx_type = rh_mul_outputs->t->attr.dtype.vx_type; + } + else + { + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + } wei_r2c_tensor = vsi_nn_ConvertTensorDtype(self->graph, inputs[GRUCELL_INPUT_WEIGHT_H2C], &(attr.dtype)); 
rh_cand_fc_output = vsi_nn_rnn_create_tp_fc(self, @@ -1049,7 +1113,16 @@ static vsi_bool op_setup_default attr.vtl = use_virtual_tensor; attr.is_const = FALSE; attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; - attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + if ( input_tensor->t->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 || + input_tensor->t->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 ) + { + attr.dtype.vx_type = input_tensor->t->attr.dtype.vx_type; + } + else + { + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + } + tmp_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f); /* create internal tensor sub node (1-zt)*c */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_heatmap_max_keypoint.c b/src/tim/vx/internal/src/ops/vsi_nn_op_heatmap_max_keypoint.c index d588b12..0a01f72 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_heatmap_max_keypoint.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_heatmap_max_keypoint.c @@ -34,7 +34,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #define _ARG_NUM (1) #define _INPUT_NUM (2) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_imageprocess.c b/src/tim/vx/internal/src/ops/vsi_nn_op_imageprocess.c index cc8103c..8883c35 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_imageprocess.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_imageprocess.c @@ -34,7 +34,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "utils/vsi_nn_constraint_check.h" #define _ARG_NUM (14) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c index 2a0f6a2..8f54a50 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c @@ -34,7 +34,7 @@ #include "vsi_nn_tensor_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" @@ -211,16 +211,23 @@ static vsi_bool op_check { BEGIN_IO_TYPE_DECL(INSTANCE_NORM, 3, 1) IO_TYPE(D_F16, D_F32, D_F16, D_F16) + IO_TYPE(D_F16, D_F32, D_F32, D_F16) + IO_TYPE(D_F16, D_F16, D_F16, D_F16) IO_TYPE(D_F32, D_F32, D_F16, D_F32) + IO_TYPE(D_F32, D_F16, D_F16, D_F32) IO_TYPE(D_F32, D_F32, D_F32, D_F32) IO_TYPE(D_I32, D_F32, D_F16, D_I32) IO_TYPE(D_I32, D_F32, D_F16, D_F32) + IO_TYPE(D_BF16, D_F32, D_F32, D_BF16) IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_F16) IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F32, D_F32, D_I8|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_F16) IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_U8|Q_ASYM) IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_F16) IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_I16|Q_DFP) END_IO_TYPE_DECL(INSTANCE_NORM) if (!VALIDATE_OP_IO_TYPES(INSTANCE_NORM, self, inputs, self->input.num, outputs, self->output.num)) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_interp.c b/src/tim/vx/internal/src/ops/vsi_nn_op_interp.c index cc38677..e11ba4e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_interp.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_interp.c @@ -174,6 +174,7 @@ static vsi_bool op_setup curr->node->nn_param.strided_slice.begin_mask = 0; curr->node->nn_param.strided_slice.end_mask = 0; 
curr->node->nn_param.strided_slice.shrink_axis_mask = 0; + curr->node->nn_param.strided_slice.new_axis_mask = 0; begin_dims = (int32_t *)vsi_nn_internal_new_node_param(curr, VSI_NN_MAX_DIM_NUM * sizeof(uint32_t)); end_dims = (int32_t *)vsi_nn_internal_new_node_param(curr, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c b/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c index 04e5610..4057e45 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c @@ -34,11 +34,11 @@ #include "vsi_nn_tensor_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" -#include "client/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" #include "utils/vsi_nn_constraint_check.h" +#include "utils/vsi_nn_dtype_util.h" #define _INPUT_NUM (2) #define _OUTPUT_NUM (1) @@ -84,7 +84,8 @@ static vsi_nn_tensor_t* _expand_scale_tensor attr.size[0] = scale_size_out; attr.size[1] = 1; attr.dim_num = 2; - attr.dtype.vx_type = scale->attr.dtype.vx_type; + out_dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; attr.vtl = FALSE; scale_tensor = vsi_nn_CreateTensor(graph, &attr); out_dtype = scale->attr.dtype; @@ -154,6 +155,65 @@ static vsi_bool _check_value_is_equal_to_one return ret; } +static vsi_bool _tensor_data_convert + ( + vsi_nn_graph_t* graph, + vsi_nn_tensor_t* in_tensor, + vsi_nn_tensor_t* out_tensor + ) +{ + vsi_bool ret = TRUE; + float* tensor_data = NULL; + uint32_t size = 0; + uint32_t stride[VSI_NN_MAX_DIM_NUM] = { 0 }; + uint8_t* data = NULL; + + tensor_data = vsi_nn_ConvertTensorToFloat32Data( graph, in_tensor ); + if ( NULL == tensor_data ) + { + VSILOGE( "Convert data fail." 
); + return FALSE; + } + + size = vsi_nn_GetStrideSize( &out_tensor->attr, stride ); + data = (uint8_t *)malloc( size ); + + if ( data ) + { + uint32_t i = 0; + uint32_t elements = size / stride[0]; + vsi_status status = VSI_SUCCESS; + + for ( i = 0; i < elements; i ++ ) + { + status = vsi_nn_Float32ToDtype( tensor_data[i], &data[stride[0] * i], &out_tensor->attr.dtype ); + if( VSI_FAILURE == status ) + { + VSILOGE("Convert default_value to dtype fail"); + break; + } + } + + status = vsi_nn_CopyDataToTensor( graph, out_tensor, data ); + free( data ); + data = NULL; + if ( VSI_FAILURE == status ) + { + VSILOGE("Copy data to tensor fail"); + } + } + + if ( !in_tensor->attr.is_created_from_handle ) + { + if ( tensor_data ) + { + free(tensor_data); + } + } + + return ret; +} + static vsi_status op_compute ( vsi_nn_node_t * self, @@ -180,7 +240,10 @@ static vsi_status op_compute p = &(self->nn_param.l2normalizescale); axis = p->axis; - if ( inputs[1]->attr.is_const == TRUE && _check_value_is_equal_to_one(self->graph, inputs[1]) ) + if ( (inputs[1]->attr.is_const == TRUE && _check_value_is_equal_to_one(self->graph, inputs[1])) || + ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 && + outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 ) + ) { return vsi_nn_internal_compute_node( self ); } @@ -203,10 +266,10 @@ static vsi_status op_compute shapes[1][2] = 1; shapes[1][3] = 1; scale_size = shapes[0][new_axis]; - is_expand_scale = (vx_bool)((size < scale_size) && (TRUE == inputs[1]->attr.is_const)); + is_expand_scale = (vx_bool)(TRUE == inputs[1]->attr.is_const); vsi_nn_kernel_param_add_int32( param, "axis", new_axis ); - if( ret ) + if ( ret ) { reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, inputs[0], (uint32_t*)shapes[0], rank_in ); @@ -249,13 +312,20 @@ static vsi_bool op_check ) { BEGIN_IO_TYPE_DECL(L2NORMALIZESCALE, _INPUT_NUM, _OUTPUT_NUM) - IO_TYPE(D_F16, D_F16, D_F16) - IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) + IO_TYPE(D_F16, D_F16, D_F16) + IO_TYPE(D_F16, D_F32, D_F16) + IO_TYPE(D_BF16, D_BF16, D_BF16) + IO_TYPE(D_BF16, D_F32, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F32, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F32, D_F16) IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F32, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F32, D_F16) IO_TYPE(D_F32, D_F32, D_F32) IO_TYPE(D_U8|Q_ASYM, D_F32, D_U8|Q_ASYM) END_IO_TYPE_DECL(L2NORMALIZESCALE) @@ -328,8 +398,53 @@ static vsi_bool op_setup curr->outputs[0] = outputs[0]; vsi_nn_internal_setup_node( self, curr ); } + else if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 && + outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 ) + { + vsi_nn_internal_tensor_t* output_tensor = NULL; + vsi_nn_internal_tensor_t* reshape_tensor = NULL; + vsi_nn_tensor_attr_t attr; + int32_t dim_num = inputs[0]->attr.dim_num; + int32_t i = 0; - ret = vsi_nn_op_common_setup(self, inputs, outputs); + memcpy( &attr, &outputs[0]->attr, sizeof( attr ) ); + attr.vtl = TRUE; + attr.is_const = FALSE; + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_L2_NORMALIZE, 0, 0); + curr->node->nn_param.l2_normalize.axis = self->nn_param.l2normalizescale.axis; + curr->inputs[0] = inputs[0]; + curr->outputs[0] = output_tensor->t; + vsi_nn_internal_setup_node( self, curr ); + + 
memcpy( &attr, &inputs[1]->attr, sizeof( attr ) ); + for (i = 0; i < dim_num; i++) + { + attr.size[i] = i == self->nn_param.l2normalizescale.axis ? inputs[0]->attr.size[i] : 1; + } + attr.dim_num = dim_num; + if (attr.dtype.vx_type != VSI_NN_TYPE_BFLOAT16) + { + attr.dtype.vx_type = VSI_NN_TYPE_BFLOAT16; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + } + reshape_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + _tensor_data_convert(self->graph, inputs[1], reshape_tensor->t); + + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_MULTIPLY, 0, 0); + curr->inputs[0] = output_tensor->t; + curr->inputs[1] = reshape_tensor->t; + curr->node->nn_param.multiply.scale = 1.0f; + curr->node->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE; + curr->node->vx_param.rounding_policy = VX_ROUND_POLICY_TO_NEAREST_EVEN; + curr->outputs[0] = outputs[0]; + vsi_nn_internal_setup_node( self, curr ); + } + else + { + ret = vsi_nn_op_common_setup(self, inputs, outputs); + } return ret; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c index 7cc8663..be46f09 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c @@ -34,7 +34,7 @@ #include "vsi_nn_tensor_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" @@ -51,28 +51,11 @@ static vsi_status op_compute vsi_status status = VSI_FAILURE; vsi_nn_kernel_param_t * param = NULL; vsi_nn_kernel_node_t n = NULL; - float eps = self->nn_param.instancenorm.eps; - uint32_t *input_size = inputs[0]->attr.size; - uint32_t dims_num = inputs[0]->attr.dim_num; - int32_t rs_flg = 0; - int32_t wh_flg = 0; + float eps = self->nn_param.layernorm.eps; - param =vsi_nn_kernel_param_create(); - - if (input_size[0] >= GPU_TENSOR_MAX_WIDTH) - { - wh_flg = 1; - } - - if ((input_size[1] * input_size[2] < GPU_TENSOR_MAX_WIDTH) - && dims_num > 2) - { - rs_flg = 1; - } + param = vsi_nn_kernel_param_create(); vsi_nn_kernel_param_add_float32( param, "eps", eps ); - vsi_nn_kernel_param_add_int32( param, "reshape_flg", rs_flg ); - vsi_nn_kernel_param_add_int32( param, "wh_flg", wh_flg ); n = vsi_nn_kernel_selector( self->graph, "layer_norm", inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param ); if ( n != NULL ) @@ -99,10 +82,14 @@ static vsi_bool op_check BEGIN_IO_TYPE_DECL(LAYER_NORM, 3, 1) IO_TYPE(D_F32, D_F32, D_F32, D_F32) IO_TYPE(D_F16, D_F32, D_F16, D_F16) + IO_TYPE(D_F16, D_F32, D_F32, D_F16) IO_TYPE(D_F16, D_F32, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_BF16, D_F32, D_F32, D_BF16) IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_F16) IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_U8|Q_ASYM) IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_I16|Q_DFP) END_IO_TYPE_DECL(LAYER_NORM) if (!VALIDATE_OP_IO_TYPES(LAYER_NORM, self, inputs, self->input.num, outputs, self->output.num)) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lstm_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lstm_ovxlib.c index 899711c..5910a92 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_lstm_ovxlib.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lstm_ovxlib.c @@ -35,7 +35,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include 
"vsi_nn_internal_node.h" #include "vsi_nn_rnn_helper.h" diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_activation.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_activation.c index d95f48d..3db70e8 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_activation.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_activation.c @@ -35,7 +35,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_util.h" @@ -153,8 +153,7 @@ static vsi_bool op_setup memcpy(&attr, &(inputs[LSTMUNIT_ACT_DATA_BI + i]->attr), sizeof(vsi_nn_tensor_attr_t)); attr.size[1] = 1; attr.dim_num = 2; - t0 = vsi_nn_CreateTensor( self->graph, &attr ); - vsi_nn_ReshapeTensor(self->graph, inputs[LSTMUNIT_ACT_DATA_BI + i], t0, attr.size, attr.dim_num); + t0 = vsi_nn_reshape_tensor(self->graph, inputs[LSTMUNIT_ACT_DATA_BI + i], attr.size, attr.dim_num); if( dst_dtype.vx_type != t0->attr.dtype.vx_type && dst_dtype.qnt_type != t0->attr.dtype.qnt_type ) @@ -176,8 +175,7 @@ static vsi_bool op_setup memcpy(&attr, &(inputs[LSTMUNIT_ACT_LN_WI + i]->attr), sizeof(vsi_nn_tensor_attr_t)); attr.size[1] = 1; attr.dim_num = 2; - t1 = vsi_nn_CreateTensor( self->graph, &attr ); - vsi_nn_ReshapeTensor(self->graph, inputs[LSTMUNIT_ACT_LN_WI + i], t1, attr.size, attr.dim_num); + t1 = vsi_nn_reshape_tensor(self->graph, inputs[LSTMUNIT_ACT_LN_WI + i], attr.size, attr.dim_num); if( dst_dtype.vx_type != t1->attr.dtype.vx_type && dst_dtype.qnt_type != t1->attr.dtype.qnt_type ) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c index 07b6ca2..f57eddb 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c @@ -35,7 +35,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "ops/vsi_nn_op_lstmunit_ovxlib.h" #include "vsi_nn_internal_node.h" #include "vsi_nn_rnn_helper.h" @@ -307,7 +307,7 @@ static vsi_bool op_setup p->local->use_cifg = ( NULL == inputs[LSTMUNIT_INPUT_WEIGHT_I2I] ); p->local->use_layer_norm = ( NULL != inputs[LSTMUNIT_INPUT_LAYERNORM_F] ); p->local->use_projection = ( NULL != inputs[LSTMUNIT_INPUT_WEIGHT_PROJ] ); - p->local->use_projection_bias = FALSE;//NULL != inputs[19]; + p->local->use_projection_bias = ( NULL != inputs[LSTMUNIT_INPUT_BIAS_PROJ] ); p->local->multi_batch = ( inputs[LSTMUNIT_INPUT_INPUT]->attr.size[1] > 1 ); p->local->use_peephole = ( NULL != inputs[LSTMUNIT_INPUT_WEIGHT_C2O] ); ifco_start_index = p->local->use_cifg ? 
1 : 0; @@ -621,8 +621,6 @@ static vsi_bool op_setup curr->inputs[1] = inputs[LSTMUNIT_INPUT_WEIGHT_PROJ]; curr->inputs[2] = zero_bias_tensor; - tmp_tensor = output_tensor; - /* Save output to h_state first and copy to output */ if( p->local->use_hybrid && p->local->use_projection_bias ) { @@ -636,6 +634,8 @@ static vsi_bool op_setup curr->outputs[0] = outputs[LSTMUNIT_OUTPUT_H_STATE]; } + tmp_tensor = output_tensor; + vsi_nn_internal_setup_node(self, curr); if( p->local->use_hybrid && p->local->use_projection_bias ) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c b/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c index f4b8efe..eaeaaa5 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c @@ -106,6 +106,9 @@ static vsi_bool op_check IO_TYPE(D_F16, D_I16|Q_DFP, D_I16|Q_DFP) IO_TYPE(D_F16, D_I8|Q_DFP, D_F16) IO_TYPE(D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_F16, D_U8, D_U8) + IO_TYPE(D_F16, D_I8, D_I8) IO_TYPE(D_F16, D_F16, D_F16) IO_TYPE(D_F32, D_F32, D_F32) IO_TYPE(D_F32, D_I8|Q_DFP, D_F32) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_moments.c b/src/tim/vx/internal/src/ops/vsi_nn_op_moments.c index 985f2da..fbcdd0b 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_moments.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_moments.c @@ -37,6 +37,31 @@ #define _INPUT_NUM 1 #define _OUTPUT_NUM 2 +static void _squeeze_axis + ( + vsi_nn_tensor_t *input, + const int32_t* axis_in, + int32_t axis_num, + int32_t* axis_out, + int32_t *axis_num_out + ) +{ + int32_t i = 0; + + memcpy(axis_out, axis_in, sizeof(int32_t) * axis_num); + *axis_num_out = axis_num; + + for (i = 0; i < axis_num; i++) + { + if (axis_in[i] == 3 && input->attr.size[3] == 1) + { + *axis_num_out = axis_num - 1; + axis_out[i] = 0; + break; + } + } +} + static vsi_status op_compute ( vsi_nn_node_t * self, @@ -47,22 +72,25 @@ static vsi_status op_compute vsi_status status = VSI_FAILURE; vsi_nn_kernel_param_t * param = NULL; vsi_nn_kernel_node_t n = NULL; - int32_t* axis = self->nn_param.moments.axis; + int32_t axes_copy[VSI_NN_MAX_DIM_NUM] = { 0 }; + const int32_t* axis = self->nn_param.moments.axis; int32_t axis_num = self->nn_param.moments.axis_num; int32_t keep_dim = self->nn_param.moments.keep_dim ? 
1 : 0; - param =vsi_nn_kernel_param_create(); + _squeeze_axis(inputs[0], axis, axis_num, axes_copy, &axis_num); - vsi_nn_kernel_param_add_buffer( param, "axis", axis, axis_num); + param = vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_buffer( param, "axis", axes_copy, axis_num); vsi_nn_kernel_param_add_int32( param, "keep_dim", keep_dim); n = vsi_nn_kernel_selector( self->graph, "moments", inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param ); - if( n != NULL ) + if (n != NULL) { self->n = (vx_node)n; status = VSI_SUCCESS; } - if(param != NULL) + if (param != NULL) { vsi_nn_kernel_param_release( ¶m ); } @@ -77,6 +105,10 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + int32_t axes_copy[VSI_NN_MAX_DIM_NUM] = { 0 }; + int32_t axes_num = 0; + int32_t i = 0; + BEGIN_IO_TYPE_DECL(MOMENTS, 1, 2) IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) @@ -85,7 +117,7 @@ static vsi_bool op_check IO_TYPE(D_F32, D_F32, D_F32) IO_TYPE(D_I32, D_F32, D_F32) END_IO_TYPE_DECL(MOMENTS) - if(!VALIDATE_OP_IO_TYPES(MOMENTS, self, inputs, self->input.num, outputs, self->output.num)) { + if (!VALIDATE_OP_IO_TYPES(MOMENTS, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, self->input.num, outputs, self->output.num); VSILOGE("Inputs/Outputs data type not support: %s", desc); @@ -93,6 +125,18 @@ static vsi_bool op_check return FALSE; } + _squeeze_axis(inputs[0], self->nn_param.moments.axis, + self->nn_param.moments.axis_num, axes_copy, &axes_num); + + for (i = 0; i < axes_num; i++) + { + if (axes_copy[i] > 2) + { + VSILOGE("moments shader path not support axis: %d", axes_copy[i]); + return FALSE; + } + } + return TRUE; } /* op_check() */ @@ -107,23 +151,15 @@ static vsi_bool op_setup int32_t i = 0, j = 0; vsi_nn_moments_param * p = NULL; - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + if (VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num) { - int32_t* axis = NULL; + const int32_t* axis = NULL; int32_t axis_num = 0; p = &(self->nn_param.moments); axis = p->axis; axis_num = p->axis_num; - for(i = 0; i < axis_num; i++) - { - if(axis[i] > 2) - { - return FALSE; - } - } - - if(p->keep_dim) + if (p->keep_dim) { outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; outputs[1]->attr.dim_num = inputs[0]->attr.dim_num; @@ -133,45 +169,35 @@ static vsi_bool op_setup outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; outputs[1]->attr.size[i] = inputs[0]->attr.size[i]; } - switch(axis_num) + + for (i = 0; i < axis_num; i++) { - case 1: - outputs[0]->attr.size[axis[0]] = 1; - outputs[1]->attr.size[axis[0]] = 1; - break; - case 2: - outputs[0]->attr.size[axis[0]] = 1; - outputs[0]->attr.size[axis[1]] = 1; - outputs[1]->attr.size[axis[0]] = 1; - outputs[1]->attr.size[axis[1]] = 1; - break; - case 3: - outputs[0]->attr.size[axis[0]] = 1; - outputs[0]->attr.size[axis[1]] = 1; - outputs[0]->attr.size[axis[2]] = 1; - outputs[1]->attr.size[axis[0]] = 1; - outputs[1]->attr.size[axis[1]] = 1; - outputs[1]->attr.size[axis[2]] = 1; - break; - default: - return FALSE; + outputs[0]->attr.size[axis[i]] = 1; + outputs[1]->attr.size[axis[i]] = 1; } } else { + int32_t idx = 0; + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num - axis_num; outputs[1]->attr.dim_num = inputs[0]->attr.dim_num - axis_num; - for (i = 0; i < axis[0]; i++) + for (i = 0; i < (int32_t)inputs[0]->attr.dim_num; i++) { - outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; - outputs[1]->attr.size[i] = inputs[0]->attr.size[i]; - } + for (j = 0; j < axis_num; j++) + { + if ( i == 
axis[j] ) + { + break; + } + } - for (j = axis[0] + axis_num; j < (int32_t)inputs[0]->attr.dim_num; j++) - { - outputs[0]->attr.size[i] = inputs[0]->attr.size[j]; - outputs[1]->attr.size[i++] = inputs[0]->attr.size[j]; + if (j == axis_num) + { + outputs[0]->attr.size[idx] = inputs[0]->attr.size[i]; + outputs[1]->attr.size[idx++] = inputs[0]->attr.size[i]; + } } } } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_nms.c b/src/tim/vx/internal/src/ops/vsi_nn_op_nms.c new file mode 100644 index 0000000..b5e8f4f --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_nms.c @@ -0,0 +1,136 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +typedef struct _nms_local_data_t { + int32_t placeholder; +} nms_local_data_t; + +/* + Declare number of input and output. + */ +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (3) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_nms_param * p = &(self->nn_param.nms); + vsi_nn_kernel_param_t * param = NULL; + + param = vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_int32( param, "max_output_size", p->max_output_size ); + vsi_nn_kernel_param_add_float32( param, "iou_threshold", p->iou_threshold ); + vsi_nn_kernel_param_add_float32( param, "score_threshold", p->score_threshold ); + vsi_nn_kernel_param_add_float32( param, "soft_nms_sigma", p->soft_nms_sigma ); + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + "nms", + inputs, _INPUT_NUM, + outputs, _OUTPUT_NUM, param ); + + if ( self->n ) + { + status = VSI_SUCCESS; + } + + vsi_nn_kernel_param_release( &param ); + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check tensor shapes. 
*/ + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = 1; + outputs[0]->attr.size[0] = self->nn_param.nms.max_output_size; + } + + if ( VSI_NN_DIM_AUTO == outputs[1]->attr.dim_num ) + { + outputs[1]->attr.dim_num = 1; + outputs[1]->attr.size[0] = self->nn_param.nms.max_output_size; + } + + if ( VSI_NN_DIM_AUTO == outputs[2]->attr.dim_num ) + { + outputs[2]->attr.dim_num = 1; + outputs[2]->attr.size[0] = 1; + } + + return TRUE; +} /* op_setup() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ NMS, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_one_hot.c b/src/tim/vx/internal/src/ops/vsi_nn_op_one_hot.c new file mode 100644 index 0000000..bddcc12 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_one_hot.c @@ -0,0 +1,176 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_constraint_check.h" +/* + Declare number of input and output. 
+ */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; + + param = vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_int32( param, "depth", self->nn_param.one_hot.depth ); + vsi_nn_kernel_param_add_float32( param, "on_value", self->nn_param.one_hot.on_value ); + vsi_nn_kernel_param_add_float32( param, "off_value", self->nn_param.one_hot.off_value ); + vsi_nn_kernel_param_add_int32( param, "axis", self->nn_param.one_hot.axis ); + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "one_hot", + inputs, 1, outputs, 1, param ); + + if( self->n ) + { + status = VSI_SUCCESS; + } + + vsi_nn_kernel_param_release( ¶m ); + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(ONE_HOT, 1, 1) + /* IO_TYPE(INPUT, OUTPUT) */ + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_F16) + + IO_TYPE(D_F16, D_F32) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I16|Q_DFP) + + IO_TYPE(D_I32, D_F32) + IO_TYPE(D_I32, D_F16) + IO_TYPE(D_I32, D_U8|Q_ASYM) + IO_TYPE(D_I32, D_I8|Q_DFP) + IO_TYPE(D_I32, D_I16|Q_DFP) + IO_TYPE(D_I32, D_I32) + + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16) + + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_F16) + + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16) + + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F16) + END_IO_TYPE_DECL(ONE_HOT) + if (!VALIDATE_OP_IO_TYPES(ONE_HOT, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_one_hot_param* p = &self->nn_param.one_hot; + int32_t i = 0; + int32_t axis = p->axis; + int32_t depth = p->depth; + + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num + 1; + axis = (axis == -1) ? 
0 : axis; + + for (i = 0; i < (int32_t)outputs[0]->attr.dim_num; i++) + { + if ( i < axis) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; + } + else if ( i == axis) + { + outputs[0]->attr.size[i] = depth; + } + else + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i - 1]; + } + } + } + + return TRUE; +} /* op_setup() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ ONE_HOT, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pool.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pool.c index 5ec4a6c..d90d7a2 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pool.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pool.c @@ -22,6 +22,7 @@ * *****************************************************************************/ #include +#include #include "vsi_nn_types.h" #include "vsi_nn_platform.h" @@ -35,6 +36,32 @@ #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_constraint_check.h" +static vsi_bool _is_pool1d + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs + ) +{ + /* + support pool1d from version 1.1.31 + */ + if (vsi_nn_compareVersion(self->graph, 1, 1, 31) == -1) + { + return FALSE; + } + else + { + if ( 3 == inputs[0]->attr.dim_num ) + { + return TRUE; + } + else + { + return FALSE; + } + } +} + static vsi_status op_compute ( vsi_nn_node_t * self, @@ -44,26 +71,53 @@ static vsi_status op_compute { vsi_status status; vx_nn_pooling_params_ext_t params; + vsi_nn_tensor_t * tmp_inputs[1] = {NULL}; + vsi_nn_tensor_t * tmp_outputs[1] = {NULL}; + vsi_nn_pool_lcl_data *local = self->nn_param.pool.local; + status = VSI_FAILURE; memset( ¶ms, 0, sizeof( params ) ); - params.base.pool_type = self->nn_param.pool.type; - params.base.pool_size_x = self->nn_param.pool.ksize[0]; - params.base.pool_size_y = self->nn_param.pool.ksize[1]; - params.base.pool_pad_x_left = self->nn_param.pool.pad[0]; - params.base.pool_pad_x_right = self->nn_param.pool.pad[1]; - params.base.pool_pad_y_top = self->nn_param.pool.pad[2]; - params.base.pool_pad_y_bottom = self->nn_param.pool.pad[3]; - params.base.rounding = self->vx_param.down_scale_size_rounding; - params.stride_x = self->nn_param.pool.stride[0]; - params.stride_y = self->nn_param.pool.stride[1]; + if(_is_pool1d(self, inputs)) + { + // pool1d + tmp_inputs[0] = local->reshaped_input; + tmp_outputs[0] = local->reshaped_output; + + params.base.pool_type = self->nn_param.pool.type; + params.base.pool_size_x = self->nn_param.pool.ksize[0]; + params.base.pool_size_y = 1; + params.base.pool_pad_x_left = self->nn_param.pool.pad[0]; + params.base.pool_pad_x_right = self->nn_param.pool.pad[1]; + params.base.pool_pad_y_top = 0; + params.base.pool_pad_y_bottom = 0; + params.base.rounding = self->vx_param.down_scale_size_rounding; + params.stride_x = self->nn_param.pool.stride[0]; + params.stride_y = 1; + } + else + { + tmp_inputs[0] = inputs[0]; + tmp_outputs[0] = outputs[0]; + + params.base.pool_type = self->nn_param.pool.type; + params.base.pool_size_x = self->nn_param.pool.ksize[0]; + params.base.pool_size_y = self->nn_param.pool.ksize[1]; + params.base.pool_pad_x_left = self->nn_param.pool.pad[0]; + params.base.pool_pad_x_right = self->nn_param.pool.pad[1]; + params.base.pool_pad_y_top = self->nn_param.pool.pad[2]; + params.base.pool_pad_y_bottom = self->nn_param.pool.pad[3]; + params.base.rounding = 
self->vx_param.down_scale_size_rounding; + params.stride_x = self->nn_param.pool.stride[0]; + params.stride_y = self->nn_param.pool.stride[1]; + } self->n = vxPoolingLayer2( self->graph->g, - inputs[0]->t, + tmp_inputs[0]->t, (vx_nn_pooling_params_t *)¶ms, sizeof( params ), - outputs[0]->t + tmp_outputs[0]->t ); if( NULL != self->n ) @@ -73,6 +127,65 @@ static vsi_status op_compute return status; } /* op_compute() */ +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + uint32_t dim = 0; + vsi_nn_pool_lcl_data *local = NULL; + uint32_t shape[VSI_NN_MAX_DIM_NUM]; + char tensor_name[128]; + + dim = inputs[0]->attr.dim_num; + if(FALSE == _is_pool1d(self, inputs)) + { + return VSI_SUCCESS; + } + + VSILOGD("Optimize pool1d %s, uid %u", vsi_nn_OpGetName(self->op), self->uid); + /* + insert a reshape node before and after pool1d + */ + local = self->nn_param.pool.local; + if (VSI_NN_OPTIMIZE_FORWARD == direction) + { + /* reshape 3d input (xcn) --> 4d input (whcn) */ + shape[0] = inputs[0]->attr.size[0];//width + shape[1] = 1;//height + shape[2] = inputs[0]->attr.size[1]; + shape[3] = inputs[0]->attr.size[2]; + dim = 4; + local->reshaped_input = vsi_nn_reshape_tensor(self->graph, inputs[0], shape, dim); + } + else + { + /* reshape 3d output(xcn) --> 4d output(whcn) */ + shape[0] = outputs[0]->attr.size[0];//width + shape[1] = 1;//height + shape[2] = outputs[0]->attr.size[1]; + shape[3] = outputs[0]->attr.size[2]; + dim = 4; + local->reshaped_output = vsi_nn_reshape_tensor(self->graph, outputs[0], shape, dim); + if(local->reshaped_output && local->reshaped_output->t) + { + memset(tensor_name, 0, sizeof(tensor_name)); + snprintf(tensor_name, sizeof(tensor_name), "uid_%u_reshape_out_0", self->uid); + if(vxSetReferenceName((vx_reference)local->reshaped_output->t, tensor_name) == VSI_FAILURE) + { + VSILOGW("Set uid %u pool1d reshaped output name fail", self->uid); + return VSI_FAILURE; + } + } + } + + return VSI_SUCCESS; +} /* op_optimize() */ + + static vsi_bool op_check ( vsi_nn_node_t * self, @@ -119,6 +232,54 @@ static vsi_bool op_check return TRUE; } /* op_check() */ +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + self->nn_param.pool.local = + (vsi_nn_pool_lcl_data *)malloc(sizeof(vsi_nn_pool_lcl_data)); + if (NULL == self->nn_param.pool.local) + { + return VX_ERROR_NO_MEMORY; + } + + memset( self->nn_param.pool.local, 0, sizeof(vsi_nn_pool_lcl_data) ); + + self->nn_param.pool.local->reshaped_input = NULL; + self->nn_param.pool.local->reshaped_output = NULL; + + return status; +} /* op_init() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_nn_pool_param *p = &(self->nn_param.pool); + if(p->local->reshaped_input) + { + vsi_nn_ReleaseTensor(&(p->local->reshaped_input)); + p->local->reshaped_input = NULL; + } + if(p->local->reshaped_output) + { + vsi_nn_ReleaseTensor(&(p->local->reshaped_output)); + p->local->reshaped_output = NULL; + } + if(self->nn_param.pool.local) + { + free(self->nn_param.pool.local); + self->nn_param.pool.local = NULL; + } + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + static vsi_bool op_setup ( vsi_nn_node_t * self, @@ -129,38 +290,69 @@ static vsi_bool op_setup vsi_bool ret; ret = TRUE; - vsi_nn_compute_padding( - inputs[0]->attr.size, - self->nn_param.pool.ksize, - self->nn_param.pool.stride, - NULL, - self->nn_param.pool.pad_type, - self->nn_param.pool.pad - 
); - /* Pooling */ - outputs[0]->attr.size[0] = vsi_nn_ComputeFilterSize - ( - inputs[0]->attr.size[0], - self->nn_param.pool.ksize[0], - &self->nn_param.pool.pad[0], - self->nn_param.pool.stride[0], - 0, - self->nn_param.pool.round_type + if(_is_pool1d(self, inputs)) + { + vsi_nn_compute_padding_conv1d( + inputs[0]->attr.size, + self->nn_param.pool.ksize, + self->nn_param.pool.stride, + NULL, + self->nn_param.pool.pad_type, + self->nn_param.pool.pad ); - outputs[0]->attr.size[1] = vsi_nn_ComputeFilterSize - ( - inputs[0]->attr.size[1], - self->nn_param.pool.ksize[1], - &self->nn_param.pool.pad[2], - self->nn_param.pool.stride[1], - 0, - self->nn_param.pool.round_type + + /* Pooling */ + outputs[0]->attr.size[0] = vsi_nn_ComputeFilterSize + ( + inputs[0]->attr.size[0], + self->nn_param.pool.ksize[0], + &self->nn_param.pool.pad[0], + self->nn_param.pool.stride[0], + 0, + self->nn_param.pool.round_type + ); + + outputs[0]->attr.size[1] = inputs[0]->attr.size[1]; + outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; + } + else + { + vsi_nn_compute_padding( + inputs[0]->attr.size, + self->nn_param.pool.ksize, + self->nn_param.pool.stride, + NULL, + self->nn_param.pool.pad_type, + self->nn_param.pool.pad ); + /* Pooling */ + outputs[0]->attr.size[0] = vsi_nn_ComputeFilterSize + ( + inputs[0]->attr.size[0], + self->nn_param.pool.ksize[0], + &self->nn_param.pool.pad[0], + self->nn_param.pool.stride[0], + 0, + self->nn_param.pool.round_type + ); + + outputs[0]->attr.size[1] = vsi_nn_ComputeFilterSize + ( + inputs[0]->attr.size[1], + self->nn_param.pool.ksize[1], + &self->nn_param.pool.pad[2], + self->nn_param.pool.stride[1], + 0, + self->nn_param.pool.round_type + ); + + outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; + outputs[0]->attr.size[3] = inputs[0]->attr.size[3]; + } + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; - outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; - outputs[0]->attr.size[3] = inputs[0]->attr.size[3]; if( NULL != outputs[1] ) { outputs[1]->attr.dim_num = outputs[0]->attr.dim_num; @@ -178,12 +370,12 @@ extern "C" { DEF_OP_REG ( /* op_name */ POOL, - /* init */ NULL, + /* init */ op_init, /* compute */ op_compute, - /* deinit */ vsi_nn_op_common_deinit, + /* deinit */ op_deinit, /* check */ op_check, /* setup */ op_setup, - /* optimize */ NULL, + /* optimize */ op_optimize, /* input_num */ 1, /* output_num */ 1 ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_poolwithargmax.c b/src/tim/vx/internal/src/ops/vsi_nn_op_poolwithargmax.c index 6171449..bc8c3de 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_poolwithargmax.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_poolwithargmax.c @@ -34,7 +34,7 @@ #include "utils/vsi_nn_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel_eltwise.h" #include "utils/vsi_nn_constraint_check.h" @@ -244,6 +244,7 @@ static vsi_bool op_setup ) { vsi_bool ret = TRUE; + vsi_nn_compute_padding( inputs[0]->attr.size, self->nn_param.pool.ksize, @@ -266,17 +267,6 @@ static vsi_status op_deinit vsi_nn_node_t * self ) { - uint32_t i; - - for (i = 0; i < _VSI_NN_POOLWITHARGMAX_LOCAL_TENSOR_NUM; i++) - { - if (self->nn_param.pool.local.local_tensor[i] != NULL) - { - vxReleaseTensor(&(self->nn_param.pool.local.local_tensor[i])); - self->nn_param.pool.local.local_tensor[i] = NULL; - } - } - vsi_nn_op_common_deinit(self); return VSI_SUCCESS; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_post_process.c 
b/src/tim/vx/internal/src/ops/vsi_nn_op_post_process.c index a50c1b3..a198d32 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_post_process.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_post_process.c @@ -34,7 +34,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "utils/vsi_nn_dtype_util.h" #include "vsi_nn_internal_node.h" diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c index ccb0510..473b900 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c @@ -34,7 +34,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "vsi_nn_internal_node.h" #include "utils/vsi_nn_util.h" diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_bgra.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_bgra.c index 4ac9bb1..10b7260 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_bgra.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_bgra.c @@ -34,7 +34,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" @@ -88,6 +88,7 @@ static vsi_bool op_check { BEGIN_IO_TYPE_DECL(PRE_PROCESS_BGRA, 1, 1) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8, D_U8|Q_ASYM) END_IO_TYPE_DECL(PRE_PROCESS_BGRA) if(!VALIDATE_OP_IO_TYPES(PRE_PROCESS_BGRA, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_gray.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_gray.c index 80797a2..ebebc54 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_gray.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_gray.c @@ -34,7 +34,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" @@ -87,6 +87,10 @@ static vsi_bool op_check IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_U8, D_U8|Q_ASYM) + IO_TYPE(D_U8, D_I8|Q_DFP) + IO_TYPE(D_U8, D_I16|Q_DFP) + IO_TYPE(D_U8, D_F16) END_IO_TYPE_DECL(PRE_PROCESS_GRAY) if(!VALIDATE_OP_IO_TYPES(PRE_PROCESS_GRAY, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c index d754e27..f11ed8e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c @@ -87,6 +87,10 @@ static vsi_bool op_check IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I8|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I16|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_U8, D_U8, D_U8|Q_ASYM) + IO_TYPE(D_U8, D_U8, D_I8|Q_DFP) + IO_TYPE(D_U8, D_U8, D_I16|Q_DFP) + IO_TYPE(D_U8, D_U8, D_F16) END_IO_TYPE_DECL(PRE_PROCESS_NV12) if(!VALIDATE_OP_IO_TYPES(PRE_PROCESS_NV12, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = 
generate_op_io_types_desc(inputs, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb.c index a31005d..a8ce9be 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb.c @@ -34,7 +34,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" @@ -91,6 +91,10 @@ static vsi_bool op_check IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_U8, D_U8|Q_ASYM) + IO_TYPE(D_U8, D_I8|Q_DFP) + IO_TYPE(D_U8, D_I16|Q_DFP) + IO_TYPE(D_U8, D_F16) END_IO_TYPE_DECL(PRE_PROCESS_RGB) if(!VALIDATE_OP_IO_TYPES(PRE_PROCESS_RGB, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_tensor.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_tensor.c index c1536be..ba50f33 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_tensor.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_tensor.c @@ -34,7 +34,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "utils/vsi_nn_dtype_util.h" #include "vsi_nn_internal_node.h" diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv420.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv420.c index 50c2355..4a7eb22 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv420.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv420.c @@ -87,6 +87,10 @@ static vsi_bool op_check IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_I8|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_I16|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_U8, D_U8, D_U8, D_U8|Q_ASYM) + IO_TYPE(D_U8, D_U8, D_U8, D_I8|Q_DFP) + IO_TYPE(D_U8, D_U8, D_U8, D_I16|Q_DFP) + IO_TYPE(D_U8, D_U8, D_U8, D_F16) END_IO_TYPE_DECL(PRE_PROCESS_YUV420) if(!VALIDATE_OP_IO_TYPES(PRE_PROCESS_YUV420, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv444.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv444.c index 99a7674..296e245 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv444.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv444.c @@ -87,6 +87,10 @@ static vsi_bool op_check IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_I8|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_I16|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_U8, D_U8, D_U8, D_U8|Q_ASYM) + IO_TYPE(D_U8, D_U8, D_U8, D_I8|Q_DFP) + IO_TYPE(D_U8, D_U8, D_U8, D_I16|Q_DFP) + IO_TYPE(D_U8, D_U8, D_U8, D_F16) END_IO_TYPE_DECL(PRE_PROCESS_YUV444) if(!VALIDATE_OP_IO_TYPES(PRE_PROCESS_YUV444, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_prelu.c b/src/tim/vx/internal/src/ops/vsi_nn_op_prelu.c index f51e4b2..07f074e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_prelu.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_prelu.c @@ -132,7 +132,7 @@ static vsi_status _prelu_op_compute 
if (one_rank) { - is_per_channel_alpha = (inputs[1]->attr.dim_num > 2 && alpha_shape == inputs[0]->attr.size[2]); + is_per_channel_alpha = (inputs[1]->attr.dim_num > 2 && alpha_shape == inputs[1]->attr.size[2]); } if (is_per_channel_alpha) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_quantized_16bit_lstm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_quantized_16bit_lstm.c index c5bf9d2..4ea879f 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_quantized_16bit_lstm.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_quantized_16bit_lstm.c @@ -35,7 +35,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "vsi_nn_internal_node.h" static vsi_status op_compute diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c index 3d01e79..cc07d0a 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c @@ -33,7 +33,7 @@ #include "utils/vsi_nn_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "vsi_nn_internal_node.h" #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce_internal.c index 41cc43b..1bcc83f 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce_internal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce_internal.c @@ -322,6 +322,8 @@ static vsi_bool op_check_reducemax_internal IO_TYPE(D_F16, D_F16) IO_TYPE(D_F32, D_F32) IO_TYPE(D_I32, D_I32) + IO_TYPE(D_I32, D_I16) + IO_TYPE(D_I16, D_I32) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_F16) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reducesum_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reducesum_internal.c index 3749f8a..a77d54e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reducesum_internal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reducesum_internal.c @@ -34,7 +34,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" static vsi_status op_compute diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_relational_ops.c b/src/tim/vx/internal/src/ops/vsi_nn_op_relational_ops.c index 21e9bf3..2f08f5e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_relational_ops.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_relational_ops.c @@ -145,6 +145,21 @@ static vsi_bool op_check IO_TYPE(D_BOOL8, D_BOOL8, D_BOOL8) IO_TYPE(D_F32, D_F32, D_BOOL8) IO_TYPE(D_I32, D_I32, D_BOOL8) + + IO_TYPE(D_F16, D_F16, D_I8) + IO_TYPE(D_F16, D_I16|Q_DFP, D_I8) + IO_TYPE(D_F16, D_I8|Q_DFP, D_I8) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_I8) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I8) + IO_TYPE(D_I16|Q_DFP, D_F16, D_I8) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8) + IO_TYPE(D_I8|Q_DFP, D_F16, D_I8) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I8) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_I8) + IO_TYPE(D_BF16, D_BF16, D_I8) + IO_TYPE(D_BOOL8, D_BOOL8, D_I8) + IO_TYPE(D_F32, D_F32, D_I8) + IO_TYPE(D_I32, D_I32, D_I8) END_IO_TYPE_DECL(RELATIONAL_OPS) if(!VALIDATE_OP_IO_TYPES(RELATIONAL_OPS, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_relu_keras.c 
b/src/tim/vx/internal/src/ops/vsi_nn_op_relu_keras.c index 5b312cb..8c40d42 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_relu_keras.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_relu_keras.c @@ -34,7 +34,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "vsi_nn_internal_node.h" #include "utils/vsi_nn_util.h" diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_relu_keras_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_relu_keras_internal.c index 8c2c914..aa5e8f5 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_relu_keras_internal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_relu_keras_internal.c @@ -34,7 +34,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_repeat.c b/src/tim/vx/internal/src/ops/vsi_nn_op_repeat.c new file mode 100644 index 0000000..3200fe5 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_repeat.c @@ -0,0 +1,340 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "utils/vsi_nn_math.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" +#include "libnnext/vsi_nn_vxkernel.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_constraint_check.h" + +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (1) + +static vsi_status _create_local_tensor + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + int32_t* repeat_host = self->nn_param.repeat.repeat_host; + int32_t axis = self->nn_param.repeat.axis; + vsi_nn_repeat_lcl_data *local = self->nn_param.repeat.local; + uint32_t shape[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1}; + uint32_t i = 0; + + if (axis == -1) + { + axis = 0; + for(i = 0; i < inputs[0]->attr.dim_num; i++) + { + shape[0] *= inputs[0]->attr.size[i]; + } + + local->reshaped_input = vsi_nn_reshape_tensor(self->graph, inputs[0], shape, 1); + + shape[0] = 1; + for(i = 0; i < outputs[0]->attr.dim_num; i++) + { + shape[0] *= outputs[0]->attr.size[i]; + } + local->reshaped_output = vsi_nn_reshape_tensor(self->graph, outputs[0], shape, 1); + } + + if (repeat_host) + { + vsi_nn_tensor_attr_t attr; + int32_t len = 0; + + if (self->nn_param.repeat.axis < 0) + { + len = local->reshaped_input->attr.size[0]; + } + else if (axis == 1 || inputs[0]->attr.dim_num == 1) + { + len = inputs[0]->attr.size[0]; + } + else if (axis == 0) + { + len = inputs[0]->attr.size[1]; + } + else if (axis == 2) + { + len = inputs[0]->attr.size[2]; + } + + memset( &attr, 0, sizeof(vsi_nn_tensor_attr_t) ); + attr.dtype.vx_type = VSI_NN_TYPE_INT32; + attr.is_const = FALSE; + attr.vtl = TRUE; + attr.size[0] = len; + attr.size[1] = 1; + attr.dim_num = 2; + + local->repeat_tensor = vsi_nn_CreateTensorFromData(self->graph, (uint8_t*)repeat_host, &attr); + } + + return VSI_SUCCESS; +} + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; + vsi_nn_kernel_node_t n = NULL; + int32_t axis = self->nn_param.repeat.axis; + vsi_nn_tensor_t * tmp_inputs[2] = {NULL, NULL}; + vsi_nn_tensor_t * tmp_output[1] = {NULL}; + vsi_nn_repeat_lcl_data *local = self->nn_param.repeat.local; + + status = _create_local_tensor(self, inputs, outputs); + if (status != VSI_SUCCESS) + { + VSILOGE("Create local tensor fail"); + return status; + } + + if (local->reshaped_input) + { + tmp_inputs[0] = local->reshaped_input; + tmp_output[0] = local->reshaped_output; + } + else + { + tmp_inputs[0] = inputs[0]; + tmp_output[0] = outputs[0]; + } + + if (local->repeat_tensor) + { + tmp_inputs[1] = local->repeat_tensor; + } + else + { + tmp_inputs[1] = inputs[1]; + } + + param = vsi_nn_kernel_param_create(); + vsi_nn_kernel_param_add_int32( param, "axis", axis ); + n = vsi_nn_kernel_selector( self->graph, "repeat", + tmp_inputs, _INPUT_NUM, tmp_output, _OUTPUT_NUM, param ); + if ( n != NULL ) + { + self->n = (vx_node)n; + status = VSI_SUCCESS; + } + + if (param != NULL) + { + vsi_nn_kernel_param_release( ¶m ); + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_repeat_param * p = NULL; + + BEGIN_IO_TYPE_DECL(REPEAT, 2, 1) 
+ IO_TYPE(D_F16, D_I32, D_F16) + IO_TYPE(D_F32, D_I32, D_F32) + IO_TYPE(D_I32, D_I32, D_I32) + IO_TYPE(D_I8, D_I32, D_I8) + IO_TYPE(D_U8, D_I32, D_U8) + IO_TYPE(D_I16, D_I32, D_I16) + IO_TYPE(D_I8|Q_DFP, D_I32, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I32, D_I16|Q_DFP) + END_IO_TYPE_DECL(REPEAT) + if (!VALIDATE_OP_IO_TYPES(REPEAT, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + p = (vsi_nn_repeat_param *)&(self->nn_param.repeat); + if ((p->repeat_host == NULL && p->maxlen < 1) || p->axis > 3) + { + VSILOGE("Unsupported parameters"); + } + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_repeat_param * p = NULL; + int32_t i = 0; + int32_t sum = 0; + int32_t axis = 0; + p = (vsi_nn_repeat_param *)&(self->nn_param.repeat); + axis = p->axis; + + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + for(i = 0; i < (int32_t)inputs[0]->attr.dim_num; i++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; + } + + if (p->repeat_host) + { + for(i = 0; i < p->repeat_len; i++) + { + sum += p->repeat_host[i]; + } + } + else + { + sum = p->maxlen; + } + + if (inputs[0]->attr.dim_num == 1 || axis == -1 || axis == 1) + { + outputs[0]->attr.size[0] = sum; + } + else if (axis == 0) + { + outputs[0]->attr.size[1] = sum; + } + else if (axis == 2) + { + outputs[0]->attr.size[2] = sum; + } + else if (axis == 3) + { + outputs[0]->attr.size[3] = sum; + } + } + + return TRUE; +} /* op_setup() */ + +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + self->nn_param.repeat.local = + (vsi_nn_repeat_lcl_data *)malloc(sizeof(vsi_nn_repeat_lcl_data)); + if (NULL == self->nn_param.repeat.local) + { + return VX_ERROR_NO_MEMORY; + } + + memset( self->nn_param.repeat.local, 0, sizeof(vsi_nn_repeat_lcl_data) ); + + self->nn_param.repeat.local->reshaped_input = NULL; + self->nn_param.repeat.local->reshaped_output = NULL; + self->nn_param.repeat.local->repeat_tensor = NULL; + self->nn_param.repeat.repeat_host = NULL; + self->nn_param.repeat.repeat_len = 0; + self->nn_param.repeat.axis = -1; + self->nn_param.repeat.maxlen = -1; + + return status; +} /* op_init() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_nn_repeat_param *p = &(self->nn_param.repeat); + if (p->local->reshaped_input) + { + vsi_nn_ReleaseTensor(&(p->local->reshaped_input)); + p->local->reshaped_input = NULL; + } + if (p->local->reshaped_output) + { + vsi_nn_ReleaseTensor(&(p->local->reshaped_output)); + p->local->reshaped_output = NULL; + } + if (p->local->repeat_tensor) + { + vsi_nn_ReleaseTensor(&(p->local->repeat_tensor)); + p->local->repeat_tensor = NULL; + } + if (self->nn_param.repeat.local) + { + free(self->nn_param.repeat.local); + self->nn_param.repeat.local = NULL; + } + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ REPEAT, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ 
_OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c index 51ea588..255388e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c @@ -41,7 +41,7 @@ #include "utils/vsi_nn_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "vsi_nn_internal_node.h" #define _ARG_NUM (1) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_resize_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_resize_internal.c index 0a7f893..bd761d0 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_resize_internal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_resize_internal.c @@ -34,7 +34,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_constraint_check.h" diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_resize_nearest_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_resize_nearest_internal.c index a0e0d48..f46b561 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_resize_nearest_internal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_resize_nearest_internal.c @@ -34,7 +34,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_constraint_check.h" diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reverse.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reverse.c index 86b9ad3..a77de72 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reverse.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reverse.c @@ -1,4 +1,3 @@ - /**************************************************************************** * * Copyright (c) 2020 Vivante Corporation @@ -35,221 +34,33 @@ #include "utils/vsi_nn_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" -#include "client/vsi_nn_vxkernel.h" +#include "utils/vsi_nn_dtype_util.h" #include "utils/vsi_nn_constraint_check.h" -#define _ARG_NUM (1) #define _INPUT_NUM (1) #define _OUTPUT_NUM (1) #define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) #define _PARAM_NUM (_ARG_NUM + _IO_NUM) -#define USE_OVX_API TRUE - -#if (USE_OVX_API == FALSE) -extern vx_kernel_description_t * vx_kernel_REVERSE_list[]; - -static void _set_inputs_outputs +static vsi_bool _is_same_quant ( - vx_reference * params, vsi_nn_tensor_t ** inputs, vsi_nn_tensor_t ** outputs ) { - uint32_t i; - uint32_t cnt; + vsi_nn_dtype_t *src_dtype = NULL,*dst_dtype = NULL; - /* Set inputs */ - cnt = 0; - for( i = 0; i < _INPUT_NUM; i ++, cnt ++ ) + src_dtype = &inputs[0]->attr.dtype; + dst_dtype = &outputs[0]->attr.dtype; + + if (vsi_nn_DtypeCompare(src_dtype, dst_dtype) == FALSE) { - params[cnt] = (vx_reference)inputs[i]->t; + return FALSE; } - /* Set outputs */ - for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)outputs[i]->t; - } -} /* _set_inputs_outputs() */ + return TRUE; +} /* _is_same_quant */ -static vsi_status _create_params - ( - vsi_nn_node_t * node, - vx_reference * params, - uint32_t num - ) -{ - vsi_status status; - vx_context ctx; - vsi_nn_reverse_param * p; - if( 0 == num ) - { - return VSI_SUCCESS; - } - memset( params, 0, sizeof( vx_reference * ) * num ); - p = 
&node->nn_param.reverse; - ctx = vxGetContext( (vx_reference)node->graph->g ); - /* Init parameters */ -#define _SET_PARAM( i, type, arg ) do{ \ - params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \ - status = vxGetStatus( params[i] ); \ - if( VSI_SUCCESS != status ) { \ - goto set_param_error; \ - } \ - } while(0) - _SET_PARAM( 0, VX_TYPE_INT32, axis[0] ); -#undef _SET_PARAM -set_param_error: - - return status; -} /* _create_params */ - -static void _release_params - ( - vx_reference * params, - uint32_t num - ) -{ - uint32_t i; - vx_scalar scalar; - for( i = 0; i < num; i ++ ) - { - scalar = (vx_scalar)params[i]; - vxReleaseScalar( &scalar ); - } -} /* _release_params() */ - -static vsi_status cpu_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_reference * args; - - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( params, inputs, outputs ); - - /* Init parameters. */ - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. */ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); - - return status; -} - -static vsi_status op_pre_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs, - vsi_nn_kernel_info_t * kernel_info - ) -{ - vsi_nn_type_e in_dataType = inputs[0]->attr.dtype.vx_type; - vsi_nn_type_e out_dataType = outputs[0]->attr.dtype.vx_type; - uint32_t i; - uint32_t changed_num = 1; - - for( i = self->nn_param.reverse.axis[0] + 1; i < inputs[0]->attr.dim_num; i++ ) - { - changed_num *= inputs[0]->attr.size[inputs[0]->attr.dim_num - 1 - i]; - } - - if ((in_dataType != VSI_NN_TYPE_INT16 || out_dataType != VSI_NN_TYPE_INT16) - && self->nn_param.reverse.axis[0] != 0) - { - VSILOGE("tensorReverse shader unsupport format or axis:%d!\n", - self->nn_param.reverse.axis[0]); - return VSI_FAILURE; - } - else if (changed_num >= 65536) - { - VSILOGE("tensorReverse unsupport change num:%d!\n", changed_num); - return VSI_FAILURE; - } - - kernel_info->kernel_index = 1; - - return VSI_SUCCESS; -} - -static void reshape_tensor_shape - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t * input, - vx_reference * params, - uint32_t index - ) -{ - uint32_t i; - int32_t size[4] = {0}; - int32_t size0[4] = {1, 1, 1, 1}; - uint32_t dims = 2; - - for( i = 0; i < input->attr.dim_num; i++ ) - { - size0[i] = input->attr.size[i]; - } - - size[0] = size0[0] * size0[1] * size0[2]; - size[1] = size0[3]; - - self->nn_param.reverse.local.local_tensor[index] = - vxReshapeTensor(input->t, size, dims); - params[index] = (vx_reference)self->nn_param.reverse.local.local_tensor[index]; -} - -static vsi_status vx_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_border_t border; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - reshape_tensor_shape(self, inputs[0], params, 0); - reshape_tensor_shape(self, outputs[0], params, 1); - - /* Pass parameters to node. 
*/ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - border.mode = VX_BORDER_REPLICATE; - border.constant_value.U32 = 0; - status |= vxSetNodeAttribute(self->n, VX_NODE_BORDER, &border, sizeof(border)); - - return status; -} - -static vsi_nn_op_compute_t op_compute_list[] = -{ - cpu_op_compute, - vx_op_compute, - NULL -}; -#endif static vsi_status op_compute ( vsi_nn_node_t * self, @@ -258,45 +69,30 @@ static vsi_status op_compute ) { vsi_status status = VSI_FAILURE; -#if (USE_OVX_API == TRUE) + vx_nn_tensor_reverse_params_t para; vsi_nn_reverse_param * p; int32_t axes[VSI_NN_MAX_DIM_NUM] = {0}; - p = &self->nn_param.reverse; - memcpy(axes, p->axis, sizeof(int32_t) * p->axis_num); - para.axis = axes; - para.numberOfAxis = p->axis_num; - self->n = vxTensorReverse( self->graph->g, inputs[0]->t, ¶, - sizeof(vx_nn_tensor_reverse_params_t), outputs[0]->t ); - if( NULL != self->n ) - { - status = VSI_SUCCESS; - } -#else - vsi_nn_kernel_info_t kernel_info; - memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); - kernel_info.resource_num = 1; - kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); - kernel_info.resource_name[0] = "vsi_nn_kernel_reverse"; - kernel_info.type = vsi_nn_GetVXKernelTypeForShader(); - kernel_info.kernel = vx_kernel_REVERSE_list; - kernel_info.init_index = 1; - op_pre_compute(self, inputs, outputs, &kernel_info); + if ( _is_same_quant(inputs, outputs) ) + { + p = &self->nn_param.reverse; + memcpy(axes, p->axis, sizeof(int32_t) * p->axis_num); + para.axis = axes; + para.numberOfAxis = p->axis_num; + self->n = vxTensorReverse( self->graph->g, inputs[0]->t, ¶, + sizeof(vx_nn_tensor_reverse_params_t), outputs[0]->t ); + if( NULL != self->n ) + { + status = VSI_SUCCESS; + } - self->n = vsi_nn_RegisterClientKernelAndNewNode( - self->graph, &kernel_info); - if (kernel_info.resource_name) free(kernel_info.resource_name); - if( NULL == self->n ) - { - return VSI_FAILURE; + return status; } - if (NULL != op_compute_list[kernel_info.init_index]) + else { - status = op_compute_list[kernel_info.init_index](self, inputs, outputs); + return vsi_nn_internal_compute_node( self ); } -#endif - return status; } /* op_compute() */ @@ -328,6 +124,10 @@ static vsi_bool op_check IO_TYPE(D_F32, D_F32) IO_TYPE(D_F32, D_BF16) IO_TYPE(D_BF16, D_F32) + IO_TYPE(D_F16, D_I32) + IO_TYPE(D_U8|Q_ASYM, D_I32) + IO_TYPE(D_I8|Q_DFP, D_I32) + IO_TYPE(D_I16|Q_DFP, D_I32) /* HW 9.0 */ IO_TYPE(D_BF16, D_BF16) @@ -347,22 +147,72 @@ static vsi_status op_deinit vsi_nn_node_t * self ) { -#if (USE_OVX_API == FALSE) - uint32_t i; - for (i = 0; i < _VSI_NN_REVERSE_LOCAL_TENSOR_NUM; i++) - { - if (self->nn_param.reverse.local.local_tensor[i] != NULL) - { - vxReleaseTensor(&(self->nn_param.reverse.local.local_tensor[i])); - self->nn_param.reverse.local.local_tensor[i] = NULL; - } - } -#endif + vsi_nn_internal_deinit_node_wksp(self); vsi_nn_op_common_deinit(self); return VSI_SUCCESS; } /* op_deinit() */ +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + if ( _is_same_quant(inputs, outputs) ) + { + return VSI_SUCCESS; + } + else + { + return vsi_nn_internal_optimize_node(self, direction ); + } +} /* op_optimize() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = TRUE; + vsi_nn_internal_node_t* curr = NULL; + + vsi_nn_internal_init_node_wksp(self); + + ret = 
vsi_nn_op_common_setup(self, inputs, outputs); + + if ( _is_same_quant(inputs, outputs) == FALSE ) + { + vsi_nn_internal_tensor_t* output_tensor = NULL; + vsi_nn_tensor_attr_t attr; + int32_t size = sizeof( attr.size ); + + memcpy( &attr, &inputs[0]->attr, sizeof( attr ) ); + memcpy( &attr.size, &outputs[0]->attr.size, size ); + attr.vtl = TRUE; + attr.is_const = FALSE; + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_REVERSE, 0, 0); + curr->inputs[0] = inputs[0]; + curr->outputs[0] = output_tensor->t; + curr->node->nn_param.reverse.axis = self->nn_param.reverse.axis; + curr->node->nn_param.reverse.axis_num = self->nn_param.reverse.axis_num; + vsi_nn_internal_setup_node(self, curr); + + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 0, 0); + curr->inputs[0] = output_tensor->t; + curr->outputs[0] = outputs[0]; + vsi_nn_internal_setup_node(self, curr); + } + + return ret; +} /* op_setup() */ + #ifdef __cplusplus extern "C" { #endif @@ -374,8 +224,8 @@ DEF_OP_REG /* compute */ op_compute, /* deinit */ op_deinit, /* check */ op_check, - /* setup */ vsi_nn_op_common_setup, - /* optimize */ NULL, + /* setup */ op_setup, + /* optimize */ op_optimize, /* input_num */ _INPUT_NUM, /* output_num */ _OUTPUT_NUM ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_rnncell_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_rnncell_ovxlib.c index a6fa7b8..721da3b 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_rnncell_ovxlib.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_rnncell_ovxlib.c @@ -35,7 +35,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "vsi_nn_internal_node.h" #include "vsi_nn_rnn_helper.h" diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_roi_align.c b/src/tim/vx/internal/src/ops/vsi_nn_op_roi_align.c index 472f994..f754a67 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_roi_align.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_roi_align.c @@ -35,7 +35,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel.h" #include "vsi_nn_internal_node.h" #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_scale.c b/src/tim/vx/internal/src/ops/vsi_nn_op_scale.c index b7c4056..87a7144 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_scale.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_scale.c @@ -37,7 +37,7 @@ #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_link_list.h" #include "utils/vsi_nn_dtype_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #define _INPUT_NUM (3) #define _OUTPUT_NUM (1) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_sequence_mask.c b/src/tim/vx/internal/src/ops/vsi_nn_op_sequence_mask.c new file mode 100644 index 0000000..500e676 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_sequence_mask.c @@ -0,0 +1,176 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, 
sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_constraint_check.h" + +#define _ARG_NUM (1) +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; + vsi_nn_kernel_node_t n = NULL; + int32_t max_len = self->nn_param.sequence_mask.maxlen; + + param =vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_int32( param, "max_len", max_len ); + n = vsi_nn_kernel_selector( self->graph, "sequence_mask", inputs, 2, outputs, 1, param ); + if ( n != NULL ) + { + self->n = (vx_node)n; + status = VSI_SUCCESS; + } + + if (param != NULL) + { + vsi_nn_kernel_param_release( ¶m ); + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_sequence_mask_param * p = NULL; + + BEGIN_IO_TYPE_DECL(SEQUENCE_MASK, 2, 1) + IO_TYPE(D_U8|Q_ASYM, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I32, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I32, D_I16|Q_DFP) + IO_TYPE(D_U8, D_I32, D_U8) + IO_TYPE(D_I8, D_I32, D_I8) + IO_TYPE(D_I16, D_I32, D_I16) + IO_TYPE(D_F16, D_I32, D_F16) + IO_TYPE(D_I32, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_I32, D_I32, D_BOOL8) + IO_TYPE(D_I16, D_I32, D_BOOL8) + IO_TYPE(D_I16, D_I16, D_BOOL8) + IO_TYPE(D_I8, D_I32, D_BOOL8) + IO_TYPE(D_I8, D_I16, D_BOOL8) + IO_TYPE(D_U8, D_I32, D_BOOL8) + IO_TYPE(D_U8, D_I16, D_BOOL8) + IO_TYPE(D_F16, D_I32, D_BOOL8) + IO_TYPE(D_F16, D_I16, D_BOOL8) + IO_TYPE(D_I32, D_I32, D_I32) + IO_TYPE(D_I32, D_I32, D_I32|Q_DFP) + IO_TYPE(D_I32, D_I32, D_F32) + + IO_TYPE(D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I32, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I32, D_NONE, D_BOOL8) + END_IO_TYPE_DECL(SEQUENCE_MASK) + if (!VALIDATE_OP_IO_TYPES(SEQUENCE_MASK, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + p = &(self->nn_param.sequence_mask); + if (p->maxlen < 0) + { + VSILOGE("Max length must bigger than 1"); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* TODO: Add code to 
comput outputs' shape. */ + uint32_t i = 0; + vsi_nn_sequence_mask_param * p = NULL; + + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + p = &(self->nn_param.sequence_mask); + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num + 1; + outputs[0]->attr.size[0] = p->maxlen; + for (i = 0; i < (uint32_t)inputs[0]->attr.dim_num; i++) + { + outputs[0]->attr.size[i+1] = inputs[0]->attr.size[i]; + } + } + return TRUE; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ SEQUENCE_MASK, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_shufflechannel.c b/src/tim/vx/internal/src/ops/vsi_nn_op_shufflechannel.c index b87e1e6..b45f405 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_shufflechannel.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_shufflechannel.c @@ -35,7 +35,7 @@ #include "vsi_nn_prv.h" #include "vsi_nn_log.h" #include "vsi_nn_test.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "utils/vsi_nn_constraint_check.h" #define _ARG_NUM (2) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_signalframe.c b/src/tim/vx/internal/src/ops/vsi_nn_op_signalframe.c index d51f294..432970b 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_signalframe.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_signalframe.c @@ -34,7 +34,7 @@ #include "vsi_nn_tensor_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "utils/vsi_nn_constraint_check.h" #define _ARG_NUM (5) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_slice.c b/src/tim/vx/internal/src/ops/vsi_nn_op_slice.c index ff7ea13..b5ef3e5 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_slice.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_slice.c @@ -34,7 +34,7 @@ #include "vsi_nn_tensor_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" -#include "client/vsi_nn_vxkernel.h" +#include "kernel/vsi_nn_kernel.h" #include "vsi_nn_internal_node.h" #include "utils/vsi_nn_constraint_check.h" @@ -52,7 +52,24 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - return vsi_nn_internal_compute_node( self ); + if (self->input.num > 1) + { + vsi_status status = VSI_FAILURE; + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "slice", + inputs, 2, outputs, _OUTPUT_NUM, NULL ); + + if( self->n ) + { + status = VSI_SUCCESS; + } + + return status; + } + else + { + return vsi_nn_internal_compute_node( self ); + } } /* op_compute() */ static vsi_bool op_check @@ -64,7 +81,39 @@ static vsi_bool op_check { vsi_bool ret = FALSE; - ret = vsi_nn_OpCheck(VSI_NN_OP_STRIDED_SLICE, self, inputs, outputs); + if (self->input.num > 1) + { + BEGIN_IO_TYPE_DECL(SLICE, 2, 1) + IO_TYPE(D_F16, D_I32, D_F16) + IO_TYPE(D_F16, D_I32, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I32, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I32, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I32, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I32, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I32, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I32, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_F32, D_I32, D_F32) + IO_TYPE(D_I32, 
D_I32, D_I32) + + /* HW 9.0 */ + IO_TYPE(D_BF16, D_I32, D_BF16) + END_IO_TYPE_DECL(SLICE) + if (!VALIDATE_OP_IO_TYPES(SLICE, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + return TRUE; + } + else + { + ret = vsi_nn_OpCheck(VSI_NN_OP_STRIDED_SLICE, self, inputs, outputs); + } return ret; } /* op_check() */ @@ -77,7 +126,14 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { - return vsi_nn_internal_optimize_node( self, direction ); + if (self->input.num > 1) + { + return VSI_SUCCESS; + } + else + { + return vsi_nn_internal_optimize_node( self, direction ); + } } static vsi_bool op_setup @@ -90,15 +146,16 @@ static vsi_bool op_setup vsi_nn_slice_param * p; vsi_nn_internal_node_t* curr = NULL; uint32_t i; - if(self->nn_param.slice.dims == 0) + + if (self->nn_param.slice.dims == 0) { self->nn_param.slice.dims = inputs[0]->attr.dim_num; } - p = (vsi_nn_slice_param *)&(self->nn_param.slice); vsi_nn_internal_init_node_wksp( self ); + p = (vsi_nn_slice_param *)&(self->nn_param.slice); - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { for(i = 0; i < p->dims; i++) { @@ -107,6 +164,11 @@ static vsi_bool op_setup outputs[0]->attr.dim_num = p->dims; } + if (self->input.num > 1) + { + return TRUE; + } + for (i = 0; i < self->nn_param.slice.dims; i++) { p->lcl_data->begin_dims[i] = self->nn_param.slice.start[i]; @@ -124,6 +186,7 @@ static vsi_bool op_setup curr->node->nn_param.strided_slice.begin_mask = 0; curr->node->nn_param.strided_slice.end_mask = 0; curr->node->nn_param.strided_slice.shrink_axis_mask = 0; + curr->node->nn_param.strided_slice.new_axis_mask = 0; curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; vsi_nn_internal_setup_node( self, curr ); @@ -143,7 +206,7 @@ static vsi_status op_init p = &(self->nn_param.slice); p->lcl_data = - (vsi_nn_slice_lcl_data *)malloc(sizeof(vsi_nn_slice_lcl_data)); + (vsi_nn_slice_lcl_data *)malloc(sizeof(vsi_nn_slice_lcl_data)); if (NULL == p->lcl_data) { return VX_ERROR_NO_MEMORY; @@ -169,6 +232,7 @@ static vsi_status op_deinit } vsi_nn_internal_deinit_node_wksp( self ); + vsi_nn_op_common_deinit(self); return VSI_SUCCESS; } /* op_deinit() */ @@ -177,20 +241,19 @@ static vsi_status op_deinit #ifdef __cplusplus extern "C" { #endif -/* Registrar */ -DEF_OP_REG - ( - /* op_name */ SLICE, - /* init */ op_init, - /* compute */ op_compute, - /* deinit */ op_deinit, - /* check */ op_check, - /* setup */ op_setup, - /* optimize */ op_optimize, - /* input_num */ 1, - /* output_num */ 1 - ); + /* Registrar */ + DEF_OP_REG + ( + /* op_name */ SLICE, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ op_optimize, + /* input_num */ 1, + /* output_num */ 1 + ); #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c index 1d7b1b2..4d9d80e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c @@ -35,7 +35,7 @@ #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_link_list.h" -#define MAX_SOFTMAX_BATCH 65535 +#define MAX_SOFTMAX_BATCH 65520 static vsi_bool _need_split_softmax ( diff --git 
a/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth.c b/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth.c index 3a2aea3..e0ef8c7 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth.c @@ -35,7 +35,7 @@ #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_dtype_util.h" #include "utils/vsi_nn_math.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "libnnext/vx_lib_nnext.h" #include "vsi_nn_test.h" #include "utils/vsi_nn_constraint_check.h" diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_spatial_transformer.c b/src/tim/vx/internal/src/ops/vsi_nn_op_spatial_transformer.c index 86e3e4f..c514fbf 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_spatial_transformer.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_spatial_transformer.c @@ -34,7 +34,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "utils/vsi_nn_dtype_util.h" #define _ARG_NUM (2) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_split.c b/src/tim/vx/internal/src/ops/vsi_nn_op_split.c index 7fa6eee..831570f 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_split.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_split.c @@ -135,13 +135,31 @@ static vsi_bool op_check { BEGIN_IO_TYPE_DECL(SPLIT, 1, 1) IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I8) + IO_TYPE(D_F16, D_I16) + IO_TYPE(D_F16, D_U8) + IO_TYPE(D_I8, D_F16) + IO_TYPE(D_I16, D_F16) + IO_TYPE(D_U8, D_F16) + IO_TYPE(D_I8, D_I8) + IO_TYPE(D_I16, D_I16) + IO_TYPE(D_U8, D_U8) IO_TYPE(D_F32, D_F32) IO_TYPE(D_F32, D_BF16) IO_TYPE(D_BF16, D_F32) IO_TYPE(D_I32, D_I32) + + /* HW 9.0 */ + IO_TYPE(D_BF16, D_BF16) END_IO_TYPE_DECL(SPLIT) if(!VALIDATE_OP_IO_TYPES(SPLIT, self, inputs, 1, &outputs[i], 1)) { char* desc = generate_op_io_types_desc(inputs, 1, &outputs[i], 1); @@ -197,18 +215,17 @@ static vsi_bool op_setup end[i] = inputs[0]->attr.size[i]; } end[axis] = 0; - for(i = 0; i < num; i++) + for (i = 0; i < num; i++) { - int j; + int32_t j; start[axis] = end[axis]; - if(slices_num == 0) + if (slices_num == 0) end[axis] += average; else end[axis] += slices[i]; - memcpy(&outputs[i]->attr.dtype, &inputs[0]->attr.dtype, sizeof(vsi_nn_dtype_t)); outputs[i]->attr.dim_num = inputs[0]->attr.dim_num; - for(j = 0; j < VSI_NN_MAX_DIM_NUM; j++) + for (j = 0; j < VSI_NN_MAX_DIM_NUM; j++) { outputs[i]->attr.size[j] = inputs[0]->attr.size[j]; } @@ -225,6 +242,7 @@ static vsi_bool op_setup curr->node->nn_param.strided_slice.begin_mask = 0; curr->node->nn_param.strided_slice.end_mask = 0; curr->node->nn_param.strided_slice.shrink_axis_mask = 0; + curr->node->nn_param.strided_slice.new_axis_mask = 0; curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[i]; vsi_nn_internal_setup_node( self, curr ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_stack.c b/src/tim/vx/internal/src/ops/vsi_nn_op_stack.c index 84e3481..7f47cfa 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_stack.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_stack.c @@ -37,7 +37,7 @@ #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_link_list.h" #include "utils/vsi_nn_dtype_util.h" -#include "client/vsi_nn_vxkernel.h" +#include 
"libnnext/vsi_nn_vxkernel.h" #define _ARG_NUM (1) #define _INPUT_NUM VSI_NN_STACK_MAX_INPUTS diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c b/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c index fa32a0c..0d84833 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c @@ -33,6 +33,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" +#include "vsi_nn_test.h" #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_dtype_util.h" #include "utils/vsi_nn_constraint_check.h" @@ -106,12 +107,13 @@ static vsi_bool _get_stride_slice_start_stop_stride vsi_nn_tensor_t ** outputs ) { - vx_uint32 i = 0; - vx_int32 int32_value = 0; + int32_t i = 0; + int32_t int32_value = 0; vsi_nn_strided_slice_param *p = &(self->nn_param.strided_slice); int32_t *start = p->lcl2_data->begin_dims; int32_t *stop = p->lcl2_data->end_dims; int32_t *stride = p->lcl2_data->stride_dims; + strided_slice_param* params = &p->lcl2_data->params; for (i = 0; i < VSI_NN_MAX_DIM_NUM; i ++) { @@ -120,36 +122,36 @@ static vsi_bool _get_stride_slice_start_stop_stride stride[i] = 1; } - for (i = 0; i < p->stride_dims_num; ++i) + for (i = 0; i < params->stride_dims_num; ++i) { - stride[i] = p->stride_dims[i]; + stride[i] = params->stride_dims[i]; } - for (i = 0; i < p->begin_dims_num; ++i) + for (i = 0; i < params->begin_dims_num; ++i) { - int32_value = p->begin_dims[i]; + int32_value = params->begin_dims[i]; start[i] = get_slice_axis_value(int32_value, inputs[0]->attr.size[i]); } - for (i = 0; i < p->end_dims_num; ++i) + for (i = 0; i < params->end_dims_num; ++i) { - int32_value = p->end_dims[i]; + int32_value = params->end_dims[i]; stop[i] = get_slice_axis_value(int32_value, inputs[0]->attr.size[i]); } /*if the ith bit of mask is set, the start or stop will be the fullest possible range in that dimension.*/ - for (i = 0; i < inputs[0]->attr.dim_num; i ++) + for (i = 0; i < (int32_t)inputs[0]->attr.dim_num; i ++) { - if (p->begin_mask & (1 << i)) + if (params->begin_mask & (1 << i)) { start[i] = get_slice_mask_start_value(stride[i], inputs[0]->attr.size[i]); } start[i] = vsi_nn_clamp(start[i], 0, (vx_int32)(inputs[0]->attr.size[i] - 1)); - if (p->shrink_axis_mask & (1 << i)) + if (params->shrink_axis_mask & (1 << i)) { stop[i] = start[i] + 1; } @@ -163,7 +165,7 @@ static vsi_bool _get_stride_slice_start_stop_stride } /* reset start stop and stride when output size is 1*/ - for (i = 0; i < outputs[0]->attr.dim_num; i ++) + for (i = 0; i < (int32_t)outputs[0]->attr.dim_num; i ++) { if (outputs[0]->attr.size[i] == 1 && stride[i] < 0) { @@ -174,12 +176,12 @@ static vsi_bool _get_stride_slice_start_stop_stride if (_check_neg_start_end_dims(start, stop, inputs[0]->attr.dim_num)) { - memcpy(start, p->begin_dims, sizeof(int32_t) * p->begin_dims_num); - memcpy(stop, p->end_dims, sizeof(int32_t) * p->end_dims_num); - memcpy(stride, p->stride_dims, sizeof(int32_t) * p->stride_dims_num); - p->lcl2_data->begin_mask = p->begin_mask; - p->lcl2_data->end_mask = p->end_mask; - p->lcl2_data->shrink_axis_mask = p->shrink_axis_mask; + memcpy(start, params->begin_dims, sizeof(int32_t) * params->begin_dims_num); + memcpy(stop, params->end_dims, sizeof(int32_t) * params->end_dims_num); + memcpy(stride, params->stride_dims, sizeof(int32_t) * params->stride_dims_num); + p->lcl2_data->begin_mask = params->begin_mask; + p->lcl2_data->end_mask = params->end_mask; + p->lcl2_data->shrink_axis_mask = params->shrink_axis_mask; } return TRUE; 
@@ -276,6 +278,7 @@ static vsi_status op_compute int32_t *stop_dims = NULL; int32_t *stride_dims = NULL; vsi_nn_strided_slice_lcl_data2 * p = self->nn_param.strided_slice.lcl2_data; + strided_slice_param* params = &p->params; start_dims = p->begin_dims; stop_dims = p->end_dims; @@ -301,12 +304,12 @@ static vsi_status op_compute { uint32_t sizes[VSI_NN_MAX_DIM_NUM] = {1}; uint32_t dims = inputs[0]->attr.dim_num; - int32_t shrink_axis_mask = self->nn_param.strided_slice.shrink_axis_mask; + int32_t shrink_axis_mask = params->shrink_axis_mask; memset(¶m, 0, sizeof(vx_nn_stride_slice_params_t)); memset(&attr, 0, sizeof(attr)); - attr.size[0] = self->nn_param.strided_slice.begin_dims_num; + attr.size[0] = params->begin_dims_num; attr.dim_num = 1; attr.is_const = TRUE; attr.dtype.vx_type = VSI_NN_TYPE_INT32; @@ -325,7 +328,7 @@ static vsi_status op_compute param.begin_dims = REQUIRED_IO(begin_dims_tensor); memset(&attr, 0, sizeof(attr)); - attr.size[0] = self->nn_param.strided_slice.end_dims_num; + attr.size[0] = params->end_dims_num; attr.dim_num = 1; attr.is_const = TRUE; attr.dtype.vx_type = VSI_NN_TYPE_INT32; @@ -344,7 +347,7 @@ static vsi_status op_compute param.end_dims = REQUIRED_IO(end_dims_tensor); memset(&attr, 0, sizeof(attr)); - attr.size[0] = self->nn_param.strided_slice.stride_dims_num; + attr.size[0] = params->stride_dims_num; attr.dim_num = 1; attr.is_const = TRUE; attr.dtype.vx_type = VSI_NN_TYPE_INT32; @@ -461,6 +464,77 @@ static vsi_bool op_check return TRUE; } /* op_check() */ +static vsi_bool _build_strided_slice_params(vsi_nn_strided_slice_param * op_params, int32_t input_dims) +{ + uint32_t i = 0; + int32_t num_add_axis = 0; + int32_t added_ellipsis = 0; + int32_t begin_mask = op_params->begin_mask; + int32_t end_mask = op_params->end_mask; + int32_t shrink_axis_mask = op_params->shrink_axis_mask; + const int32_t *begin_dims = op_params->begin_dims; + const int32_t *end_dims = op_params->end_dims; + const int32_t *stride_dims = op_params->stride_dims; + strided_slice_param *params = &op_params->lcl2_data->params; + + for (i = 0; i < op_params->begin_dims_num; i++) + { + if ( op_params->new_axis_mask & (1 << i)) + { + num_add_axis ++; + } + } + + params->num_add_axis = num_add_axis; + + for (i = 0; i < (uint32_t)(input_dims + num_add_axis); i++) + { + if ( op_params->new_axis_mask & (1 << i) ) + { + continue; + } + else if (i >= op_params->begin_dims_num + added_ellipsis) + { + params->begin_mask |= (1 << params->begin_dims_num); + params->end_mask |= (1 << params->end_dims_num); + params->begin_dims[params->begin_dims_num ++ ] = + 0; + params->end_dims[params->end_dims_num ++] = + 0; + params->stride_dims[params->stride_dims_num ++] = + 1; + } + else + { + int32_t orig_idx = i - added_ellipsis; + + if (begin_mask & (1 << orig_idx)) + { + params->begin_mask |= (1 << params->begin_dims_num); + } + + if (end_mask & (1 << orig_idx)) + { + params->end_mask |= (1 << params->end_dims_num); + } + + if (shrink_axis_mask & (1 << orig_idx)) + { + params->shrink_axis_mask |= (1 << params->begin_dims_num); + } + + params->begin_dims[params->begin_dims_num ++] = + begin_dims[orig_idx]; + params->end_dims[params->end_dims_num ++] = + end_dims[orig_idx]; + params->stride_dims[params->stride_dims_num ++] = + stride_dims[orig_idx]; + } + } + + return TRUE; +} + static vsi_bool op_setup ( vsi_nn_node_t * self, @@ -468,18 +542,26 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { - if(self->nn_param.strided_slice.begin_dims_num == 0) + uint32_t i = 0; + vsi_nn_strided_slice_param *p = 
&(self->nn_param.strided_slice); + strided_slice_param *params = &p->lcl2_data->params; + + if ( vsi_nn_compareVersion(self->graph, 1, 1, 32) == -1 + && self->nn_param.strided_slice.begin_dims_num == 0) { self->nn_param.strided_slice.begin_dims_num = inputs[0]->attr.dim_num; self->nn_param.strided_slice.end_dims_num = inputs[0]->attr.dim_num; self->nn_param.strided_slice.stride_dims_num = inputs[0]->attr.dim_num; } + + _build_strided_slice_params(p, inputs[0]->attr.dim_num); + /* TODO: Add code to comput outputs' shape. */ - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { - vsi_nn_strided_slice_param *p = &(self->nn_param.strided_slice); - vx_uint32 i; + int32_t idx = 0; + for (i = 0; i < inputs[0]->attr.dim_num; i++) { vx_int32 begin = 0, end = 1, stride = 1; @@ -487,20 +569,20 @@ static vsi_bool op_setup vx_int32 output_size = 0; vx_int32 j; - begin = get_slice_axis_value(p->begin_dims[i], input_size); - end = get_slice_axis_value(p->end_dims[i], input_size); - stride = p->stride_dims[i]; - if (p->begin_mask & (1 << i)) + begin = get_slice_axis_value(params->begin_dims[i], input_size); + end = get_slice_axis_value(params->end_dims[i], input_size); + stride = params->stride_dims[i]; + if (params->begin_mask & (1 << i)) { begin = get_slice_mask_start_value(stride, input_size); } begin = vsi_nn_clamp(begin, 0, (vx_int32)(input_size - 1)); - if (p->shrink_axis_mask & (1 << i)) + if (params->shrink_axis_mask & (1 << i)) { end = begin + 1; } - if (p->end_mask & (1 << i)) + if (params->end_mask & (1 << i)) { end = get_slice_mask_stop_value(stride, input_size); } @@ -512,11 +594,25 @@ static vsi_bool op_setup outputs[0]->attr.size[i] = output_size; } outputs[0]->attr.dim_num = 0; - for (i = 0; i < inputs[0]->attr.dim_num; i++) + for (idx = 0, i = 0; i < inputs[0]->attr.dim_num + params->num_add_axis; i++) { - if (p->shrink_axis_mask & (1 << i)) continue; + if (p->new_axis_mask & (1 << i)) + { + outputs[0]->attr.size[outputs[0]-> + attr.dim_num] = 1; + + outputs[0]->attr.dim_num++; + continue; + } + else if (params->shrink_axis_mask & (1 << idx)) + { + idx ++; + continue; + } + outputs[0]->attr.size[outputs[0]-> - attr.dim_num] = outputs[0]->attr.size[i]; + attr.dim_num] = outputs[0]->attr.size[idx ++]; + outputs[0]->attr.dim_num++; } } @@ -600,14 +696,16 @@ static vsi_status op_deinit ) { vsi_nn_strided_slice_lcl_data2 * lcl2_data; - + strided_slice_param *params = NULL; if(NULL == self) { return VSI_FAILURE; } lcl2_data = self->nn_param.strided_slice.lcl2_data; - if(self->n) + params = &lcl2_data->params; + + if (self->n) { if( NULL != self && NULL != self->n ) { @@ -616,6 +714,10 @@ static vsi_status op_deinit } } + vsi_nn_safe_free( params->begin_dims ); + vsi_nn_safe_free( params->end_dims ); + vsi_nn_safe_free( params->stride_dims ); + if (lcl2_data->cp_node) { vxReleaseNode( &lcl2_data->cp_node ); @@ -674,42 +776,74 @@ static vsi_status op_init ) { vsi_status status = VSI_SUCCESS; + vsi_nn_strided_slice_lcl_data2 * lcl2_data = NULL; + strided_slice_param* params = NULL; + + if (vsi_nn_compareVersion(self->graph, 1, 1, 32) == -1) + { + self->nn_param.strided_slice.new_axis_mask = 0; + } self->nn_param.strided_slice.lcl2_data = - (vsi_nn_strided_slice_lcl_data2 *)malloc(sizeof(vsi_nn_strided_slice_lcl_data2)); + (vsi_nn_strided_slice_lcl_data2 *)malloc(sizeof(vsi_nn_strided_slice_lcl_data2)); if (NULL == self->nn_param.strided_slice.lcl2_data) { return VX_ERROR_NO_MEMORY; } - memset( self->nn_param.strided_slice.lcl2_data, 0, 
sizeof(vsi_nn_strided_slice_lcl_data2) ); + lcl2_data = self->nn_param.strided_slice.lcl2_data; - self->nn_param.strided_slice.lcl2_data->begin_dims = + memset( lcl2_data, 0, sizeof(vsi_nn_strided_slice_lcl_data2) ); + + params = &lcl2_data->params; + + lcl2_data->begin_dims = (int32_t *)malloc(sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); - if (NULL == self->nn_param.strided_slice.lcl2_data->begin_dims) + if (NULL == lcl2_data->begin_dims) { return VX_ERROR_NO_MEMORY; } - memset(self->nn_param.strided_slice.lcl2_data->begin_dims, 0, - sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); + memset(lcl2_data->begin_dims, 0, sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); - self->nn_param.strided_slice.lcl2_data->end_dims = + params->begin_dims = (int32_t *)malloc(sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); - if (NULL == self->nn_param.strided_slice.lcl2_data->end_dims) + if (NULL == lcl2_data->begin_dims) { return VX_ERROR_NO_MEMORY; } - memset(self->nn_param.strided_slice.lcl2_data->end_dims, 0, - sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); + memset(params->begin_dims, 0, sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); - self->nn_param.strided_slice.lcl2_data->stride_dims = + lcl2_data->end_dims = (int32_t *)malloc(sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); - if (NULL == self->nn_param.strided_slice.lcl2_data->stride_dims) + if (NULL == lcl2_data->end_dims) { return VX_ERROR_NO_MEMORY; } - memset(self->nn_param.strided_slice.lcl2_data->stride_dims, 0, - sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); + memset(lcl2_data->end_dims, 0, sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); + + params->end_dims = + (int32_t *)malloc(sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); + if (NULL == params->end_dims) + { + return VX_ERROR_NO_MEMORY; + } + memset(params->end_dims, 0, sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); + + lcl2_data->stride_dims = + (int32_t *)malloc(sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); + if (NULL == lcl2_data->stride_dims) + { + return VX_ERROR_NO_MEMORY; + } + memset(lcl2_data->stride_dims, 0, sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); + + params->stride_dims = + (int32_t *)malloc(sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); + if (NULL == params->stride_dims) + { + return VX_ERROR_NO_MEMORY; + } + memset(params->stride_dims, 0, sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); return status; } /* op_init() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_svdf.c b/src/tim/vx/internal/src/ops/vsi_nn_op_svdf.c index 94fa617..642975e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_svdf.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_svdf.c @@ -163,17 +163,17 @@ static vsi_bool op_check ret = FALSE; } - if(ret) + if (ret) { BEGIN_IO_TYPE_DECL(SVDF, 5, 2) - IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F16, D_F16, D_F16) - IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_F16, D_F16) - IO_TYPE(D_F32, D_F16, D_F16, D_F16, D_F32, D_F32, D_F16) - IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32, D_F32, D_F32) - IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F16, D_F16, D_NONE) - IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_F16, D_NONE) - IO_TYPE(D_F32, D_F16, D_F16, D_F16, D_F32, D_F32, D_NONE) - IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32, D_F32, D_NONE) + IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F16, D_F16, D_F16) + IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_F32, D_F16, D_F16) + IO_TYPE(D_F32, D_F16, D_F16, D_F16, D_F32, D_F32, D_F16) + IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32, D_F32, D_F32) + IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_NONE, D_F16, D_F16) + IO_TYPE(D_F16, D_F16, D_F16, D_F16, D_NONE, D_F32, D_F16) + IO_TYPE(D_F32, D_F16, D_F16, D_F16, D_NONE, D_F32, D_F32) + IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_NONE, D_F32, 
D_F32) END_IO_TYPE_DECL(SVDF) if(!VALIDATE_OP_IO_TYPES(SVDF, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_sync_host.c b/src/tim/vx/internal/src/ops/vsi_nn_op_sync_host.c index cc46ab0..957ecd5 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_sync_host.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_sync_host.c @@ -25,7 +25,7 @@ #include #include "vsi_nn_pub.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #define _ARG_NUM (0) #define _INPUT_NUM (1) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_tensor_add_mean_stddev_norm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_tensor_add_mean_stddev_norm.c index 0906eab..78f3508 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_tensor_add_mean_stddev_norm.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_tensor_add_mean_stddev_norm.c @@ -34,7 +34,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_constraint_check.h" diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_tensorstackconcat.c b/src/tim/vx/internal/src/ops/vsi_nn_op_tensorstackconcat.c index 39d32a5..f752f1e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_tensorstackconcat.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_tensorstackconcat.c @@ -34,7 +34,7 @@ #include "vsi_nn_tensor_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "utils/vsi_nn_constraint_check.h" #define _ARG_NUM (0) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c b/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c index 56d056d..09343e7 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c @@ -34,158 +34,10 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "kernel/vsi_nn_kernel.h" -#define _ARG_NUM (1) #define _INPUT_NUM (1) #define _OUTPUT_NUM (2) -#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) -#define _PARAM_NUM (_ARG_NUM + _IO_NUM) - -extern vx_kernel_description_t * vx_kernel_TOPK_list[]; - -static void _set_inputs_outputs - ( - vx_reference * params, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - uint32_t i; - uint32_t cnt; - - /* Set inputs */ - cnt = 0; - for( i = 0; i < _INPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)inputs[i]->t; - } - - /* Set outputs */ - for( i = 0; i < _OUTPUT_NUM; i ++, cnt ++ ) - { - params[cnt] = (vx_reference)outputs[i]->t; - } -} /* _set_inputs_outputs() */ - -static vsi_status _create_params - ( - vsi_nn_node_t * node, - vx_reference * params, - uint32_t num - ) -{ - vsi_status status; - vx_context ctx; - vsi_nn_topk_param * p; - if( 0 == num ) - { - return VSI_SUCCESS; - } - memset( params, 0, sizeof( vx_reference * ) * num ); - p = &(node->nn_param.topk); - ctx = vxGetContext( (vx_reference)node->graph->g ); - /* Init parameters */ - #define _SET_PARAM( i, type, arg ) do{ \ - params[i] = (vx_reference)vxCreateScalar( ctx, type, &p->arg ); \ - status = vxGetStatus( params[i] ); \ - if( VSI_SUCCESS != status ) { \ - goto set_param_error; \ - } \ - } while(0) - _SET_PARAM( 0, VX_TYPE_UINT32, k ); - #undef _SET_PARAM -set_param_error: - - return status; -} /* _create_params */ - -static void 
_release_params - ( - vx_reference * params, - uint32_t num - ) -{ - uint32_t i; - vx_scalar scalar; - for( i = 0; i < num; i ++ ) - { - scalar = (vx_scalar)params[i]; - vxReleaseScalar( &scalar ); - } -} /* _release_params() */ - -static vsi_status cpu_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_reference * args; - - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( params, inputs, outputs ); - - /* Init parameters. */ - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. */ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); - - return status; -} - -static vsi_status vx_op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - vx_reference params[_PARAM_NUM]; - vx_reference * args; - - args = ¶ms[_IO_NUM]; - - if( NULL == self->n ) - { - return VSI_FAILURE; - } - - /* Set inputs and outputs */ - _set_inputs_outputs( params, inputs, outputs ); - /*TODO: Add code if need to change your parameter*/ - - /* Init parameters. */ - _create_params( self, args, _ARG_NUM ); - - /* Pass parameters to node. */ - status = vsi_nn_ClientNodePassParameters( self->n, params, _PARAM_NUM ); - - _release_params( args, _ARG_NUM ); - - return status; -} - -static vsi_nn_op_compute_t op_compute_list[] = -{ - cpu_op_compute, - vx_op_compute, - NULL -}; static vsi_status op_compute ( @@ -194,46 +46,20 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status; - vsi_nn_kernel_info_t kernel_info; - char *path = NULL; + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; - memset(&kernel_info, 0x0, sizeof(vsi_nn_kernel_info_t)); - status = VSI_FAILURE; - kernel_info.type = VX_KERNEL_TYPE_CPU; - kernel_info.kernel = vx_kernel_TOPK_list; - kernel_info.resource_num = 1; - kernel_info.resource_name = (char **)malloc(kernel_info.resource_num * sizeof(char *)); - kernel_info.resource_name[0] = "vsi_nn_kernel_topk"; - path = getenv("USER_VX_SOURCE_PATH"); - if(path) - vsi_nn_VxResourceSetPath(path); + param = vsi_nn_kernel_param_create(); + vsi_nn_kernel_param_add_int32( param, "top_k", self->nn_param.topk.k ); - if( kernel_info.type == VX_KERNEL_TYPE_VX) + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "topk", + inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param ); + + if( self->n ) { - kernel_info.kernel_index = 1; - kernel_info.init_index = 1; - } - else /*kernel_info.type = VX_KERNEL_TYPE_CPU;*/ - { - kernel_info.kernel_index = 0; - kernel_info.init_index = 0; + status = VSI_SUCCESS; } - self->n = vsi_nn_RegisterClientKernelAndNewNode( - self->graph, &kernel_info); - if (kernel_info.resource_name) - { - free(kernel_info.resource_name); - } - if( NULL == self->n ) - { - return VSI_FAILURE; - } - if (NULL != op_compute_list[kernel_info.init_index]) - { - status = op_compute_list[kernel_info.init_index](self, inputs, outputs); - } return status; } /* op_compute() */ @@ -269,6 +95,14 @@ static vsi_bool op_setup { outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; } + } + + if( VSI_NN_DIM_AUTO == outputs[1]->attr.dim_num ) + { + vsi_nn_topk_param * p; + + p = &(self->nn_param.topk); + outputs[1]->attr.dim_num = inputs[0]->attr.dim_num; outputs[1]->attr.size[0] = p->k; for (i = 1; i < 
inputs[0]->attr.dim_num; i++) @@ -276,6 +110,7 @@ static vsi_bool op_setup outputs[1]->attr.size[i] = inputs[0]->attr.size[i]; } } + return TRUE; } /* op_setup() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_unidirectional_sequence_rnn.c b/src/tim/vx/internal/src/ops/vsi_nn_op_unidirectional_sequence_rnn.c index febe9e3..311e433 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_unidirectional_sequence_rnn.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_unidirectional_sequence_rnn.c @@ -35,7 +35,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "vsi_nn_internal_node.h" #include "vsi_nn_rnn_helper.h" diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_unstack.c b/src/tim/vx/internal/src/ops/vsi_nn_op_unstack.c index 5717fe3..499cdd7 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_unstack.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_unstack.c @@ -34,7 +34,7 @@ #include "vsi_nn_tensor_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "vsi_nn_internal_node.h" #define _INPUT_NUM (1) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_upsample.c b/src/tim/vx/internal/src/ops/vsi_nn_op_upsample.c index 6dd771f..dbe5ff8 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_upsample.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_upsample.c @@ -35,7 +35,7 @@ #include "vsi_nn_prv.h" #include "vsi_nn_log.h" #include "ops/vsi_nn_op_upsample.h" -#include "client/vsi_nn_vxkernel.h" +#include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel_eltwise.h" #include "utils/vsi_nn_constraint_check.h" diff --git a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c index 16c1bff..8c07eb6 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c @@ -419,6 +419,15 @@ static _op_param_gen_t s_op_gen[] = /* INTERP */ NULL, /* RESIZE_1D */ NULL, /* UPSAMPLESCALE */ NULL, + /* GROUPNORM */ NULL, + /* ROUND */ NULL, + /* CEIL */ NULL, + /* SEQUENCE_MASK */ NULL, + /* REPEAT */ NULL, + /* ERF */ NULL, + /* ONE_HOT */ NULL, + /* NMS */ NULL, + /* GROUPED_CONV1D */ NULL, }; _compiler_assert( _cnt_of_array(s_op_gen) == VSI_NN_OP_NUM, vsi_nn_code_generator_c ); diff --git a/src/tim/vx/internal/src/utils/vsi_nn_constraint_check.c b/src/tim/vx/internal/src/utils/vsi_nn_constraint_check.c index 845a790..36060ea 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_constraint_check.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_constraint_check.c @@ -158,6 +158,8 @@ vsi_bool validate_op_io_types if(self && self->attr.enable_op_constraint_check) { uint32_t i = 0; + int32_t j = 0; + int32_t reg_tensor_num = op_constraint_reg->reg_input_num + op_constraint_reg->reg_output_num; node_io_signature_t* sig = _get_op_signature(inputs, inputs_num, outputs, outputs_num, op_constraint_reg); @@ -167,7 +169,22 @@ vsi_bool validate_op_io_types for(i = 0; i < op_constraint_reg->io_types_item_count; i++) { const uint8_t* curr = ((const uint8_t*)op_constraint_reg->types) \ + op_constraint_reg->io_types_item_size * i; - if(!memcmp(curr, sig->types, op_constraint_reg->io_types_item_size)) { + vsi_nn_type_e *curr_type = (vsi_nn_type_e *)curr; + + for (j = 0; j < reg_tensor_num; j++) + { + vsi_nn_type_e qnt_type = sig->types[j] >> Q_SHIFT; + vsi_nn_type_e data_type = sig->types[j] & ((1 << Q_SHIFT) - 1); + 
vsi_nn_type_e curr_qnt_type = curr_type[j] >> Q_SHIFT; + vsi_nn_type_e curr_data_type = curr_type[j] & ((1 << Q_SHIFT) - 1); + if ( (qnt_type != (vsi_nn_type_e)VSI_NN_QNT_TYPE_NONE && qnt_type != curr_qnt_type) || + data_type != curr_data_type ) + { + break; + } + } + if (j == reg_tensor_num) + { matched = TRUE; break; } diff --git a/src/tim/vx/internal/src/utils/vsi_nn_dtype.c b/src/tim/vx/internal/src/utils/vsi_nn_dtype.c index 75f686c..e80ef51 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_dtype.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_dtype.c @@ -399,6 +399,9 @@ vsi_bool vsi_nn_dtype_convert_quantize_asymm_to_float case U8: return vsi_nn_dtype_convert_quantize_asymm8_to_float( (const uint8_t *)buffer, size, scale, zero_point, out_buffer ); + case I32: + return vsi_nn_dtype_convert_quantize_symm32_to_float( + (const int *)buffer, size, scale, zero_point, out_buffer ); default: VSILOGE("Don't support convert asymm quant %d to float.", dtype); break; diff --git a/src/tim/vx/internal/src/vsi_nn_graph.c b/src/tim/vx/internal/src/vsi_nn_graph.c index a49d8f8..c94a1ca 100644 --- a/src/tim/vx/internal/src/vsi_nn_graph.c +++ b/src/tim/vx/internal/src/vsi_nn_graph.c @@ -41,7 +41,7 @@ #include "utils/vsi_nn_map.h" #include "vsi_nn_graph_optimization.h" -static vsi_status _set_reference_name +static vsi_status _set_reference_node_name ( vsi_nn_graph_t *graph, vsi_nn_node_t *node @@ -49,10 +49,7 @@ static vsi_status _set_reference_name { #define _NODE_ID_LEN 64 vsi_status status; - vsi_nn_tensor_t *tensor; - uint32_t i; char name[_NODE_ID_LEN]; - if(NULL == node || NULL == graph) { return VSI_FAILURE; @@ -66,6 +63,28 @@ static vsi_status _set_reference_name status = vxSetReferenceName((vx_reference)node->n, name); } TEST_CHECK_STATUS(status, final); + +final: + return status; +} /* _set_reference_node_name() */ + +static vsi_status _set_reference_tensor_name + ( + vsi_nn_graph_t *graph, + vsi_nn_node_t *node + ) +{ +#define _NODE_ID_LEN 64 + vsi_status status; + vsi_nn_tensor_t *tensor; + uint32_t i; + char name[_NODE_ID_LEN]; + if(NULL == node || NULL == graph) + { + return VSI_FAILURE; + } + + status = VSI_SUCCESS; for(i = 0; i < node->output.num; i++) { memset(name, 0, sizeof(char) * _NODE_ID_LEN); @@ -80,7 +99,7 @@ static vsi_status _set_reference_name final: return status; -} /* _set_reference_name() */ +} /* _set_reference_tensor_name() */ static vsi_status _check_swapped_tensors ( @@ -345,6 +364,12 @@ static vsi_status compute_node continue; vsi_nn_TensorReinit( graph, outputs[j] ); } + status = _set_reference_tensor_name(graph, node); + if( VSI_SUCCESS != status ) + { + VSILOGW("Set reference node[%d] %s output tensor name fail", + node_id, vsi_nn_OpGetName(node->op)); + } /* Create vx node */ VSILOGD("Instance node[%d] \"%s\" ...", node_id, vsi_nn_OpGetName(node->op)); @@ -354,7 +379,7 @@ static vsi_status compute_node VSILOGE( "Create node[%d] %s fail", node_id, vsi_nn_OpGetName(node->op)); break; } - status = _set_reference_name(graph, node); + status = _set_reference_node_name(graph, node); if( VSI_SUCCESS != status ) { VSILOGW("Set reference name fail"); @@ -465,6 +490,65 @@ final: return status; } /* setup_node() */ +static vsi_status set_graph_precision + ( + vsi_nn_graph_t * graph, + vsi_nn_node_id_t *node_list + ) +{ + uint32_t i, j; + vsi_status status; + vsi_nn_tensor_t **inputs; + vsi_nn_tensor_t **outputs; + vsi_nn_node_id_t node_id; + vsi_nn_node_t *node; + + status = VSI_SUCCESS; + inputs = allocate_io_buffer(graph); + outputs = allocate_io_buffer(graph); + if(NULL == 
inputs || NULL == outputs) + { + VSILOGE("allocate io buffer fail"); + status = VSI_FAILURE; + goto final; + } + + if(vsi_nn_IsGraphFastMode(graph)) + { + goto final; + } + for( i = 0; i < graph->node_num; i++ ) + { + node_id = node_list[i]; + memset( inputs, 0, graph->max_node_io * sizeof( vsi_nn_tensor_t * ) ); + memset( outputs, 0, graph->max_node_io * sizeof( vsi_nn_tensor_t * ) ); + /* Get inputs, outputs. */ + node = vsi_nn_GetNode( graph, node_id ); + vsi_nn_GetTensors( graph, node->input.tensors, + node->input.num, inputs ); + vsi_nn_GetTensors( graph, node->output.tensors, + node->output.num, outputs ); + + for(j = 0; j < node->input.num; j++) + { + if(inputs[j] != NULL && inputs[j]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32) + { + vsi_nn_SetTensorAttr(inputs[j], VSI_NN_TENSOR_ATTR_HIGH_PRECISION); + } + } + for(j = 0; j < node->output.num; j++) + { + if(outputs[j] != NULL && outputs[j]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32) + { + vsi_nn_SetTensorAttr(outputs[j], VSI_NN_TENSOR_ATTR_HIGH_PRECISION); + } + } + } +final: + free_io_buffer(inputs); + free_io_buffer(outputs); + return status; +} vsi_nn_graph_t * vsi_nn_CreateGraph ( vsi_nn_context_t ctx, @@ -507,6 +591,7 @@ vsi_nn_graph_t * vsi_nn_CreateGraph graph->rnn_wksp = NULL; graph->node_table = (vsi_nn_map_t *)malloc( sizeof( vsi_nn_map_t ) ); graph->tensor_table = (vsi_nn_map_t *)malloc( sizeof( vsi_nn_map_t ) ); + graph->isAllowFastMode = TRUE; vsi_nn_MapInit( graph->node_table ); vsi_nn_MapInit( graph->tensor_table ); } @@ -532,6 +617,18 @@ void vsi_nn_ReleaseGraph ptr = *graph; if( NULL != graph && NULL != * graph ) { + if( NULL != ptr->nodes ) + { + for( i = 0; i < ptr->node_num; i++ ) + { + vsi_nn_RemoveNode( *graph, (vsi_nn_node_id_t)i ); + } + free( (*graph)->node_table ); + } + if( NULL != ptr->g ) + { + vxReleaseGraph( &ptr->g ); + } if( NULL != ptr->tensors ) { for( i = 0; i < ptr->tensor_num; i++ ) @@ -545,14 +642,6 @@ void vsi_nn_ReleaseGraph { vsi_nn_ReleaseTensor( &ptr->complete_signal.tensor ); } - if( NULL != ptr->nodes ) - { - for( i = 0; i < ptr->node_num; i++ ) - { - vsi_nn_RemoveNode( *graph, (vsi_nn_node_id_t)i ); - } - free( (*graph)->node_table ); - } if( NULL != ptr->input.tensors ) { free( ptr->input.tensors ); @@ -565,10 +654,6 @@ void vsi_nn_ReleaseGraph { vsi_nn_rnn_DeinitWksp( ptr ); } - if( NULL != ptr->g ) - { - vxReleaseGraph( &ptr->g ); - } free( ptr ); *graph = NULL; } @@ -661,6 +746,12 @@ vsi_status vsi_nn_SetupGraph goto final; } + /* Set all of tensor attribute in graph to high precision */ + status = set_graph_precision(graph, nodes_list); + if(VSI_SUCCESS != status) + { + goto final; + } /* Try setup graph complete signal node. */ status = vsi_nn_TrySetupCompleteSignalNode( graph ); TEST_CHECK_STATUS( status, final ); @@ -1369,7 +1460,7 @@ void vsi_nn_DumpGraphNodeOutputsEx #define _SHAPE_BUF_SZ (64) char shape[_SHAPE_BUF_SZ] = { 0 }; char filename[_MAX_TENSOR_NAME_SZ] = { 0 }; - char filename_prefix[_SHAPE_BUF_SZ] = { 0 }; + char filename_prefix[_SHAPE_BUF_SZ + 1] = { 0 }; const char * op_name; uint32_t i; uint32_t o; @@ -1998,3 +2089,29 @@ vsi_status vsi_nn_SetGraphPriority #endif return status; } + +vsi_status vsi_nn_SetGraphFastMode + ( + vsi_nn_graph_t* graph, + vsi_bool fastmode + ) +{ + vsi_status status = VSI_SUCCESS; + if(graph) + { + graph->isAllowFastMode = fastmode; + } + else + { + status = VSI_FAILURE; + } + return status; +} + +vsi_bool vsi_nn_IsGraphFastMode + ( + const vsi_nn_graph_t* graph + ) +{ + return NULL == graph ? 
FALSE : graph->isAllowFastMode; +} diff --git a/src/tim/vx/internal/src/vsi_nn_graph_optimization.c b/src/tim/vx/internal/src/vsi_nn_graph_optimization.c index 1cde801..4cdbd82 100644 --- a/src/tim/vx/internal/src/vsi_nn_graph_optimization.c +++ b/src/tim/vx/internal/src/vsi_nn_graph_optimization.c @@ -807,10 +807,25 @@ vsi_status vsi_nn_OptimizeGraph vsi_bool *dirty ) { - vsi_status status = VSI_FAILURE; + vsi_status status = VSI_SUCCESS; + uint32_t i = 0; + vsi_bool nbg_flag = FALSE; + vsi_nn_node_t* node = NULL; + for(i = 0; i < graph->node_num; i++) + { + node = vsi_nn_GetNode(graph, i); + if(node->op == VSI_NN_OP_NBG) + { + nbg_flag = TRUE; + break; + } + } - status = _graph_optimization_convert_int8_to_uint8(graph, dirty); - TEST_CHECK_STATUS(status, final); + if (!nbg_flag) + { + status = _graph_optimization_convert_int8_to_uint8(graph, dirty); + TEST_CHECK_STATUS(status, final); + } final: return status; diff --git a/src/tim/vx/internal/src/vsi_nn_internal_node.c b/src/tim/vx/internal/src/vsi_nn_internal_node.c index 2ad5bc0..9c4485e 100644 --- a/src/tim/vx/internal/src/vsi_nn_internal_node.c +++ b/src/tim/vx/internal/src/vsi_nn_internal_node.c @@ -429,7 +429,8 @@ void vsi_nn_internal_init_tensor_attr if( dtype->qnt_type == VSI_NN_QNT_TYPE_NONE && ( dtype->vx_type != VSI_NN_TYPE_FLOAT16 && - dtype->vx_type != VSI_NN_TYPE_FLOAT32 ) ) + dtype->vx_type != VSI_NN_TYPE_FLOAT32 && + dtype->vx_type != VSI_NN_TYPE_BFLOAT16 ) ) { attr->dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; attr->dtype.vx_type = VSI_NN_TYPE_FLOAT16; diff --git a/src/tim/vx/internal/src/vsi_nn_node.c b/src/tim/vx/internal/src/vsi_nn_node.c index 0c05bb2..98f05a7 100644 --- a/src/tim/vx/internal/src/vsi_nn_node.c +++ b/src/tim/vx/internal/src/vsi_nn_node.c @@ -166,7 +166,7 @@ void vsi_nn_PrintNode vsi_nn_node_id_t id ) { -#define _MAX_PRINT_BUF_SZ (256) +#define _MAX_PRINT_BUF_SZ (1024) uint32_t i; int count; char buf[_MAX_PRINT_BUF_SZ]; diff --git a/src/tim/vx/internal/src/vsi_nn_node_attr_template.c b/src/tim/vx/internal/src/vsi_nn_node_attr_template.c index 5c5ffd6..6743605 100644 --- a/src/tim/vx/internal/src/vsi_nn_node_attr_template.c +++ b/src/tim/vx/internal/src/vsi_nn_node_attr_template.c @@ -186,6 +186,9 @@ static _node_template s_template[] = /* PRE_PROCESS_NV12 */ NULL, /* SCATTER_ND */ NULL, /* DECONVOLUTION1D */ NULL, + /* GROUPNORM */ NULL, + /* SEQUENCE_MASK */ NULL, + /* REPEAT */ NULL, }; //_compiler_assert( _cnt_of_array(s_template) == VSI_NN_OP_NUM, vsi_nn_node_attr_template_c ); diff --git a/src/tim/vx/internal/src/vsi_nn_pre_post_process.c b/src/tim/vx/internal/src/vsi_nn_pre_post_process.c index c7c49a6..a97bd7f 100644 --- a/src/tim/vx/internal/src/vsi_nn_pre_post_process.c +++ b/src/tim/vx/internal/src/vsi_nn_pre_post_process.c @@ -508,6 +508,7 @@ vsi_status vsi_nn_add_single_postproc_node ) { vsi_nn_node_t* node; + vsi_nn_node_t** consume_nodes = NULL; vsi_nn_process_permute_t* permute = NULL; vsi_nn_tensor_t* org_norm_tensor = NULL; vsi_nn_tensor_attr_t input_attr; @@ -515,8 +516,10 @@ vsi_status vsi_nn_add_single_postproc_node vsi_nn_tensor_id_t postproc_input; vsi_nn_tensor_id_t postproc_output; vsi_nn_postprocess_dtype_convert_t* dtype_convert = NULL; - int32_t i = 0; + uint32_t i = 0; + uint32_t j = 0; int32_t idx = 0; + uint32_t nodes_count = 0; vsi_status status = VSI_SUCCESS; org_norm_tensor = vsi_nn_GetTensor(graph, graph->output.tensors[output_idx]); @@ -561,10 +564,29 @@ vsi_status vsi_nn_add_single_postproc_node postproc_input = vsi_nn_AddTensor(graph, VSI_NN_TENSOR_ID_AUTO, &input_attr, 
NULL); postproc_output = vsi_nn_AddTensor(graph, VSI_NN_TENSOR_ID_AUTO, &output_attr, NULL); + /* Get origin norm tensor comsume nodes and connect its' comsume nodes */ + vsi_nn_get_tensor_consumers(graph, graph->output.tensors[output_idx], NULL, &nodes_count); + if(nodes_count != 0) + { + consume_nodes = (vsi_nn_node_t**)malloc(sizeof(vsi_nn_node_t*)*nodes_count); + vsi_nn_get_tensor_consumers(graph, graph->output.tensors[output_idx], consume_nodes, NULL); + for(i = 0; i < nodes_count; i++) + { + for(j = 0; j < consume_nodes[i]->input.num; j++) + { + if(consume_nodes[i]->input.tensors[j] == graph->output.tensors[output_idx]) + { + consume_nodes[i]->input.tensors[j] = postproc_input; + break; + } + } + } + } + /* Reconnect node tensors */ node->input.tensors[0] = postproc_input; node->output.tensors[0] = postproc_output; - for(i = 0; i < (int32_t)last_node->output.num; i++) + for(i = 0; i < last_node->output.num; i++) { if(last_node->output.tensors[i] == graph->output.tensors[output_idx]) { @@ -574,7 +596,13 @@ vsi_status vsi_nn_add_single_postproc_node } graph->output.tensors[output_idx] = postproc_output; + final: + if(consume_nodes) + { + free(consume_nodes); + consume_nodes = NULL; + } return status; } /* vsi_nn_add_single_postproc_node() */ diff --git a/src/tim/vx/internal/src/vsi_nn_tensor.c b/src/tim/vx/internal/src/vsi_nn_tensor.c index 0af8be5..a7bd3c7 100644 --- a/src/tim/vx/internal/src/vsi_nn_tensor.c +++ b/src/tim/vx/internal/src/vsi_nn_tensor.c @@ -575,20 +575,25 @@ vsi_nn_tensor_t * vsi_nn_CreateTensorWithDefault data = (uint8_t *)malloc( size ); if( data ) { - uint32_t i = 0; + uint32_t i = 0, j = 0; uint32_t elements = size / stride[0]; - vsi_status status = VSI_SUCCESS; + vsi_status status = VSI_FAILURE; - for( i = 0; i < elements; i ++ ) + status = vsi_nn_Float32ToDtype( defualt_value, &data[0], &t->attr.dtype ); + if(stride[0] == 1) { - status = vsi_nn_Float32ToDtype( defualt_value, &data[stride[0] * i], &t->attr.dtype ); - if( VSI_FAILURE == status ) + memset(data, data[0], size); + } + else + { + for( i = 1; i < elements; i ++ ) { - VSILOGE("Convert default_value to dtype fail"); - break; + for(j=0;jattr.dtype ); - for( i = 0; i < elements; i ++ ) + if(stride[0] == 1) { - status = vsi_nn_Float32ToDtype( value, &data[stride[0] * i], &tensor->attr.dtype ); - if( VSI_FAILURE == status ) + memset(data, data[0], size); + } + else + { + for( i = 1; i < elements; i ++ ) { - VSILOGE("Convert value to dtype fail"); - break; + for(j=0;jattr.dtype.vx_type ); for( i = 0; i < tensor->attr.dim_num; i ++ ) {
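/*
 * Editor's note (illustrative sketch, not part of the patch): the new
 * SEQUENCE_MASK op registered above computes out[..., k] = (k < length[...]),
 * which is why its op_setup() places max_len on the innermost axis (size[0])
 * and shifts the input dimensions up by one. A freestanding reference
 * version, assuming int32 lengths and a uint8 mask output (names here are
 * hypothetical, not ovxlib API):
 */
#include <stdint.h>
#include <stdio.h>

static void sequence_mask_ref(const int32_t *lengths, size_t count,
                              int32_t max_len, uint8_t *out /* count * max_len */)
{
    size_t i;
    int32_t k;
    for (i = 0; i < count; i++)
    {
        for (k = 0; k < max_len; k++)
        {
            /* innermost (fastest-varying) axis has size max_len */
            out[i * max_len + k] = (k < lengths[i]) ? 1 : 0;
        }
    }
}

int main(void)
{
    int32_t lengths[3] = {1, 3, 2};
    uint8_t mask[3 * 4];
    size_t i;
    sequence_mask_ref(lengths, 3, 4, mask);
    for (i = 0; i < sizeof(mask); i++)
    {
        printf("%d%c", mask[i], ((i + 1) % 4) ? ' ' : '\n');
    }
    /* prints: 1 0 0 0 / 1 1 1 0 / 1 1 0 0 */
    return 0;
}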
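/*
 * Editor's note (illustrative sketch, not part of the patch): the
 * _build_strided_slice_params()/op_setup() changes above expand the slice
 * spec so that new_axis_mask inserts size-1 output dims while
 * shrink_axis_mask drops dims. A simplified, shape-only model of that
 * bookkeeping; it assumes positive strides and already-clamped begin/end,
 * and omits the begin_mask/end_mask handling the real code performs:
 */
#include <stdio.h>

static int strided_slice_out_shape(const int *begin, const int *end,
                                   const int *stride, int in_dims,
                                   int new_axis_mask, int shrink_axis_mask,
                                   int *out_shape)
{
    int out_dims = 0;
    int spec = 0;   /* index into begin/end/stride (one entry per input dim) */
    int i;
    int total = in_dims;

    for (i = 0; i < in_dims; i++)
    {
        if (new_axis_mask & (1 << i)) total++;   /* one extra output dim each */
    }

    for (i = 0; i < total; i++)
    {
        if (new_axis_mask & (1 << i))
        {
            out_shape[out_dims++] = 1;           /* inserted axis */
        }
        else if (shrink_axis_mask & (1 << spec))
        {
            spec++;                              /* axis removed from output */
        }
        else
        {
            /* ceil((end - begin) / stride) for a positive stride */
            out_shape[out_dims++] =
                (end[spec] - begin[spec] + stride[spec] - 1) / stride[spec];
            spec++;
        }
    }
    return out_dims;
}

int main(void)
{
    /* slice a 4x5 input with stride 2 on dim 0 and a new leading axis */
    int begin[]  = {0, 0};
    int end[]    = {4, 5};
    int stride[] = {2, 1};
    int out_shape[8];
    int i;
    int n = strided_slice_out_shape(begin, end, stride, 2,
                                    0x1 /* new_axis_mask */, 0, out_shape);
    for (i = 0; i < n; i++) printf("%d ", out_shape[i]);   /* 1 2 5 */
    printf("\n");
    return 0;
}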
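/*
 * Editor's note (illustrative sketch, not part of the patch): the
 * validate_op_io_types() change above replaces the byte-exact memcmp of the
 * registered IO signature with a per-slot comparison: data types must match
 * exactly, and the quantization type is only enforced when the actual tensor
 * carries one. A minimal model with hypothetical encodings (the real Q_SHIFT
 * value and the type enums live in the ovxlib headers):
 */
#include <stdbool.h>
#include <stdio.h>

#define Q_SHIFT   8                       /* hypothetical packing layout */
#define QNT_NONE  0
#define PACK(q, d) (((unsigned)(q) << Q_SHIFT) | (unsigned)(d))

static bool slot_matches(unsigned actual, unsigned registered)
{
    unsigned a_q = actual >> Q_SHIFT;
    unsigned a_d = actual & ((1u << Q_SHIFT) - 1);
    unsigned r_q = registered >> Q_SHIFT;
    unsigned r_d = registered & ((1u << Q_SHIFT) - 1);

    if (a_d != r_d) return false;                    /* data type must agree  */
    if (a_q != QNT_NONE && a_q != r_q) return false; /* quant checked if set  */
    return true;
}

int main(void)
{
    enum { D_U8 = 1, D_I32 = 2, QNT_ASYM = 3 };      /* hypothetical values */
    /* an unquantized U8 tensor now satisfies a D_U8|Q_ASYM table entry */
    printf("%d\n", slot_matches(PACK(QNT_NONE, D_U8), PACK(QNT_ASYM, D_U8)));
    /* but a differing data type still fails */
    printf("%d\n", slot_matches(PACK(QNT_NONE, D_I32), PACK(QNT_ASYM, D_U8)));
    return 0;
}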
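/*
 * Editor's note (illustrative sketch, not part of the patch): the tensor
 * fill changes above convert the float default value to the tensor dtype
 * once and then replicate the encoded bytes, taking a memset() fast path for
 * 1-byte elements. The byte replication in isolation (the real code encodes
 * each element via vsi_nn_Float32ToDtype first):
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void fill_with_element(uint8_t *buf, size_t total_bytes,
                              const uint8_t *elem, size_t elem_bytes)
{
    size_t i;
    if (elem_bytes == 1)
    {
        memset(buf, elem[0], total_bytes);    /* fast path for 8-bit dtypes */
        return;
    }
    for (i = 0; i + elem_bytes <= total_bytes; i += elem_bytes)
    {
        memcpy(buf + i, elem, elem_bytes);    /* copy the encoded element   */
    }
}

int main(void)
{
    uint8_t buf[4 * sizeof(float)];
    float one = 1.0f;
    float v;
    size_t i;
    fill_with_element(buf, sizeof(buf), (const uint8_t *)&one, sizeof(one));
    for (i = 0; i < 4; i++)
    {
        memcpy(&v, buf + i * sizeof(float), sizeof(v));
        printf("%.1f ", v);                   /* 1.0 1.0 1.0 1.0 */
    }
    printf("\n");
    return 0;
}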