From 2d9e614a0625ae041a16abf9024507982c3f78cd Mon Sep 17 00:00:00 2001 From: Chen Feiyue <69809761+chenfeiyue-cfy@users.noreply.github.com> Date: Wed, 3 Jan 2024 13:13:15 +0800 Subject: [PATCH] Update internal ovxlib to rel/1.2.2 (#674) Update to SHA:806fcd6a69d333e62508acf0a6aa2c38c8385eae Type: Code Improvement Signed-off-by: Feiyue Chen --- src/tim/vx/internal/.gitignore | 3 + src/tim/vx/internal/include/interface/ops.def | 2 + .../include/kernel/vsi_nn_kernel_lut.h | 1 + .../vsi_nn_op_bidirectional_sequence_lstm.h | 11 + .../include/ops/vsi_nn_op_crop_and_resize.h | 47 + .../include/ops/vsi_nn_op_lstm_ovxlib.h | 5 + .../include/ops/vsi_nn_op_lstmunit_ovxlib.h | 5 + .../internal/include/ops/vsi_nn_op_resize.h | 3 +- .../include/ops/vsi_nn_op_scatter_nd_update.h | 1 + .../vx/internal/include/utils/vsi_nn_util.h | 6 + .../vx/internal/include/vip/virtual_device.h | 1 + src/tim/vx/internal/include/vsi_nn_context.h | 2 + .../internal/include/vsi_nn_feature_config.h | 33 + src/tim/vx/internal/include/vsi_nn_graph.h | 37 + src/tim/vx/internal/include/vsi_nn_node.h | 16 + .../vx/internal/include/vsi_nn_node_type.h | 2 + src/tim/vx/internal/include/vsi_nn_platform.h | 3 + .../include/vsi_nn_pre_post_process.h | 2 + src/tim/vx/internal/include/vsi_nn_pub.h | 5 + .../vx/internal/include/vsi_nn_tensor_util.h | 76 + src/tim/vx/internal/include/vsi_nn_types.h | 14 +- src/tim/vx/internal/include/vsi_nn_version.h | 4 +- src/tim/vx/internal/src/Android.mk | 20 +- .../internal/src/kernel/cl/comparisons_cl.c | 12 + .../src/kernel/cl/crop_and_resize_cl.c | 359 + .../src/kernel/cl/depth2space_internal_cl.c | 4 +- .../src/kernel/cl/detect_post_box_cl.c | 300 - .../src/kernel/cl/detect_post_nms_cl.c | 197 - .../internal/src/kernel/cl/eltwise_unary_cl.c | 11 + src/tim/vx/internal/src/kernel/cl/gather_cl.c | 3 +- .../src/kernel/cl/grucell_activation_z_h_cl.c | 12 + .../cl/grucell_reset_after_activation_cl.c | 4 + .../src/kernel/cl/layer_normalization_cl.c | 3 +- .../internal/src/kernel/cl/log_softmax_cl.c | 198 +- .../vx/internal/src/kernel/cl/matrixmul_cl.c | 73 +- .../vx/internal/src/kernel/cl/maximum_cl.c | 67 +- .../vx/internal/src/kernel/cl/minimum_cl.c | 66 +- src/tim/vx/internal/src/kernel/cl/pow_cl.c | 3 +- .../internal/src/kernel/cl/resize_cubic_cl.c | 320 + .../cl/scatter_nd_update_reduction_cl.c | 727 +++ src/tim/vx/internal/src/kernel/cl/select_cl.c | 2 + src/tim/vx/internal/src/kernel/cl/tile_cl.c | 2 + src/tim/vx/internal/src/kernel/cl/topk_cl.c | 2 +- .../src/kernel/evis/add_mean_std_norm_evis.c | 98 +- .../src/kernel/evis/batchnorm_single_evis.c | 38 +- .../kernel/evis/bilinear_grid_sample_evis.c | 83 +- .../vx/internal/src/kernel/evis/clip_evis.c | 1 - .../src/kernel/evis/comparisons_evis.c | 41 +- .../src/kernel/evis/conv1d_ovxlib_evis.c | 23 +- .../src/kernel/evis/crop_and_resize_evis.c | 540 ++ .../vx/internal/src/kernel/evis/cumsum_evis.c | 36 +- .../kernel/evis/depth2space_internal_evis.c | 53 +- .../src/kernel/evis/depthwise_conv1d_evis.c | 10 +- .../src/kernel/evis/detect_post_box_evis.c | 15 +- .../src/kernel/evis/eltwise_unary_evis.c | 57 +- .../vx/internal/src/kernel/evis/erf_evis.c | 39 +- .../internal/src/kernel/evis/floordiv_evis.c | 62 +- .../vx/internal/src/kernel/evis/gather_evis.c | 101 +- .../internal/src/kernel/evis/gather_nd_evis.c | 37 +- .../src/kernel/evis/grucell_activation_evis.c | 74 +- .../kernel/evis/grucell_activation_z_h_evis.c | 37 +- .../evis/grucell_h_times_activation_r_evis.c | 15 +- .../grucell_reset_after_activation_evis.c | 36 +- 
.../src/kernel/evis/l2normalizescale_evis.c | 40 +- .../kernel/evis/layer_normalization_evis.c | 64 +- .../src/kernel/evis/log_softmax_evis.c | 610 +- .../kernel/evis/lstmunit_activation_evis.c | 43 +- .../internal/src/kernel/evis/matrixmul_evis.c | 133 +- .../internal/src/kernel/evis/maximum_evis.c | 107 +- .../internal/src/kernel/evis/minimum_evis.c | 107 +- .../vx/internal/src/kernel/evis/mod_evis.c | 62 +- .../internal/src/kernel/evis/moments_evis.c | 76 +- .../internal/src/kernel/evis/one_hot_evis.c | 7 +- .../src/kernel/evis/poolwithargmax_evis.c | 30 +- .../vx/internal/src/kernel/evis/pow_evis.c | 67 +- .../src/kernel/evis/pre_process_bgra_evis.c | 24 +- .../src/kernel/evis/pre_process_gray_evis.c | 54 +- .../kernel/evis/pre_process_nv12_rggb_evis.c | 884 +++ .../evis/pre_process_rgb888_planar_evis.c | 23 +- .../pre_process_rgb888_planar_nhwc_evis.c | 22 +- .../src/kernel/evis/pre_process_rgb_evis.c | 24 +- .../src/kernel/evis/pre_process_yuv444_evis.c | 45 +- .../vx/internal/src/kernel/evis/prelu_evis.c | 40 +- .../src/kernel/evis/reducemax_internal_evis.c | 72 +- .../src/kernel/evis/reducemin_internal_evis.c | 71 +- .../kernel/evis/reduceprod_internal_evis.c | 72 +- .../src/kernel/evis/relu_keras_evis.c | 35 +- .../src/kernel/evis/resize_1d_bilinear_evis.c | 55 +- .../src/kernel/evis/resize_1d_nearest_evis.c | 47 +- .../src/kernel/evis/resize_bilinear_evis.c | 203 +- .../src/kernel/evis/resize_cubic_evis.c | 453 ++ .../src/kernel/evis/resize_nearest_evis.c | 47 +- .../src/kernel/evis/scatter_nd_evis.c | 10 +- .../src/kernel/evis/scatter_nd_update_evis.c | 8 +- .../evis/scatter_nd_update_reduction_evis.c | 861 +++ .../vx/internal/src/kernel/evis/select_evis.c | 62 +- .../src/kernel/evis/sequence_mask_evis.c | 40 +- .../vx/internal/src/kernel/evis/slice_evis.c | 41 +- .../kernel/evis/spatial_transformer_evis.c | 59 +- .../vx/internal/src/kernel/evis/swish_evis.c | 82 +- .../vx/internal/src/kernel/evis/tile_evis.c | 42 +- .../internal/src/kernel/evis/upsample_evis.c | 30 +- .../src/kernel/evis/upsamplescale_evis.c | 34 +- .../kernel/vsi_nn_kernel_gpu_shape_optimize.c | 17 +- .../internal/src/kernel/vsi_nn_kernel_lut.c | 8 + .../src/kernel/vsi_nn_kernel_selector.c | 16 +- .../internal/src/kernel/vsi_nn_kernel_util.c | 65 +- .../internal/src/kernel/vx/eltwise_unary_vx.c | 115 +- .../vx/internal/src/kernel/vx/layer_norm_vx.c | 87 + .../internal/src/kernel/vx/log_softmax_vx.c | 85 + .../ops/cl/crop_and_resize_bilinear.cl | 107 + .../cl/crop_and_resize_nearest_neighbor.cl | 77 + .../src/libnnext/ops/cl/detect_post_box.cl | 101 - .../src/libnnext/ops/cl/eltwise_unary_0.cl | 8 + .../src/libnnext/ops/cl/eltwise_unary_1.cl | 8 + .../libnnext/ops/cl/grucell_activation_z_h.cl | 27 +- .../ops/cl/grucell_reset_after_activation.cl | 11 +- .../ops/cl/log_softmax_exceed_axis0.cl | 167 + .../ops/cl/log_softmax_exceed_axis1.cl | 172 + .../src/libnnext/ops/cl/matrixmul_4x.cl | 128 + .../src/libnnext/ops/cl/resize_cubic.cl | 195 + .../ops/cl/scatter_nd_update_reduction.cl | 203 + .../cl/scatter_nd_update_reduction_conv.cl | 72 + .../vx/internal/src/libnnext/ops/cl/swish.cl | 2 +- .../ops/vx/crop_and_resize_bilinear.vx | 255 + .../vx/crop_and_resize_nearest_neighbor.vx | 292 + .../src/libnnext/ops/vx/eltwise_unary_2d_1.vx | 8 +- .../src/libnnext/ops/vx/eltwise_unary_3d_1.vx | 7 + .../vx/internal/src/libnnext/ops/vx/gather.vx | 24 +- .../src/libnnext/ops/vx/gather_array.vx | 119 +- .../src/libnnext/ops/vx/gather_batch.vx | 24 +- .../src/libnnext/ops/vx/gather_mix.vx | 18 +- 
.../src/libnnext/ops/vx/gather_mix_batch.vx | 18 +- .../libnnext/ops/vx/grucell_activation_z_h.vx | 15 +- .../ops/vx/grucell_reset_after_activation.vx | 12 + .../ops/vx/layer_normalization_axis01_0.vx | 315 + .../ops/vx/layer_normalization_axis01_1.vx | 317 + .../ops/vx/layer_normalization_axis01_2.vx | 348 + .../ops/vx/layer_normalization_axis01_3.vx | 178 + .../ops/vx/layer_normalization_axis01_sum.vx | 228 + .../ops/vx/log_softmax_exceed_axis0.vx | 190 + .../ops/vx/log_softmax_exceed_axis0_BF16.vx | 187 + .../ops/vx/log_softmax_exceed_axis1.vx | 172 + .../ops/vx/log_softmax_exceed_axis1_BF16.vx | 180 + .../ops/vx/pre_process_nv12_rggb_copy.vx | 111 + .../ops/vx/pre_process_nv12_rggb_scale.vx | 247 + .../libnnext/ops/vx/resize_bilinear_F16.vx | 145 +- .../libnnext/ops/vx/resize_bilinear_I16.vx | 72 +- .../src/libnnext/ops/vx/resize_bilinear_I8.vx | 60 +- .../libnnext/ops/vx/resize_bilinear_U16.vx | 278 + .../src/libnnext/ops/vx/resize_bilinear_U8.vx | 8 +- .../src/libnnext/ops/vx/resize_cubic.vx | 270 + .../ops/vx/scatter_nd_update_reduction.vx | 259 + .../vx/scatter_nd_update_reduction_conv.vx | 110 + .../libnnext/ops/vx/vsi_nn_kernel_header.vx | 52 +- .../src/libnnext/vsi_nn_libnnext_resource.c | 5761 ++++++++++++++++- .../vsi_nn_op_axis_aligned_bbox_transform.c | 62 +- .../vsi_nn_op_bidirectional_sequence_lstm.c | 29 + .../vx/internal/src/ops/vsi_nn_op_concat.c | 12 + .../src/ops/vsi_nn_op_conv2d_lstm_cell.c | 49 +- .../vx/internal/src/ops/vsi_nn_op_conv_relu.c | 298 +- .../src/ops/vsi_nn_op_conv_relu_pool.c | 254 +- .../src/ops/vsi_nn_op_crop_and_resize.c | 193 + .../src/ops/vsi_nn_op_depth2space_internal.c | 18 +- .../src/ops/vsi_nn_op_detection_postprocess.c | 141 +- .../vx/internal/src/ops/vsi_nn_op_eltwise.c | 7 +- .../src/ops/vsi_nn_op_eltwise_unary.c | 1 + .../src/ops/vsi_nn_op_fullconnect_relu.c | 313 +- .../vx/internal/src/ops/vsi_nn_op_gather.c | 7 +- .../src/ops/vsi_nn_op_generate_proposals.c | 88 +- .../src/ops/vsi_nn_op_grouped_conv1d.c | 22 +- .../src/ops/vsi_nn_op_grouped_conv2d.c | 10 + .../src/ops/vsi_nn_op_groupnormalize.c | 70 + .../src/ops/vsi_nn_op_heatmap_max_keypoint.c | 71 +- .../internal/src/ops/vsi_nn_op_imageprocess.c | 86 +- .../src/ops/vsi_nn_op_instancenormalize.c | 12 +- .../src/ops/vsi_nn_op_layernormalize.c | 13 + .../internal/src/ops/vsi_nn_op_log_softmax.c | 35 +- .../internal/src/ops/vsi_nn_op_lstm_ovxlib.c | 27 + .../src/ops/vsi_nn_op_lstmunit_ovxlib.c | 21 +- .../vx/internal/src/ops/vsi_nn_op_permute.c | 11 + .../internal/src/ops/vsi_nn_op_pre_process.c | 20 +- .../src/ops/vsi_nn_op_pre_process_nv12.c | 10 +- .../src/ops/vsi_nn_op_quantized_16bit_lstm.c | 65 +- .../vx/internal/src/ops/vsi_nn_op_reshape.c | 7 +- .../vx/internal/src/ops/vsi_nn_op_reshape2.c | 7 +- .../vx/internal/src/ops/vsi_nn_op_resize.c | 4 + .../src/ops/vsi_nn_op_scatter_nd_update.c | 27 +- .../src/ops/vsi_nn_op_strided_slice.c | 7 +- src/tim/vx/internal/src/ops/vsi_nn_op_topk.c | 80 +- .../quantization/vsi_nn_asymmetric_affine.c | 3 + .../src/utils/vsi_nn_code_generator.c | 2 + src/tim/vx/internal/src/utils/vsi_nn_util.c | 27 + .../vx/internal/src/vip/virtual_device.cpp | 4 + src/tim/vx/internal/src/vsi_nn_context.c | 99 +- src/tim/vx/internal/src/vsi_nn_graph.c | 744 ++- src/tim/vx/internal/src/vsi_nn_log.c | 30 +- src/tim/vx/internal/src/vsi_nn_node.c | 24 +- .../internal/src/vsi_nn_node_attr_template.c | 1 + .../vx/internal/src/vsi_nn_pre_post_process.c | 16 +- src/tim/vx/internal/src/vsi_nn_tensor.c | 179 +- .../vx/internal/src/vsi_nn_tensor_util_prv.h | 20 + 
src/tim/vx/internal/src/vsi_nn_types_prv.h | 12 + 203 files changed, 18939 insertions(+), 5096 deletions(-) create mode 100644 src/tim/vx/internal/include/ops/vsi_nn_op_crop_and_resize.h create mode 100644 src/tim/vx/internal/src/kernel/cl/crop_and_resize_cl.c delete mode 100644 src/tim/vx/internal/src/kernel/cl/detect_post_box_cl.c delete mode 100644 src/tim/vx/internal/src/kernel/cl/detect_post_nms_cl.c create mode 100644 src/tim/vx/internal/src/kernel/cl/resize_cubic_cl.c create mode 100644 src/tim/vx/internal/src/kernel/cl/scatter_nd_update_reduction_cl.c create mode 100644 src/tim/vx/internal/src/kernel/evis/crop_and_resize_evis.c create mode 100644 src/tim/vx/internal/src/kernel/evis/pre_process_nv12_rggb_evis.c create mode 100644 src/tim/vx/internal/src/kernel/evis/resize_cubic_evis.c create mode 100644 src/tim/vx/internal/src/kernel/evis/scatter_nd_update_reduction_evis.c create mode 100644 src/tim/vx/internal/src/kernel/vx/layer_norm_vx.c create mode 100644 src/tim/vx/internal/src/kernel/vx/log_softmax_vx.c create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/crop_and_resize_bilinear.cl create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/crop_and_resize_nearest_neighbor.cl delete mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/detect_post_box.cl create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/log_softmax_exceed_axis0.cl create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/log_softmax_exceed_axis1.cl create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/resize_cubic.cl create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/scatter_nd_update_reduction.cl create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/scatter_nd_update_reduction_conv.cl create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/crop_and_resize_bilinear.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/crop_and_resize_nearest_neighbor.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_axis01_0.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_axis01_1.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_axis01_2.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_axis01_3.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_axis01_sum.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/log_softmax_exceed_axis0.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/log_softmax_exceed_axis0_BF16.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/log_softmax_exceed_axis1.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/log_softmax_exceed_axis1_BF16.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_rggb_copy.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_rggb_scale.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U16.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/resize_cubic.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_reduction.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_reduction_conv.vx create mode 100644 src/tim/vx/internal/src/ops/vsi_nn_op_crop_and_resize.c diff --git a/src/tim/vx/internal/.gitignore b/src/tim/vx/internal/.gitignore index 6858186..665a34f 100644 --- a/src/tim/vx/internal/.gitignore +++ b/src/tim/vx/internal/.gitignore @@ -3,6 +3,9 @@ ## ## Get latest from 
https://github.com/github/gitignore/blob/master/VisualStudio.gitignore +# Some header file +include/vsi_nn_feature_config.h + # User-specific files *.suo *.user diff --git a/src/tim/vx/internal/include/interface/ops.def b/src/tim/vx/internal/include/interface/ops.def index 0a1424e..6c879e9 100755 --- a/src/tim/vx/internal/include/interface/ops.def +++ b/src/tim/vx/internal/include/interface/ops.def @@ -195,3 +195,5 @@ DEF_OP(GRID_SAMPLE) DEF_OP(LPNORM) DEF_OP(RESIZE_3D) DEF_OP(REDUCEL2) +DEF_OP(CROP_AND_RESIZE) +DEF_OP(TAN) diff --git a/src/tim/vx/internal/include/kernel/vsi_nn_kernel_lut.h b/src/tim/vx/internal/include/kernel/vsi_nn_kernel_lut.h index 8b8c055..3143b41 100644 --- a/src/tim/vx/internal/include/kernel/vsi_nn_kernel_lut.h +++ b/src/tim/vx/internal/include/kernel/vsi_nn_kernel_lut.h @@ -55,6 +55,7 @@ typedef int32_t vsi_nn_kernel_lut_act_e; enum VSI_NN_KERNEL_LUT_ATANH = 21, VSI_NN_KERNEL_LUT_ACOSH = 22, VSI_NN_KERNEL_LUT_INVERSE_SIGMOID = 23, + VSI_NN_KERNEL_LUT_TAN = 24, }; diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_bidirectional_sequence_lstm.h b/src/tim/vx/internal/include/ops/vsi_nn_op_bidirectional_sequence_lstm.h index 8a4e7cb..22bed7e 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_bidirectional_sequence_lstm.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_bidirectional_sequence_lstm.h @@ -106,10 +106,21 @@ enum BI_LSTM_BW_INPUT_LAYERNORM_C = 54, BI_LSTM_BW_INPUT_LAYERNORM_O = 55, + BI_LSTM_FW_INPUT_BIAS_R2I = 56, + BI_LSTM_FW_INPUT_BIAS_R2F = 57, + BI_LSTM_FW_INPUT_BIAS_R2C = 58, + BI_LSTM_FW_INPUT_BIAS_R2O = 59, + + BI_LSTM_BW_INPUT_BIAS_R2I = 60, + BI_LSTM_BW_INPUT_BIAS_R2F = 61, + BI_LSTM_BW_INPUT_BIAS_R2C = 62, + BI_LSTM_BW_INPUT_BIAS_R2O = 63, + BI_LSTM_INPUT_CNT, BI_LSTM_FW_OUTPUT_OUTPUT = 0, BI_LSTM_BW_OUTPUT_OUTPUT = 1, + BI_LSTM_OUTPUT_CNT }; diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_crop_and_resize.h b/src/tim/vx/internal/include/ops/vsi_nn_op_crop_and_resize.h new file mode 100644 index 0000000..aa12459 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_crop_and_resize.h @@ -0,0 +1,47 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_CROP_AND_RESIZE_H +#define _VSI_NN_OP_CROP_AND_RESIZE_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_crop_and_resize_param +{ + struct _crop_and_resize_local_data_t * lcl_data; + const int32_t* crop_size; + vsi_enum resize_method; + float extrapolation_value; +} vsi_nn_crop_and_resize_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_lstm_ovxlib.h b/src/tim/vx/internal/include/ops/vsi_nn_op_lstm_ovxlib.h index 29c8cd1..19c45a1 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_lstm_ovxlib.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_lstm_ovxlib.h @@ -70,6 +70,11 @@ enum LSTM_INPUT_AUX_WEIGHT_I2C = 27, LSTM_INPUT_AUX_WEIGHT_I2O = 28, + LSTM_INPUT_BIAS_R2I = 29, + LSTM_INPUT_BIAS_R2F = 30, + LSTM_INPUT_BIAS_R2C = 31, + LSTM_INPUT_BIAS_R2O = 32, + LSTM_INPUT_CNT, LSTM_OUTPUT_OUTPUT = 0, diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_lstmunit_ovxlib.h b/src/tim/vx/internal/include/ops/vsi_nn_op_lstmunit_ovxlib.h index cc53d4c..bc23d65 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_lstmunit_ovxlib.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_lstmunit_ovxlib.h @@ -74,6 +74,11 @@ enum LSTMUNIT_INPUT_AUX_WEIGHT_I2C = 27, LSTMUNIT_INPUT_AUX_WEIGHT_I2O = 28, + LSTMUNIT_INPUT_BIAS_R2I = 29, + LSTMUNIT_INPUT_BIAS_R2F = 30, + LSTMUNIT_INPUT_BIAS_R2C = 31, + LSTMUNIT_INPUT_BIAS_R2O = 32, + LSTMUNIT_INPUT_CNT, LSTMUNIT_OUTPUT_OUTPUT = 0, diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_resize.h b/src/tim/vx/internal/include/ops/vsi_nn_op_resize.h index aaa72c6..b8d19d5 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_resize.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_resize.h @@ -38,7 +38,8 @@ typedef uint32_t vsi_nn_interpolation_type_t; enum { VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR = 0, VSI_NN_INTERPOLATION_BILINEAR, - VSI_NN_INTERPOLATION_AREA + VSI_NN_INTERPOLATION_AREA, + VSI_NN_INTERPOLATION_CUBIC }; typedef uint32_t vsi_nn_resize_layout_type_t; enum diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_scatter_nd_update.h b/src/tim/vx/internal/include/ops/vsi_nn_op_scatter_nd_update.h index 68e1b29..7121b3b 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_scatter_nd_update.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_scatter_nd_update.h @@ -33,6 +33,7 @@ extern "C" { typedef struct _vsi_nn_scatter_nd_update_param { vsi_bool use_locking; + vsi_nn_reduction_type_e reduction; } vsi_nn_scatter_nd_update_param; #ifdef __cplusplus diff --git a/src/tim/vx/internal/include/utils/vsi_nn_util.h b/src/tim/vx/internal/include/utils/vsi_nn_util.h index 128e7d0..007983c 100644 --- a/src/tim/vx/internal/include/utils/vsi_nn_util.h +++ b/src/tim/vx/internal/include/utils/vsi_nn_util.h @@ -471,6 +471,12 @@ char* vsi_nn_getenv const char * var_name ); +int32_t vsi_nn_getenv_asint + ( + const char* env, + int32_t default_value + ); + FILE* vsi_nn_fopen ( const char * file_name, diff --git a/src/tim/vx/internal/include/vip/virtual_device.h b/src/tim/vx/internal/include/vip/virtual_device.h index a91ef83..4d138c6 100644 --- a/src/tim/vx/internal/include/vip/virtual_device.h +++ b/src/tim/vx/internal/include/vip/virtual_device.h @@ -43,6 +43,7 @@ class IDevice { OVXLIB_API IDevice(uint32_t id); OVXLIB_API ~IDevice(); OVXLIB_API uint32_t Id() const; + OVXLIB_API bool GraphSubmit(vsi_nn_graph_t* graph, bool (*func)(const void*), data_t data); 
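For illustration only, a minimal usage sketch of two of the additions above: the new reduction field of vsi_nn_scatter_nd_update_param and the vsi_nn_getenv_asint helper. The environment-variable name, the existing node pointer, and the nn_param member name are assumptions for this sketch, not part of the change:

    /* Read an integer tuning knob from the environment, falling back to 1 when unset. */
    int32_t slice_opt = vsi_nn_getenv_asint("VSI_NN_ENABLE_SLICE_OPTIMIZE", 1);

    /* Accumulate overlapping updates instead of overwriting them on an existing
       scatter_nd_update node (VSI_NN_REDUCTION_TYPE_MAX/MIN are also available now). */
    node->nn_param.scatter_nd_update.reduction = VSI_NN_REDUCTION_TYPE_ADD;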
OVXLIB_API bool GraphSubmit(vsi_nn_graph_t* graph, func_t func, data_t data); OVXLIB_API bool GraphRemove(const vsi_nn_graph_t* graph); OVXLIB_API bool ThreadExit(); diff --git a/src/tim/vx/internal/include/vsi_nn_context.h b/src/tim/vx/internal/include/vsi_nn_context.h index 777cf5c..4ac9f61 100644 --- a/src/tim/vx/internal/include/vsi_nn_context.h +++ b/src/tim/vx/internal/include/vsi_nn_context.h @@ -79,6 +79,8 @@ typedef struct _vsi_nn_runtime_option_t int32_t enable_dataconvert_optimize; int32_t enable_stream_processor; int32_t enable_rgb88_planar_nhwc; + int32_t enable_slice_optimize; + int32_t enable_batch_opt; } vsi_nn_runtime_option_t; /** diff --git a/src/tim/vx/internal/include/vsi_nn_feature_config.h b/src/tim/vx/internal/include/vsi_nn_feature_config.h index e93d1af..7918ae3 100644 --- a/src/tim/vx/internal/include/vsi_nn_feature_config.h +++ b/src/tim/vx/internal/include/vsi_nn_feature_config.h @@ -1,3 +1,26 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the Software), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED AS IS, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ /*****Auto generated header file, Please DO NOT modify manually!*****/ #ifndef _VSI_NN_FEATURE_CONFIG_H #define _VSI_NN_FEATURE_CONFIG_H @@ -20,5 +43,15 @@ #define VSI_CONCAT_ENHANCE_SUPPORT #endif #define VSI_CREATE_TENSOR_FROM_VIEW_SUPPORT +#ifndef VSI_SWAP_HANDLE_CACHE_SUPPORT +#define VSI_SWAP_HANDLE_CACHE_SUPPORT +#endif +#define VSI_EXPORT_APIS_FOR_SETUP_GRAPH 1 +#if defined(VX_SET_TENSOR_MEMPOOL_TYPE_SUPPORT) && VX_SET_TENSOR_MEMPOOL_TYPE_SUPPORT +#define VSI_CREATE_TENSOR_FROM_AXISRAM_SUPPORT +#endif +#if defined(VX_13_NN_COMPATIBLITY) +#define VSI_MAP_TENSOR_PATCH_SUPPORT +#endif #endif diff --git a/src/tim/vx/internal/include/vsi_nn_graph.h b/src/tim/vx/internal/include/vsi_nn_graph.h index 4053988..89786c4 100644 --- a/src/tim/vx/internal/include/vsi_nn_graph.h +++ b/src/tim/vx/internal/include/vsi_nn_graph.h @@ -382,6 +382,31 @@ OVXLIB_API vsi_nn_tensor_id_t vsi_nn_AddTensorFromView vsi_size_t* end ); +/** + * Add a new tensor from AXI-SRAM + * Create a new tensor from internal AXI-SRAM and add it to graph. + * It just creates the tensor object and does not actually allocate the memory + * in AXI-SRAM until the verify graph stage. In other words, the tensor object is + * created beforehand, but the memory for storing its data is not allocated until the verify + * graph stage.
AXI-SRAM is an internal memory resource whose allocation is done + * strategically to optimize performance and resource usage in graph verification. + * If there is not enough memory in AXI-SRAM, vsi_nn_VerifyGraph will return VSI_FAILURE. + * The user can't access the tensor memory (read/write tensor data) before the graph has been verified, + * since the tensor memory is not allocated. + * @param[in] graph Graph handle + * @param[in] id Optional id to the tensor, set it to VSI_NN_TENSOR_ID_AUTO, + * and a new id will be generated. + * @param[in] attr Tensor attributes to the new tensor. + * + * @return The new tensor id on success, or VSI_NN_TENSOR_ID_NA otherwise. + */ +OVXLIB_API vsi_nn_tensor_id_t vsi_nn_AddTensorFromAXISRAM + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_id_t id, + vsi_nn_tensor_attr_t * attr + ); + /** * Attach tensor to graph * Attach an exist tensor to graph. @@ -796,6 +821,18 @@ OVXLIB_API vsi_status vsi_nn_SetGraphTransformOption size_t size ); +/** + * Graph shape inference + * + * @param[in] graph Graph handle + * + * @return VSI_SUCCESS on success, or appropriate error code otherwise + * */ +OVXLIB_API vsi_status vsi_nn_InferShape +( + vsi_nn_graph_t* graph +); + #ifdef __cplusplus } #endif diff --git a/src/tim/vx/internal/include/vsi_nn_node.h b/src/tim/vx/internal/include/vsi_nn_node.h index 0a69dbd..3756c7c 100644 --- a/src/tim/vx/internal/include/vsi_nn_node.h +++ b/src/tim/vx/internal/include/vsi_nn_node.h @@ -155,6 +155,22 @@ OVXLIB_API void vsi_nn_PrintNode vsi_nn_node_id_t id ); +#if VX_GRAPH_BATCH_OPT_SUPPORT +/** + * Set how many parts this node is split into along the batch dimension. + * + * @param[in] node Node. + * @param[in] split_num Number of parts to split the batch dimension into. + * + * @return VSI_SUCCESS on success, or error code otherwise. + */ +OVXLIB_API vsi_status vsi_nn_SetNodeBatchSplitNum +( + vsi_nn_node_t* node, + int8_t split_num +); +#endif + /** * Update node attribute * Update openvx node attribute based on ovxlib's node attribute diff --git a/src/tim/vx/internal/include/vsi_nn_node_type.h b/src/tim/vx/internal/include/vsi_nn_node_type.h index f961835..173be94 100644 --- a/src/tim/vx/internal/include/vsi_nn_node_type.h +++ b/src/tim/vx/internal/include/vsi_nn_node_type.h @@ -209,6 +209,7 @@ #include "ops/vsi_nn_op_lpnorm.h" #include "ops/vsi_nn_op_resize_3d.h" #include "ops/vsi_nn_op_reducel2.h" +#include "ops/vsi_nn_op_crop_and_resize.h" /* custom node head define define */ #include "custom/vsi_nn_custom_node_type.h" #include "ops/vsi_nn_op_inverse_sigmoid.h" @@ -406,6 +407,7 @@ typedef union _vsi_nn_nn_param vsi_nn_lpnorm_param lpnorm; vsi_nn_resize_3d_param resize_3d; vsi_nn_reducel2_param reducel2; + vsi_nn_crop_and_resize_param crop_and_resize; void* client_param; /* custom node data struct define */ diff --git a/src/tim/vx/internal/include/vsi_nn_platform.h b/src/tim/vx/internal/include/vsi_nn_platform.h index f5548c8..077c148 100644 --- a/src/tim/vx/internal/include/vsi_nn_platform.h +++ b/src/tim/vx/internal/include/vsi_nn_platform.h @@ -35,6 +35,9 @@ #if defined(VX_KHR_COMPATIBILITY) && (0x1==VX_KHR_COMPATIBILITY) #include #endif +#ifdef VSI_CREATE_TENSOR_FROM_AXISRAM_SUPPORT +#include +#endif /* This is a compatibility head file for backward compatibility OpenVX 1.1 spec diff --git a/src/tim/vx/internal/include/vsi_nn_pre_post_process.h b/src/tim/vx/internal/include/vsi_nn_pre_post_process.h index 9cfae60..6832d86 100644 --- a/src/tim/vx/internal/include/vsi_nn_pre_post_process.h +++ b/src/tim/vx/internal/include/vsi_nn_pre_post_process.h @@ -89,6 +89,8 @@ typedef enum
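As a usage illustration of the new graph-level APIs documented above, a minimal sketch; the attribute values are placeholders, graph is assumed to be an existing vsi_nn_graph_t*, and error handling is omitted:

    vsi_nn_tensor_attr_t attr;
    memset(&attr, 0, sizeof(attr));
    attr.dim_num = 2;
    attr.size[0] = 128;   /* placeholder shape */
    attr.size[1] = 64;
    attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16;

    /* The tensor object is created now; its AXI-SRAM backing is only allocated when
       vsi_nn_VerifyGraph() runs, and verification fails if AXI-SRAM is too small. */
    vsi_nn_tensor_id_t tid = vsi_nn_AddTensorFromAXISRAM(graph, VSI_NN_TENSOR_ID_AUTO, &attr);

    /* Run shape inference over the whole graph. */
    vsi_status status = vsi_nn_InferShape(graph);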
VSI_NN_SOURCE_FORMAT_IMAGE_YUYV422, VSI_NN_SOURCE_FORMAT_IMAGE_UYVY422, VSI_NN_SOURCE_FORMAT_IMAGE_NV21, + VSI_NN_SOURCE_FORMAT_IMAGE_NV12_RGGB, + VSI_NN_SOURCE_FORMAT_IMAGE_NV21_BGGR, } vsi_nn_preprocess_source_format_e; /** diff --git a/src/tim/vx/internal/include/vsi_nn_pub.h b/src/tim/vx/internal/include/vsi_nn_pub.h index 48525a4..1b48062 100644 --- a/src/tim/vx/internal/include/vsi_nn_pub.h +++ b/src/tim/vx/internal/include/vsi_nn_pub.h @@ -54,5 +54,10 @@ #include "utils/vsi_nn_dtype_util.h" #include "quantization/vsi_nn_asymmetric_affine.h" #include "quantization/vsi_nn_dynamic_fixed_point.h" + +#if defined(VSI_ENABLE_LCOV_TEST) && VSI_ENABLE_LCOV_TEST +#include "lcov/vsi_nn_coverage.h" +#endif + #endif diff --git a/src/tim/vx/internal/include/vsi_nn_tensor_util.h b/src/tim/vx/internal/include/vsi_nn_tensor_util.h index 3441489..4c88f95 100644 --- a/src/tim/vx/internal/include/vsi_nn_tensor_util.h +++ b/src/tim/vx/internal/include/vsi_nn_tensor_util.h @@ -817,6 +817,82 @@ vsi_nn_tensor_t * vsi_nn_dropout_tensor float rate ); +/** + * Allows the application to get direct access to a patch of a tensor object. + * A wrapper API for OpenVX vxMapTensorPatch. + * + * @param[in] graph Graph handle. + * @param[in] tensor Tensor handle. + * @param[out] ptr The address of a pointer that the function sets to the + * address where the requested data can be accessed. The returned (*ptr) address + * is only valid between the call to the function and the corresponding call to + * vsi_nn_UnmapTensorPatch. + * @param [in] usage This declares the access mode for the tensor patch, using + * the vsi_nn_accessor_type_e enumeration. + * VSI_NN_READ_ONLY: after the function call, the content of the memory location + * pointed by (*ptr) contains the tensor patch data. Writing into this memory location + * is forbidden and its behavior is undefined. + * VSI_NN_READ_AND_WRITE : after the function call, the content of the memory + * location pointed by (*ptr) contains the tensor patch data; writing into this memory + * is allowed only for the location of items and will result in a modification of the + * affected items in the tensor object once the range is unmapped. Writing into + * a gap between items (when (*stride) > item size in bytes) is forbidden and its + * behavior is undefined. + * VSI_NN_WRITE_ONLY: after the function call, the memory location pointed by (*ptr) + * contains undefined data; writing each item of the range is required prior to + * unmapping. Items not written by the application before unmap will become + * undefined after unmap, even if they were well defined before map. Like for + * VSI_NN_READ_AND_WRITE, writing into a gap between items is forbidden and its behavior + * is undefined. + * @return VSI_SUCCESS on success, or error code otherwise. + */ + +OVXLIB_API vsi_status vsi_nn_MapTensorPatch + ( + vsi_nn_graph_t* graph, + vsi_nn_tensor_t* tensor, + void** ptr, + vsi_nn_accessor_type_e usage + ); + +/** + * Unmap and commit potential changes to a tensor object patch that was previously mapped. + * Unmapping a tensor patch invalidates the memory location from which the patch could + * be accessed by the application. Accessing this memory location after the unmap function + * completes has an undefined behavior. + * @param[in] graph Graph handle. + * @param [in] tensor The reference to the tensor object to unmap. + * return VSI_SUCCESS on success, or error code otherwise.
+ */ + +OVXLIB_API vsi_status vsi_nn_UnmapTensorPatch + ( + vsi_nn_graph_t* graph, + vsi_nn_tensor_t* tensor + ); + +/** + * Create a new tensor from internal AXI-SRAM (kernel driver mapped). + * It just creates the tensor object and does not actually allocate the memory + * in AXI-SRAM until the verify graph stage. In other words, the tensor + * object is created beforehand, but the memory for storing its data is not + * allocated until the verify graph stage. AXI-SRAM is an internal memory resource + * whose allocation is done strategically to optimize performance and + * resource usage in graph verification. + * If there is not enough memory in AXI-SRAM, vsi_nn_VerifyGraph will return VSI_FAILURE. + * The user can't access the tensor memory (read/write tensor data) before the graph has been verified, + * since the tensor memory is not allocated. + * @param[in] graph Graph handle + * @param[in] attr Tensor attributes to the new tensor. + * + * @return Tensor handle on success, or NULL otherwise. + */ +OVXLIB_API vsi_nn_tensor_t * vsi_nn_CreateTensorFromAXISRAM + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_attr_t * attr + ); + #ifdef __cplusplus } #endif diff --git a/src/tim/vx/internal/include/vsi_nn_types.h b/src/tim/vx/internal/include/vsi_nn_types.h index 380057b..4e0b58b 100644 --- a/src/tim/vx/internal/include/vsi_nn_types.h +++ b/src/tim/vx/internal/include/vsi_nn_types.h @@ -115,7 +115,9 @@ typedef enum { VSI_NN_REDUCTION_TYPE_NONE, VSI_NN_REDUCTION_TYPE_ADD, - VSI_NN_REDUCTION_TYPE_MUL + VSI_NN_REDUCTION_TYPE_MUL, + VSI_NN_REDUCTION_TYPE_MAX, + VSI_NN_REDUCTION_TYPE_MIN } vsi_nn_reduction_type_e; /** Pad mode enum */ @@ -269,7 +271,9 @@ typedef enum _vsi_nn_yuv_type typedef enum _vsi_nn_nv_type { VSI_NN_YUV_TYPE_NV12, - VSI_NN_YUV_TYPE_NV21 + VSI_NN_YUV_TYPE_NV21, + VSI_NN_YUV_TYPE_NV12_RGGB, + VSI_NN_YUV_TYPE_NV21_BGGR }vsi_nn_nv_type; typedef enum _vsi_nn_roi_align_type_e @@ -283,6 +287,12 @@ typedef enum _vsi_nn_custom_warp_affine_type_e { VSI_NN_WARP_AFFINE_TYPE_RGB } vsi_nn_custom_warp_affine_type_e; +typedef enum _vsi_nn_accessor_type_e { + VSI_NN_READ_ONLY = VX_READ_ONLY, + VSI_NN_WRITE_ONLY = VX_WRITE_ONLY, + VSI_NN_READ_AND_WRITE = VX_READ_AND_WRITE +} vsi_nn_accessor_type_e; + /** Deprecated */ typedef uint32_t vsi_nn_size_t; diff --git a/src/tim/vx/internal/include/vsi_nn_version.h b/src/tim/vx/internal/include/vsi_nn_version.h index 97fd959..2b7e1bd 100644 --- a/src/tim/vx/internal/include/vsi_nn_version.h +++ b/src/tim/vx/internal/include/vsi_nn_version.h @@ -32,8 +32,8 @@ extern "C"{ #endif #define VSI_NN_VERSION_MAJOR 1 -#define VSI_NN_VERSION_MINOR 1 -#define VSI_NN_VERSION_PATCH 88 +#define VSI_NN_VERSION_MINOR 2 +#define VSI_NN_VERSION_PATCH 2 #define VSI_NN_VERSION \ (VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH) diff --git a/src/tim/vx/internal/src/Android.mk b/src/tim/vx/internal/src/Android.mk index a1b3683..6c425f6 100644 --- a/src/tim/vx/internal/src/Android.mk +++ b/src/tim/vx/internal/src/Android.mk @@ -14,6 +14,10 @@ ifeq ($(PLATFORM_VENDOR),1) LOCAL_VENDOR_MODULE := true endif +$(info Remove $(LOCAL_PATH)/../include/vsi_nn_feature_config.h ...)
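A minimal read-modify-write sketch of the map/unmap pair declared above; graph and tensor are assumed to already exist and to hold float32 data (placeholders, not part of this change):

    float* data = NULL;
    if (vsi_nn_MapTensorPatch(graph, tensor, (void**)&data, VSI_NN_READ_AND_WRITE) == VSI_SUCCESS)
    {
        /* The mapped pointer is only valid until the matching unmap call. */
        data[0] = 1.0f;
        vsi_nn_UnmapTensorPatch(graph, tensor);
    }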
+$(shell rm $(LOCAL_PATH)/../include/vsi_nn_feature_config.h -rf) +$(info $(shell bash $(LOCAL_PATH)/../gcc_gen_feature_config_header.sh $(LOCAL_PATH)/..)) + LOCAL_SRC_FILES := \ vsi_nn_context.c \ vsi_nn_client_op.c \ @@ -59,12 +63,6 @@ LOCAL_SRC_FILES += \ post/vsi_nn_post_fasterrcnn.c \ post/vsi_nn_post_cmupose.c -LOCAL_SRC_FILES += \ - cpu_backend/vsi_nn_cpu_backend.c \ - cpu_backend/vsi_nn_cpu_backend_conv2d.c \ - cpu_backend/vsi_nn_cpu_backend_deconv2d.c \ - cpu_backend/npuref_interface.c - LOCAL_SRC_FILES += libnnext/vsi_nn_libnnext_resource.c \ libnnext/vsi_nn_vxkernel.c @@ -78,11 +76,10 @@ LOCAL_SRC_FILES += kernel/vsi_nn_kernel.c \ kernel/vsi_nn_kernel_param.c \ kernel/vsi_nn_kernel_gpu_shape_optimize.c \ kernel/vsi_nn_kernel_lut.c \ - kernel/vsi_nn_spinst.c \ - kernel/vsi_nn_sp_unit_operation.c \ - kernel/vsi_nn_sp_lut.c \ kernel/vsi_nn_gpu.c +LOCAL_SRC_FILES += vip/virtual_device.cpp + LIBNNEXT_KERNEL_SOURCES := $(wildcard $(LOCAL_PATH)/libnnext/ops/kernel/*.c) LOCAL_SRC_FILES += $(LIBNNEXT_KERNEL_SOURCES:$(LOCAL_PATH)/%=%) @@ -117,13 +114,14 @@ LOCAL_C_INCLUDES += \ $(AQROOT)/sdk/inc/ \ $(AQROOT)/sdk/inc/HAL \ $(LOCAL_PATH)/../include \ + $(LOCAL_PATH)/../include/vip \ $(LOCAL_PATH)/../include/ops \ $(LOCAL_PATH)/../include/utils \ $(LOCAL_PATH)/../include/infernce \ $(LOCAL_PATH)/../include/client \ - $(LOCAL_PATH)/../include/cpu_backend \ $(LOCAL_PATH)/../include/libnnext \ - $(LOCAL_PATH)/../src + $(LOCAL_PATH)/../src \ + $(LOCAL_PATH)/../src/vip LOCAL_CFLAGS := \ -DLINUX \ diff --git a/src/tim/vx/internal/src/kernel/cl/comparisons_cl.c b/src/tim/vx/internal/src/kernel/cl/comparisons_cl.c index 4b1369f..7ad273d 100644 --- a/src/tim/vx/internal/src/kernel/cl/comparisons_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/comparisons_cl.c @@ -22,6 +22,7 @@ * *****************************************************************************/ +#if !(VX_RELATIONAL_OPS_VX_SUPPORT_EXT) #include #include #include @@ -293,6 +294,16 @@ static vsi_status _query_kernel input1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_NONE && input0_dtype == I16) + { + input0_dtype = I32; + } + + if (inputs[1]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_NONE && input1_dtype == I16) + { + input1_dtype = I32; + } + if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_NONE && output_dtype == I8) { output_dtype = BOOL8; @@ -452,3 +463,4 @@ final: REGISTER_BACKEND_CL( relational_ops, _setup ) __END_DECLS +#endif diff --git a/src/tim/vx/internal/src/kernel/cl/crop_and_resize_cl.c b/src/tim/vx/internal/src/kernel/cl/crop_and_resize_cl.c new file mode 100644 index 0000000..4adcbce --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/crop_and_resize_cl.c @@ -0,0 +1,359 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + +__BEGIN_DECLS + +typedef enum _crop_and_resize_type_e +{ + nearest_neighbor = VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR, + bilinear = VSI_NN_INTERPOLATION_BILINEAR, +}crop_and_resize_type_e; + +#define _CROP_AND_RESIZE_KERNEL_SOURCE_NAME "crop_and_resize_" + +// Add kernel hashtable here +#define CROP_AND_RESIZE_HASH_KEY( IN_DTYPE, OUT_DTYPE, RESIZE_METHOD ) \ + (( IN_DTYPE << 16 ) | ( OUT_DTYPE << 8) | (RESIZE_METHOD)) +#define CROP_AND_RESIZE_KERNEL( IN_DTYPE, OUT_DTYPE, RESIZE_METHOD ) \ + { CROP_AND_RESIZE_HASH_KEY( IN_DTYPE, OUT_DTYPE, RESIZE_METHOD ), \ + CVIVANTE_NAMESPACE("cl.crop_and_resize_"#RESIZE_METHOD"_"#IN_DTYPE"to"#OUT_DTYPE), \ + _CROP_AND_RESIZE_KERNEL_SOURCE_NAME#RESIZE_METHOD } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _crop_and_resize_kernel_map[] = +{ + // Register kernel here + CROP_AND_RESIZE_KERNEL( U32, U32, nearest_neighbor ), + CROP_AND_RESIZE_KERNEL( U32, F32, nearest_neighbor ), + CROP_AND_RESIZE_KERNEL( F32, F32, nearest_neighbor), + CROP_AND_RESIZE_KERNEL( F32, U32, nearest_neighbor ), + CROP_AND_RESIZE_KERNEL( F32, I32, nearest_neighbor), + CROP_AND_RESIZE_KERNEL( I32, I32, nearest_neighbor ), + CROP_AND_RESIZE_KERNEL( I32, F32, nearest_neighbor), + + CROP_AND_RESIZE_KERNEL( U32, U32, bilinear), + CROP_AND_RESIZE_KERNEL( U32, F32, bilinear), + CROP_AND_RESIZE_KERNEL( F32, F32, bilinear), + CROP_AND_RESIZE_KERNEL( F32, U32, bilinear), + CROP_AND_RESIZE_KERNEL( F32, I32, bilinear), + CROP_AND_RESIZE_KERNEL( I32, I32, bilinear), + CROP_AND_RESIZE_KERNEL( I32, F32, bilinear), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _crop_and_resize_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _CROP_AND_RESIZE_PARAM_NUM _cnt_of_array( _crop_and_resize_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_crop_and_resize_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 
0, 0}, + {0, 0, 0}, + {0, 0, 0}, + }; + + vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL}; + int32_t crop_width = 0; + int32_t crop_height = 0; + int32_t image_width = 0; + int32_t image_height = 0; + int32_t batch_out = 0; + float width_scale = 0; + float height_scale = 0; + + VSI_UNREFERENCED(param_size); + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &batch_out); + CHECK_STATUS_FAIL_GOTO(status, final ); + + image_width = (int32_t)(attr[0]->shape->data[0]); + image_height = (int32_t)(attr[0]->shape->data[1]); + crop_width = (int32_t)(attr[1]->shape->data[0]); + crop_height = (int32_t)(attr[1]->shape->data[1]); + + width_scale = (crop_width > 1) ? (float)(image_width - 1) / (crop_width -1) : 0; + height_scale = (crop_height > 1) ? (float)(image_height - 1) / (crop_height -1) : 0; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = (crop_width + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0]; + gpu_param.global_size[1] = (crop_height + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + gpu_param.global_size[2] = (batch_out + gpu_param.global_scale[2] - 1) + / gpu_param.global_scale[2]; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final); + + status = vsi_nn_kernel_gpu_add_param( node, "width_scale", &width_scale ); + status |= vsi_nn_kernel_gpu_add_param( node, "height_scale", &height_scale ); + status |= vsi_nn_kernel_gpu_add_param( node, "image_width", &image_width ); + status |= vsi_nn_kernel_gpu_add_param( node, "image_height", &image_height ); + CHECK_STATUS_FAIL_GOTO(status, final); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + + return status; +} /* _crop_and_resize_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t resize_method + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _crop_and_resize_kernel_map; + size_t kernel_map_size = _cnt_of_array( _crop_and_resize_kernel_map ); + vx_param_description_t * param_def = _crop_and_resize_kernel_param_def; + vx_kernel_initialize_f initializer = _crop_and_resize_initializer; + + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (F16 == in_dtype) + { + in_dtype = F32; + } + else if (U8 == in_dtype) + { + in_dtype = U32; + } + else if (I8 == in_dtype || I16 == in_dtype) + { + in_dtype = I32; + } + + if (F16 == out_dtype) + { + out_dtype = F32; + } + else if (U8 == out_dtype) + { + out_dtype = U32; + } + else if (I8 == out_dtype || I16 == out_dtype) + { + out_dtype = I32; + } + + key = CROP_AND_RESIZE_HASH_KEY( in_dtype, out_dtype, resize_method ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( 
kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _crop_and_resize_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_CROP_AND_RESIZE_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL; + vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = {{0}}; + uint32_t ori_depth = (uint32_t)inputs[0]->attr.size[2]; + uint32_t ori_batchout = (uint32_t)outputs[0]->attr.size[3]; + float input_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float input_scale = vsi_nn_get_tensor_scale(inputs[0]); + float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float output_scale = vsi_nn_get_tensor_scale(outputs[0]); + float inOutScale = input_scale / output_scale; + float inOutTile = output_zp - inOutScale * input_zp; + + float extrapolation_value = vsi_nn_kernel_param_get_float32( params, "extrapolation_value" ); + int32_t resize_method = vsi_nn_kernel_param_get_int32( params, "resize_method" ); + + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + shapes[0][0] = inputs[0]->attr.size[0]; + shapes[0][1] = inputs[0]->attr.size[1]; + shapes[0][2] = inputs[0]->attr.size[2] * inputs[0]->attr.size[3]; + + shapes[1][0] = outputs[0]->attr.size[0]; + shapes[1][1] = outputs[0]->attr.size[1]; + shapes[1][2] = outputs[0]->attr.size[2] * outputs[0]->attr.size[3]; + + rs_input = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[0], 3 ); + rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[1], 3 ); + + if (rs_input == NULL || rs_output == NULL) + { + goto final; + } + + status = _query_kernel( kernel, inputs, outputs, resize_method ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + node_params[0] = rs_input; + node_params[1] = (vsi_nn_kernel_node_param_t)(inputs[1]->t); + node_params[2] = (vsi_nn_kernel_node_param_t)(inputs[2]->t); + node_params[3] = rs_output; + node_params[4] = vsi_nn_kernel_scalar_create( graph, I32, &ori_depth ); + node_params[5] = vsi_nn_kernel_scalar_create( graph, I32, &ori_batchout ); + node_params[6] = vsi_nn_kernel_scalar_create( graph, F32, &inOutScale ); + node_params[7] = vsi_nn_kernel_scalar_create( graph, F32, &inOutTile ); + node_params[8] = vsi_nn_kernel_scalar_create( graph, F32, &extrapolation_value ); + status = vsi_nn_kernel_node_pass_param( node, node_params, _CROP_AND_RESIZE_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &node_params[4] ); + 
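    /*
     * A worked form of the requantization these scalars implement (illustration only,
     * not additional code in the patch): with input scale s_in, input zero point z_in,
     * output scale s_out and output zero point z_out, the real-value equality
     *     s_in * (q_in - z_in) = s_out * (q_out - z_out)
     * gives
     *     q_out = q_in * (s_in / s_out) + (z_out - (s_in / s_out) * z_in)
     *           = q_in * inOutScale + inOutTile
     * which matches how inOutScale and inOutTile are computed in _setup() above.
     */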
vsi_nn_kernel_scalar_release( &node_params[5] ); + vsi_nn_kernel_scalar_release( &node_params[6] ); + vsi_nn_kernel_scalar_release( &node_params[7] ); + vsi_nn_kernel_scalar_release( &node_params[8] ); + } + } +final: + if (rs_input) + { + vsi_nn_kernel_tensor_release( &rs_input ); + } + if (rs_output) + { + vsi_nn_kernel_tensor_release( &rs_output ); + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( crop_and_resize, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/depth2space_internal_cl.c b/src/tim/vx/internal/src/kernel/cl/depth2space_internal_cl.c index 94e79fe..d24dbde 100644 --- a/src/tim/vx/internal/src/kernel/cl/depth2space_internal_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/depth2space_internal_cl.c @@ -22,7 +22,7 @@ * *****************************************************************************/ - +#if !(VX_DEPTH2SPACE_CRD_MODE_SUPPORT) #include #include #include @@ -228,4 +228,4 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( depth2space_internal, _setup ) - +#endif diff --git a/src/tim/vx/internal/src/kernel/cl/detect_post_box_cl.c b/src/tim/vx/internal/src/kernel/cl/detect_post_box_cl.c deleted file mode 100644 index 596aab5..0000000 --- a/src/tim/vx/internal/src/kernel/cl/detect_post_box_cl.c +++ /dev/null @@ -1,300 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS - -/* - * Define kernel meta. 
- */ -typedef enum -{ - INTERNAL_KERNEL_DETECT_POST_BOX, -} _internal_kernel_e; - -#define _DETECT_POST_BOX_KERNEL_SOURCE "detect_post_box" - -#define STR(a) #a -// Add kernel hashtable here -#define DETECT_POST_BOX_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ - ((IN0_DTYPE << 18) | ( IN1_DTYPE << 11 ) | ( OUT_DTYPE << 4)) - -#define PACK_KERNEL_MAP(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \ - { DETECT_POST_BOX_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE), \ - CVIVANTE_NAMESPACE("cl.detect_post_box_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)), \ - _DETECT_POST_BOX_KERNEL_SOURCE} - -typedef struct -{ - uint32_t key; - char * function_name; - const char * source_name; -} _kernel_map_type; - -static const _kernel_map_type _detect_post_box_kernel_map[] = -{ - // Register kernel here - PACK_KERNEL_MAP( F32, F32, F32 ), - PACK_KERNEL_MAP( U8, U8, F32 ), -}; - - -/* - * Kernel params - */ -static vx_param_description_t _detect_post_box_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; -#define _DETECT_POST_BOX_PARAM_NUM _cnt_of_array( _detect_post_box_kernel_param_def ) - -#define _DETECT_POST_BOX_F32_PARAM_NUM 8 - -#define SCALAR_SCALE_Y (3) -#define SCALAR_SCALE_X (4) -#define SCALAR_SCALE_H (5) -#define SCALAR_SCALE_W (6) -#define SCALAR_LOG_E (7) -#define SCALAR_TAIL0 (8) -#define SCALAR_TAIL1 (9) -#define SCALAR_SCALE0 (10) -#define SCALAR_SCALE1 (11) - -/* - * Kernel initializer - */ -DEF_KERNEL_INITIALIZER(_detect_post_box_initializer) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - gpu_param_t gpu_param = { - 3, - {0, 0, 0}, - {0, 0, 0}, - {0, 0, 0}, - {0, 0, 0} - }; - vsi_nn_kernel_tensor_attr_t * input_attr = NULL; - vsi_size_array_t * in_shape = NULL; - - VSI_UNREFERENCED(param_size); - VSI_UNREFERENCED(node); - - input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); - CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); - in_shape = input_attr->shape; - - gpu_param.global_scale[0] = 1; - gpu_param.global_scale[1] = 1; - gpu_param.global_scale[2] = 1; - - gpu_param.dim = 2; - gpu_param.global_size[0] = ( - (in_shape->data[1] + gpu_param.global_scale[0] - 1) - / gpu_param.global_scale[0]); - gpu_param.global_size[1] = ( - (in_shape->data[2] + gpu_param.global_scale[1] - 1) - / gpu_param.global_scale[1]); - gpu_param.global_size[2] = 1; - status = vsi_nn_kernel_gpu_config( node, &gpu_param ); - -final: -#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } - SAFE_FREE_TENSOR_ATTR(input_attr); - - return status; -} /* _detect_post_box_initializer() */ - - - -/* - * Query kernel - */ -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const 
outputs, - vsi_bool *is_use_u8_kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_dtype_e in0_dtype; - vsi_nn_kernel_dtype_e in1_dtype; - vsi_nn_kernel_dtype_e out_dtype; - const _kernel_map_type * kernel_map = _detect_post_box_kernel_map; - size_t kernel_map_size = _cnt_of_array( _detect_post_box_kernel_map ); - vx_param_description_t * param_def = _detect_post_box_kernel_param_def; - size_t param_def_size = _cnt_of_array( _detect_post_box_kernel_param_def ); - vx_kernel_initialize_f initializer = _detect_post_box_initializer; - uint32_t key; - uint32_t i; - - in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); - in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); - out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - - - if ((U8 == in0_dtype) && (U8 == in1_dtype)) - { - *is_use_u8_kernel = TRUE; - param_def_size = _DETECT_POST_BOX_PARAM_NUM; - } - else - { - *is_use_u8_kernel = FALSE; - param_def_size = _DETECT_POST_BOX_F32_PARAM_NUM; - } - - key = DETECT_POST_BOX_HASH_KEY( in0_dtype, in1_dtype, out_dtype ); - - for ( i = 0; i < kernel_map_size; i ++ ) - { - if ( kernel_map[i].key == key ) - { - break; - } - } - if ( i < kernel_map_size ) - { - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); - kernel->info.parameters = param_def; - kernel->info.numParams = (vx_uint32)param_def_size; - kernel->info.initialize = initializer; - // Register code source - vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, - kernel_map[i].source_name ); - // Register binary source - vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, - kernel_map[i].source_name ); - status = VSI_SUCCESS; - } - return status; -} /* _query_kernel() */ - - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_node_param_t node_params[_DETECT_POST_BOX_PARAM_NUM] = {NULL}; - vsi_nn_kernel_node_t node = NULL; - float logE = (float)(log10(exp(1.0f)) / log10(2.0f)); - float inv_scale_y = vsi_nn_kernel_param_get_float32( params, "inv_scale_y" ); - float inv_scale_x = vsi_nn_kernel_param_get_float32( params, "inv_scale_x" ); - float inv_scale_h = vsi_nn_kernel_param_get_float32( params, "inv_scale_h" ); - float inv_scale_w = vsi_nn_kernel_param_get_float32( params, "inv_scale_w" ); - vsi_bool is_use_u8_kernel = FALSE; - float input0Scale = vsi_nn_get_tensor_scale(inputs[0]); - float input0Zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]); - float input0Tail = -input0Zp * input0Scale; - float input1Scale = vsi_nn_get_tensor_scale(inputs[1]); - float input1Zp = (float)vsi_nn_get_tensor_zero_point(inputs[1]); - float input1Tail = -input1Zp * input1Scale; - - status = _query_kernel( kernel, inputs, outputs, &is_use_u8_kernel ); - - if ( VSI_SUCCESS == status ) - { - size_t node_params_num = _DETECT_POST_BOX_F32_PARAM_NUM; - - node = vsi_nn_kernel_create_node( graph, kernel ); - if ( node ) - { - /* Set inputs and outputs */ - vsi_nn_kernel_node_pack_io( node_params, _DETECT_POST_BOX_PARAM_NUM, - inputs, input_num, outputs, output_num ); - node_params[SCALAR_SCALE_Y] = vsi_nn_kernel_scalar_create( graph, F32, &inv_scale_y ); - node_params[SCALAR_SCALE_X] = vsi_nn_kernel_scalar_create( graph, F32, &inv_scale_x ); - node_params[SCALAR_SCALE_H] = 
vsi_nn_kernel_scalar_create( graph, F32, &inv_scale_h ); - node_params[SCALAR_SCALE_W] = vsi_nn_kernel_scalar_create( graph, F32, &inv_scale_w ); - node_params[SCALAR_LOG_E] = vsi_nn_kernel_scalar_create( graph, F32, &logE ); - if (is_use_u8_kernel) - { - node_params[SCALAR_TAIL0] = vsi_nn_kernel_scalar_create( graph, F32, &input0Tail ); - node_params[SCALAR_TAIL1] = vsi_nn_kernel_scalar_create( graph, F32, &input1Tail ); - node_params[SCALAR_SCALE0] = vsi_nn_kernel_scalar_create( graph, F32, &input0Scale ); - node_params[SCALAR_SCALE1] = vsi_nn_kernel_scalar_create( graph, F32, &input1Scale ); - node_params_num = _DETECT_POST_BOX_PARAM_NUM; - } - - /* Pass parameters to node. */ - status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num ); - VSI_ASSERT( status == VSI_SUCCESS ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_Y] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_X] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_H] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_W] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_LOG_E] ); - if (is_use_u8_kernel) - { - vsi_nn_kernel_scalar_release( &node_params[SCALAR_TAIL0] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_TAIL1] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE0] ); - vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE1] ); - } - } - } - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CL( detect_post_box, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cl/detect_post_nms_cl.c b/src/tim/vx/internal/src/kernel/cl/detect_post_nms_cl.c deleted file mode 100644 index c278d06..0000000 --- a/src/tim/vx/internal/src/kernel/cl/detect_post_nms_cl.c +++ /dev/null @@ -1,197 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - - -#include -#include -#include -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_graph.h" -#include "vsi_nn_log.h" -#include "vsi_nn_error.h" -#include "vsi_nn_prv.h" -#include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" -#include "kernel/vsi_nn_kernel.h" - -__BEGIN_DECLS -#if 0 -/* - * Define kernel meta. 
- */ -typedef enum -{ - INTERNAL_KERNEL_DETECT_POST_NMS, -} _internal_kernel_e; - -#define _DETECT_POST_NMS_KERNEL_SOURCE "detect_post_nms" -#define _DETECT_POST_NMS_KERNEL_NAME CVIVANTE_NAMESPACE("cl.detect_post_nms") - -// Add kernel hashtable here -#define DETECT_POST_NMS_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \ - (( IN_DTYPE << 8 ) | ( OUT_DTYPE )) -#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, SOURCE ) \ - { DETECT_POST_NMS_HASH_KEY( IN_DTYPE, OUT_DTYPE ), _DETECT_POST_NMS_KERNEL_NAME, SOURCE } - -typedef struct -{ - uint32_t key; - char * function_name; - const char * source_name; -} _kernel_map_type; - -static const _kernel_map_type _detect_post_nms_kernel_map[] = -{ - // Register kernel here - PACK_KERNEL_MAP( F32, F32, _DETECT_POST_NMS_KERNEL_SOURCE ), -}; - - -/* - * Kernel params - */ -static vx_param_description_t _detect_post_nms_kernel_param_def[] = -{ - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, -}; -#define _DETECT_POST_NMS_PARAM_NUM _cnt_of_array( _detect_post_nms_kernel_param_def ) - -#define SCALAR_NMS_TYPE (6) -#define SCALAR_MAX_NUM (7) -#define SCALAR_MAX_CLASS (8) -#define SCALAR_MAX_DETECT (9) -#define SCALAR_SCORE_TH (10) -#define SCALAR_IOU_TH (11) -#define SCALAR_IS_BG (12) - -/* - * Kernel initializer - */ -DEF_KERNEL_INITIALIZER(_detect_post_nms_initializer) - ( - vsi_nn_kernel_node_t node, - const vsi_nn_kernel_node_param_t * param, - size_t param_size - ) -{ - vsi_status status = VSI_FAILURE; - - return status; -} /* _detect_post_nms_initializer() */ - - - -/* - * Query kernel - */ - -static vsi_status _query_kernel - ( - vsi_nn_kernel_t * kernel, - vsi_nn_tensor_t * const * const inputs, - vsi_nn_tensor_t * const * const outputs - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_dtype_e in_dtype; - vsi_nn_kernel_dtype_e out_dtype; - const _kernel_map_type * kernel_map = _detect_post_nms_kernel_map; - size_t kernel_map_size = _cnt_of_array( _detect_post_nms_kernel_map ); - vx_param_description_t * param_def = _detect_post_nms_kernel_param_def; - size_t param_def_size = _cnt_of_array( _detect_post_nms_kernel_param_def ); - vx_kernel_initialize_f initializer = _detect_post_nms_initializer; - - uint32_t key; - uint32_t i; - - in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); - out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - - key = DETECT_POST_NMS_HASH_KEY( in_dtype, out_dtype ); - - for ( i = 0; i < kernel_map_size; i++ ) - { - if ( kernel_map[i].key == key ) - { - break; - } - } - if ( i < kernel_map_size ) - { - snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); - kernel->info.parameters = param_def; - kernel->info.numParams = param_def_size; - kernel->info.initialize = initializer; - // Register code source - vsi_nn_kernel_add_source( kernel, 
VSI_NN_GPU_SOURCE_FMT_CODE, 1, - kernel_map[i].source_name ); - // Register binary source - vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, - kernel_map[i].source_name ); - status = VSI_SUCCESS; - } - return status; -} /* _query_kernel() */ -#endif - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel - ) -{ - vsi_nn_kernel_node_t node = NULL; - - VSI_UNREFERENCED(graph); - VSI_UNREFERENCED(inputs); - VSI_UNREFERENCED(input_num); - VSI_UNREFERENCED(outputs); - VSI_UNREFERENCED(output_num); - VSI_UNREFERENCED(params); - VSI_UNREFERENCED(kernel); - - return node; -} /* _setup() */ - -__END_DECLS - -REGISTER_BACKEND_CL( detect_post_nms, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c b/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c index c44010a..c34a1e4 100644 --- a/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c @@ -60,6 +60,7 @@ typedef enum UNARY_ATANH, UNARY_ACOSH, UNARY_INVERSE_SIGMOID, + UNARY_TAN, } unary_type_e; /* @@ -108,6 +109,7 @@ typedef enum #define ATANH_OPERATION atanh #define ACOSH_OPERATION acosh #define INVERSE_SIGMOID_OPERATION inverse_sigmoid +#define TAN_OPERATION tan #define ADD_UNARY_SH_KERNELS(name) \ TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, F32, F32) \ @@ -142,6 +144,7 @@ static const struct { ADD_UNARY_SH_KERNELS(ATANH) ADD_UNARY_SH_KERNELS(ACOSH) ADD_UNARY_SH_KERNELS(INVERSE_SIGMOID) + ADD_UNARY_SH_KERNELS(TAN) TENSOR_UNARY_KERNELS_3D(NEG_OPERATION, UNARY_NEG, I32, I32) TENSOR_UNARY_KERNELS_2D(NEG_OPERATION, UNARY_NEG, I32, I32) @@ -166,6 +169,7 @@ static const struct { #undef ATANH_OPERATION #undef ACOSH_OPERATION #undef INVERSE_SIGMOID_OPERATION +#undef TAN_OPERATION /* * Kernel params */ @@ -452,16 +456,22 @@ OnError: REGISTER_BACKEND_CL( KERNEL_NAME, _##KERNEL_NAME##_setup ) +#if !(VX_ACTIVATION_SIN_COS_VX_SUPPORT_EXT) REGISTER_ELTWISE_UNARY_BACKEND_CL( sin, UNARY_SIN ) REGISTER_ELTWISE_UNARY_BACKEND_CL( cos, UNARY_COS ) +#endif +#if !(VX_ACTIVATION_EXP_VX_SUPPORT_EXT) REGISTER_ELTWISE_UNARY_BACKEND_CL( exp, UNARY_EXP ) +#endif REGISTER_ELTWISE_UNARY_BACKEND_CL( log, UNARY_LOG ) REGISTER_ELTWISE_UNARY_BACKEND_CL( neg, UNARY_NEG ) REGISTER_ELTWISE_UNARY_BACKEND_CL( hard_sigmoid, UNARY_HSIGMOID ) REGISTER_ELTWISE_UNARY_BACKEND_CL( mish, UNARY_MISH ) REGISTER_ELTWISE_UNARY_BACKEND_CL( round, UNARY_ROUND ) +#if !(VX_ACTIVATION_GELU_VX_SUPPORT_EXT) REGISTER_ELTWISE_UNARY_BACKEND_CL( gelu, UNARY_GELU ) REGISTER_ELTWISE_UNARY_BACKEND_CL( hard_gelu, UNARY_HGELU ) +#endif REGISTER_ELTWISE_UNARY_BACKEND_CL( selu, UNARY_SELU ) REGISTER_ELTWISE_UNARY_BACKEND_CL( celu, UNARY_CELU ) REGISTER_ELTWISE_UNARY_BACKEND_CL( rcp, UNARY_RCP ) @@ -471,5 +481,6 @@ REGISTER_ELTWISE_UNARY_BACKEND_CL( atan, UNARY_ATAN ) REGISTER_ELTWISE_UNARY_BACKEND_CL( atanh, UNARY_ATANH ) REGISTER_ELTWISE_UNARY_BACKEND_CL( acosh, UNARY_ACOSH ) REGISTER_ELTWISE_UNARY_BACKEND_CL( inverse_sigmoid, UNARY_INVERSE_SIGMOID ) +REGISTER_ELTWISE_UNARY_BACKEND_CL( tan, UNARY_TAN ) __END_DECLS diff --git a/src/tim/vx/internal/src/kernel/cl/gather_cl.c b/src/tim/vx/internal/src/kernel/cl/gather_cl.c index e6a6743..6694331 100644 --- a/src/tim/vx/internal/src/kernel/cl/gather_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/gather_cl.c @@ -22,7 +22,7 @@ * 
*****************************************************************************/ - +#if !(VX_TENSOR_GATHER_API_SUPPORT) #include #include #include @@ -420,3 +420,4 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( gather, _setup ) +#endif diff --git a/src/tim/vx/internal/src/kernel/cl/grucell_activation_z_h_cl.c b/src/tim/vx/internal/src/kernel/cl/grucell_activation_z_h_cl.c index 193f388..4ec8672 100644 --- a/src/tim/vx/internal/src/kernel/cl/grucell_activation_z_h_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/grucell_activation_z_h_cl.c @@ -90,6 +90,8 @@ static vx_param_description_t _grucell_activation_z_h_kernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, // Add kererl parameters here }; #define _GRUCELL_ACTIVATION_Z_H_PARAM_NUM _cnt_of_array( _grucell_activation_z_h_kernel_param_def ) @@ -97,6 +99,8 @@ static vx_param_description_t _grucell_activation_z_h_kernel_param_def[] = #define SCALAR_INPUT_TAIL (8) #define SCALAR_OUTPUT_SCALE (9) #define SCALAR_OUTPUT_ZP (10) +#define SCALAR_OUTPUT1_SCALE (11) +#define SCALAR_OUTPUT1_ZP (12) /* * Kernel initializer */ @@ -244,6 +248,8 @@ static vsi_nn_kernel_node_t _setup float input_tail = -(float)vsi_nn_get_tensor_zero_point(inputs[GRUCELL_ACT_Z_H_HSTATE]) * input_scale; float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[GRUCELL_ACT_Z_H_OUT_OUTPUT]); float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[GRUCELL_ACT_Z_H_OUT_OUTPUT]); + float output_scale1 = 1.0f / vsi_nn_get_tensor_scale(outputs[GRUCELL_ACT_Z_H_OUT_HSTATE]); + float output_zp1 = (float)vsi_nn_get_tensor_zero_point(outputs[GRUCELL_ACT_Z_H_OUT_HSTATE]); if( activation != VSI_NN_ACT_TANH ) { @@ -268,11 +274,17 @@ static vsi_nn_kernel_node_t _setup graph, F32, &output_scale ); node_params[SCALAR_OUTPUT_ZP] = vsi_nn_kernel_scalar_create( graph, F32, &output_zp ); + node_params[SCALAR_OUTPUT1_SCALE] = vsi_nn_kernel_scalar_create( + graph, F32, &output_scale1 ); + node_params[SCALAR_OUTPUT1_ZP] = vsi_nn_kernel_scalar_create( + graph, F32, &output_zp1 ); status = vsi_nn_kernel_node_pass_param( node, node_params, _GRUCELL_ACTIVATION_Z_H_PARAM_NUM ); vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] ); vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TAIL] ); vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] ); vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_ZP] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT1_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT1_ZP] ); } } return node; diff --git a/src/tim/vx/internal/src/kernel/cl/grucell_reset_after_activation_cl.c b/src/tim/vx/internal/src/kernel/cl/grucell_reset_after_activation_cl.c index a99f8b9..f88b6d9 100644 --- a/src/tim/vx/internal/src/kernel/cl/grucell_reset_after_activation_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/grucell_reset_after_activation_cl.c @@ -46,6 +46,7 @@ typedef enum _grucell_nn_activation_type_e { SIGMOID = VSI_NN_ACT_SIGMOID, HARD_SIGMOID = VSI_NN_ACT_HARD_SIGMOID, + RELU = VSI_NN_ACT_RELU, }grucell_nn_activation_type_e; #define _GRUCELL_RESET_AFTER_ACTIVATION_KERNEL_SOURCE "grucell_reset_after_activation" @@ -71,6 +72,9 @@ static const _kernel_map_type _grucell_reset_after_activation_kernel_map[] = PACK_KERNEL_MAP( U8, F32, U8, SIGMOID ), 
PACK_KERNEL_MAP( I32, F32, I32, SIGMOID ), PACK_KERNEL_MAP( F32, F32, F32, SIGMOID ), + PACK_KERNEL_MAP( U8, F32, U8, RELU ), + PACK_KERNEL_MAP( I32, F32, I32, RELU ), + PACK_KERNEL_MAP( F32, F32, F32, RELU ), }; diff --git a/src/tim/vx/internal/src/kernel/cl/layer_normalization_cl.c b/src/tim/vx/internal/src/kernel/cl/layer_normalization_cl.c index a13ec2e..cecb25a 100644 --- a/src/tim/vx/internal/src/kernel/cl/layer_normalization_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/layer_normalization_cl.c @@ -22,7 +22,7 @@ * *****************************************************************************/ - +#if !(VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT) #include #include #include @@ -360,3 +360,4 @@ final: __END_DECLS REGISTER_BACKEND_CL( layer_norm, _setup ) +#endif diff --git a/src/tim/vx/internal/src/kernel/cl/log_softmax_cl.c b/src/tim/vx/internal/src/kernel/cl/log_softmax_cl.c index f7089bf..a7bcaae 100644 --- a/src/tim/vx/internal/src/kernel/cl/log_softmax_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/log_softmax_cl.c @@ -22,6 +22,7 @@ * *****************************************************************************/ +#if !(VX_LOGSOFTMAX_VX_SUPPORT) #include #include #include @@ -34,6 +35,7 @@ #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" __BEGIN_DECLS @@ -41,27 +43,30 @@ __BEGIN_DECLS /* * Define kernel meta. */ -#define HASH_LOG_SOFTMAX_KEY(_axis, _input_type, _output_type, _image_2d) \ - ((_axis << 20) | (_input_type << 12) | (_output_type << 4) | (_image_2d)) +#define HASH_LOG_SOFTMAX_KEY(_axis, _input_type, _output_type, _image_2d, exceed_limit) \ + ((_axis << 24) | (_input_type << 16) | (_output_type << 8) | (_image_2d << 4) | exceed_limit) #define VSI_NN_GEN_LOG_SOFTMAX_KERNEL_SOURCE_NAME(_axis) \ "log_softmax_axis"#_axis + #define VSI_NN_GEN_LOG_SOFTMAX_EXCEED_KERNEL_SOURCE_NAME(_axis) \ + "log_softmax_exceed_axis"#_axis + #define HASH_LOG_SOFTMAX_SH_KERNEL_NAME(AXIS, SRC0_TYPE, DST_TYPE) \ CVIVANTE_NAMESPACE("cl.log_softmax_axis"#AXIS"_"#SRC0_TYPE"to"#DST_TYPE) #define TENSOR_LOG_SOFTMAX_KERNELS(AXIS, SRC0_TYPE, OUT_TYPE) \ - { HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 0), \ + { HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 0, 0), \ HASH_LOG_SOFTMAX_SH_KERNEL_NAME(AXIS, SRC0_TYPE, OUT_TYPE), \ VSI_NN_GEN_LOG_SOFTMAX_KERNEL_SOURCE_NAME(AXIS) }, #define TENSOR_LOG_SOFTMAX_FLOAT(AXIS, SRC0_TYPE, OUT_TYPE) \ - { HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 0), \ + { HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 0, 0), \ HASH_LOG_SOFTMAX_SH_KERNEL_NAME(AXIS, F32, F32), \ VSI_NN_GEN_LOG_SOFTMAX_KERNEL_SOURCE_NAME(AXIS) }, #define TENSOR_LOG_SOFTMAX_BFLOAT(AXIS, SRC0_TYPE, OUT_TYPE) \ - { HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 0), \ + { HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 0, 0), \ HASH_LOG_SOFTMAX_SH_KERNEL_NAME(AXIS, SRC0_TYPE, OUT_TYPE), \ VSI_NN_GEN_LOG_SOFTMAX_KERNEL_SOURCE_NAME(AXIS) }, @@ -69,20 +74,28 @@ __BEGIN_DECLS CVIVANTE_NAMESPACE("cl.log_softmax_axis"#AXIS"_"#SRC0_TYPE"to"#DST_TYPE"_2D") #define TENSOR_LOG_SOFTMAX_KERNELS_2D(AXIS, SRC0_TYPE, OUT_TYPE) \ - { HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 1), \ + { HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 1, 0), \ HASH_LOG_SOFTMAX_SH_KERNEL_2D_NAME(AXIS, SRC0_TYPE, OUT_TYPE), \ VSI_NN_GEN_LOG_SOFTMAX_KERNEL_SOURCE_NAME(AXIS) }, #define TENSOR_LOG_SOFTMAX_FLOAT_2D(AXIS, SRC0_TYPE, OUT_TYPE) \ - { HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 1), \ + { HASH_LOG_SOFTMAX_KEY(AXIS, 
SRC0_TYPE, OUT_TYPE, 1, 0), \ HASH_LOG_SOFTMAX_SH_KERNEL_2D_NAME(AXIS, F32, F32), \ VSI_NN_GEN_LOG_SOFTMAX_KERNEL_SOURCE_NAME(AXIS) }, #define TENSOR_LOG_SOFTMAX_BFLOAT_2D(AXIS, SRC0_TYPE, OUT_TYPE) \ - { HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 1), \ + { HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 1, 0), \ HASH_LOG_SOFTMAX_SH_KERNEL_2D_NAME(AXIS, SRC0_TYPE, OUT_TYPE), \ VSI_NN_GEN_LOG_SOFTMAX_KERNEL_SOURCE_NAME(AXIS) }, +#define HASH_LOG_SOFTMAX_EXCEED_SH_KERNEL_NAME(AXIS, SRC0_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("cl.log_softmax_exceed_axis"#AXIS"_"#SRC0_TYPE"to"#DST_TYPE) + +#define TENSOR_LOG_SOFTMAX_EXCEED_KERNELS(AXIS, SRC0_TYPE, OUT_TYPE) \ + { HASH_LOG_SOFTMAX_KEY(AXIS, SRC0_TYPE, OUT_TYPE, 0, 1), \ + HASH_LOG_SOFTMAX_EXCEED_SH_KERNEL_NAME(AXIS, SRC0_TYPE, OUT_TYPE), \ + VSI_NN_GEN_LOG_SOFTMAX_EXCEED_KERNEL_SOURCE_NAME(AXIS) }, + static const struct { uint32_t key; char* function_name; @@ -92,31 +105,31 @@ static const struct { TENSOR_LOG_SOFTMAX_FLOAT(0, F32, F32) TENSOR_LOG_SOFTMAX_FLOAT(1, F32, F32) TENSOR_LOG_SOFTMAX_FLOAT(2, F32, F32) - TENSOR_LOG_SOFTMAX_FLOAT(0, F16, F16) - TENSOR_LOG_SOFTMAX_FLOAT(1, F16, F16) - TENSOR_LOG_SOFTMAX_FLOAT(2, F16, F16) TENSOR_LOG_SOFTMAX_BFLOAT(0, BF16, BF16) TENSOR_LOG_SOFTMAX_BFLOAT(1, BF16, BF16) TENSOR_LOG_SOFTMAX_BFLOAT(2, BF16, BF16) TENSOR_LOG_SOFTMAX_FLOAT_2D(0, F32, F32) TENSOR_LOG_SOFTMAX_FLOAT_2D(1, F32, F32) - TENSOR_LOG_SOFTMAX_FLOAT_2D(0, F16, F16) - TENSOR_LOG_SOFTMAX_FLOAT_2D(1, F16, F16) TENSOR_LOG_SOFTMAX_BFLOAT_2D(0, BF16, BF16) TENSOR_LOG_SOFTMAX_BFLOAT_2D(1, BF16, BF16) - TENSOR_LOG_SOFTMAX_KERNELS(0, U8, U8) - TENSOR_LOG_SOFTMAX_KERNELS(1, U8, U8) - TENSOR_LOG_SOFTMAX_KERNELS(2, U8, U8) TENSOR_LOG_SOFTMAX_KERNELS(0, U8, U8) TENSOR_LOG_SOFTMAX_KERNELS(1, U8, U8) TENSOR_LOG_SOFTMAX_KERNELS(2, U8, U8) TENSOR_LOG_SOFTMAX_KERNELS_2D(0, U8, U8) TENSOR_LOG_SOFTMAX_KERNELS_2D(1, U8, U8) - TENSOR_LOG_SOFTMAX_KERNELS_2D(0, U8, U8) - TENSOR_LOG_SOFTMAX_KERNELS_2D(1, U8, U8) + + TENSOR_LOG_SOFTMAX_EXCEED_KERNELS(0, U8, U8) + TENSOR_LOG_SOFTMAX_EXCEED_KERNELS(1, U8, U8) + + TENSOR_LOG_SOFTMAX_EXCEED_KERNELS(0, F32, F32) + TENSOR_LOG_SOFTMAX_EXCEED_KERNELS(1, F32, F32) + + TENSOR_LOG_SOFTMAX_EXCEED_KERNELS(0, BF16, BF16) + TENSOR_LOG_SOFTMAX_EXCEED_KERNELS(1, BF16, BF16) + }; /* @@ -198,12 +211,89 @@ final: return status; } /* _log_softmax_initializer() */ +DEF_KERNEL_INITIALIZER(_log_softmax_exceed_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 2, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0} // globalWorkSize: image size in thread + }; + + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; + vsi_size_array_t * out_shape = NULL; + int32_t axis = 0; + int32_t width = 0; + int32_t height = 0; + int32_t depth = 0; + + VSI_UNREFERENCED(param_size); + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &axis); + CHECK_STATUS_FAIL_GOTO(status, final ); + 
+ out_shape = attr[1]->shape; + + width = (int32_t)(out_shape->data[0]); + height = (int32_t)(out_shape->data[1]); + depth = attr[1]->shape->size > 2 ? (int32_t)(out_shape->data[2]) : 1; + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + if (axis == 0) + { + gpu_param.global_size[0] = 1; + gpu_param.global_size[1] = depth; + } + else + { + gpu_param.global_size[0] = width; + gpu_param.global_size[1] = 1; + } + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + if (axis == 0) + { + status |= vsi_nn_kernel_gpu_add_param( node, "width", &width ); + } + else + { + status |= vsi_nn_kernel_gpu_add_param( node, "depth", &depth ); + } + status |= vsi_nn_kernel_gpu_add_param( node, "height", &height ); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + } + + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + } + + return status; +} + static vsi_status _query_kernel ( vsi_nn_tensor_t* const* const inputs, vsi_nn_tensor_t* const* const outputs, int32_t axis, vsi_bool image_2d, + vsi_bool exceed_limit, vsi_nn_kernel_t* kernel ) { @@ -215,7 +305,17 @@ static vsi_status _query_kernel input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - key = HASH_LOG_SOFTMAX_KEY( axis, input_dtype, output_dtype, image_2d ); + + if (input_dtype == F16) + { + input_dtype = F32; + } + if (output_dtype == F16) + { + output_dtype = F32; + } + if (exceed_limit) image_2d = vx_false_e; + key = HASH_LOG_SOFTMAX_KEY( axis, input_dtype, output_dtype, image_2d, exceed_limit ); for( i = 0; i < _cnt_of_array(kernel_map); i ++ ) { @@ -229,7 +329,14 @@ static vsi_status _query_kernel snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); kernel->info.parameters = kernel_param_def; kernel->info.numParams = _cnt_of_array( kernel_param_def ); - kernel->info.initialize = _log_softmax_initializer; + if (exceed_limit) + { + kernel->info.initialize = _log_softmax_exceed_initializer; + } + else + { + kernel->info.initialize = _log_softmax_initializer; + } vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, kernel_map[i].source_name ); vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, @@ -254,7 +361,14 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t node_params[_CL_PARAM_NUM] = {NULL}; vsi_bool image_2d = FALSE; vsi_nn_kernel_node_t node = NULL; + vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; + vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 0 } }; + uint32_t rank_in = 0; int32_t axis = 0; + int32_t new_axis = 0; + vsi_bool ret = vx_false_e; + vsi_bool exceed_limit = vx_false_e; + uint32_t i = 0; float beta = 0; float inputScale = vsi_nn_get_tensor_scale(inputs[0]); float outputScale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]); @@ -270,16 +384,37 @@ static vsi_nn_kernel_node_t _setup scaleValue = scaleValue * beta * inputScale; beta = beta * inputScale; - if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, - inputs[0]->attr.dim_num ) - || axis > 2) + if (inputs[0]->attr.size[axis] >= GPU_TENSOR_MAX_WIDTH) + { + exceed_limit = vx_true_e; + } + + ret = vsi_nn_kernel_optimize_softmax_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, axis, + shapes[0], &rank_in, &new_axis); + + if (ret) + { + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], shapes[0], rank_in ); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + outputs[0], shapes[0], rank_in ); + } + else { return NULL; } 
- image_2d = ((inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1) - && axis != 2); - status = _query_kernel( inputs, outputs, axis, image_2d, kernel ); + if( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[0]->attr.size, + reshape_tensors[0]->attr.dim_num ) + || new_axis > 2 || (new_axis == 2 && exceed_limit)) + { + return NULL; + } + + image_2d = ((reshape_tensors[0]->attr.dim_num == 2 || reshape_tensors[0]->attr.size[2] == 1) + && new_axis != 2); + status = _query_kernel( inputs, outputs, new_axis, image_2d, exceed_limit, kernel ); if( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); @@ -287,10 +422,10 @@ static vsi_nn_kernel_node_t _setup if( node ) { vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM, - inputs, 1, outputs, 1 ); + reshape_tensors, 1, &reshape_tensors[1], 1 ); node_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create( - graph, I32, &axis ); + graph, I32, &new_axis ); node_params[SCALAR_INPUT_BETA] = vsi_nn_kernel_scalar_create( graph, F32, &beta ); node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create( @@ -311,9 +446,16 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_ZP] ); } } + + for (i = 0; i < 2; i++) + { + vsi_safe_release_tensor(reshape_tensors[i]); + } + return node; } /* _setup() */ __END_DECLS REGISTER_BACKEND_CL( log_softmax, _setup ) +#endif \ No newline at end of file diff --git a/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c b/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c index ac342d3..f139ccb 100644 --- a/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c @@ -75,6 +75,9 @@ __BEGIN_DECLS #define HASH_MATRIXMUL_4X_TRANSA_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE, IMAGE_DIM) \ CVIVANTE_NAMESPACE("cl.gemm_4x_transa_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE#IMAGE_DIM) +#define HASH_MATRIXMUL_4X_TRANSA_LOCAL_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE, IMAGE_DIM) \ + CVIVANTE_NAMESPACE("cl.gemm_4x_transa_local_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE#IMAGE_DIM) + #define TENSOR_MATRIXMUL_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \ { HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 0, 0, 0), \ HASH_MATRIXMUL_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM), \ @@ -90,6 +93,11 @@ __BEGIN_DECLS HASH_MATRIXMUL_4X_TRANSA_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM), \ SOURCE }, +#define TENSOR_MATRIXMUL_4X_TRANSA_LOCAL_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \ + {HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 2, 1, 0), \ + HASH_MATRIXMUL_4X_TRANSA_LOCAL_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM), \ + SOURCE }, + #define TENSOR_MATRIXMUL_TRANSA_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, SOURCE) \ {HASH_MATRIXMUL_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM, 0, 1, 0), \ HASH_MATRIXMUL_TRANSA_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE, IMAGE_DIM), \ @@ -142,6 +150,7 @@ static const struct { TENSOR_MATRIXMUL_MERGE_KERNELS(F32, F32, F32, _3D, KERNEL_SOURCE_3) TENSOR_MATRIXMUL_4X_KERNELS(F32, F32, F32, _2D, KERNEL_SOURCE_4) TENSOR_MATRIXMUL_4X_TRANSA_KERNELS(F32, F32, F32, _2D, KERNEL_SOURCE_4) + TENSOR_MATRIXMUL_4X_TRANSA_LOCAL_KERNELS(F32, F32, F32, _2D, KERNEL_SOURCE_4) }; /* @@ -313,6 +322,49 @@ final: return status; } /* _matrixmul_4x_initializer() */ +DEF_KERNEL_INITIALIZER(_matrixmul_4x_local_initializer) +(vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t* param, + size_t param_size) { + vsi_status status = 
VSI_FAILURE; + gpu_param_t gpu_param = {3, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}}; + + vsi_nn_kernel_tensor_attr_t* attr = NULL; + vsi_size_t width = 0; + + + VSI_UNREFERENCED(param_size); + + attr = vsi_nn_kernel_tensor_attr_create((vsi_nn_kernel_tensor_t)param[2]); + CHECK_PTR_FAIL_GOTO(attr, "Create tensor attr buffer fail.", final); + + width = attr->shape->data[0]; + + gpu_param.dim = 2; + gpu_param.local_size[0] = 1; + gpu_param.local_size[1] = 64; + gpu_param.local_size[2] = 1; + + gpu_param.global_scale[0] = 16; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = + (width + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0]; + gpu_param.global_size[1] = 64; + gpu_param.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config(node, &gpu_param); + CHECK_STATUS_FAIL_GOTO(status, final); + +final: + if (attr) { + vsi_nn_kernel_tensor_attr_release(&attr); + attr = NULL; + } + return status; +} /* _matrixmul_4x_local_initializer() */ + static vsi_status _query_kernel ( vsi_nn_kernel_t * kernel, @@ -403,7 +455,10 @@ static vsi_status _query_kernel kernel->info.numParams = _cnt_of_array( _matrixmul_merge_kernel_param_def ); } - if (flag_4x) { + if ((flag_4x == 2) && (transa == 1)) { + kernel->info.initialize = _matrixmul_4x_local_initializer; + } + else if (flag_4x == 1) { kernel->info.initialize = _matrixmul_4x_initializer; } else { kernel->info.initialize = _matrixmul_initializer; @@ -471,6 +526,7 @@ static vsi_nn_kernel_node_t _setup uint32_t stride_axis_in_out[9] = {0}; vsi_nn_tensor_t* tmp_inputs[2] = {NULL}; vsi_nn_tensor_t* tmp_outputs[1] = {NULL}; + vsi_bool shader_cnt_support = FALSE; VSI_UNREFERENCED(input_num); VSI_UNREFERENCED(output_num); @@ -585,7 +641,20 @@ static vsi_nn_kernel_node_t _setup rs_out_tensors = vsi_nn_reshape_tensor(graph, tmp_outputs[0], final_shape, final_rank); final_out_tensors[0] = rs_out_tensors; - flag_4x = 1; + +#if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT + shader_cnt_support = + (graph->ctx->config.subGroupSize >= 64 && graph->ctx->config.use_40bits_va) ? 
TRUE : FALSE; +#endif + if ((in1_h % 64 == 0) && (transFlg == 1) && (out_h % 8 == 0) && shader_cnt_support) + { + flag_4x = 2; + } + else + { + flag_4x = 1; + } + } } diff --git a/src/tim/vx/internal/src/kernel/cl/maximum_cl.c b/src/tim/vx/internal/src/kernel/cl/maximum_cl.c index 3446fef..33bacb0 100644 --- a/src/tim/vx/internal/src/kernel/cl/maximum_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/maximum_cl.c @@ -246,28 +246,49 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t node_params[_CL_PARAM_NUM] = {NULL}; vsi_bool image_2d = FALSE; vsi_nn_kernel_node_t node = NULL; - - float input0Scale = vsi_nn_get_tensor_scale(inputs[0]); - float input0Tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]) * input0Scale; - float input1Scale = vsi_nn_get_tensor_scale(inputs[1]); - float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]) * input1Scale; - float outputScale = vsi_nn_get_tensor_scale(outputs[0]); - float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float input0_scale = vsi_nn_get_tensor_scale(inputs[0]); + float input0_tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]) * input0_scale; + float input1_scale = vsi_nn_get_tensor_scale(inputs[1]); + float input1_tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]) * input1_scale; + float output_scale = vsi_nn_get_tensor_scale(outputs[0]); + float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; + vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = { { 0 } }; + vsi_size_t new_rank = 0; + vsi_bool ret = TRUE; VSI_UNREFERENCED(input_num); VSI_UNREFERENCED(output_num); VSI_UNREFERENCED(params); - outputScale = vsi_abs(outputScale) < 1e-5 ? 0.0f : 1.0f / outputScale; + output_scale = vsi_abs(output_scale) < 1e-5 ? 
0.0f : 1.0f / output_scale; - if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, - outputs[0]->attr.dim_num ) ) + ret = vsi_nn_kernel_optimize_eltwise_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, + inputs[1]->attr.size, inputs[1]->attr.dim_num, + outputs[0]->attr.size, outputs[0]->attr.dim_num, + shapes[0], shapes[1], shapes[2], &new_rank ); + + if (ret == FALSE) { - return NULL; + goto final; } - image_2d = (outputs[0]->attr.dim_num == 2); - status = _query_kernel( inputs, outputs, image_2d, kernel ); + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], shapes[0], new_rank ); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + inputs[1], shapes[1], new_rank ); + reshape_tensors[2] = vsi_nn_reshape_tensor( graph, + outputs[0], shapes[2], new_rank ); + + if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[2]->attr.size, + reshape_tensors[2]->attr.dim_num ) ) + { + goto final; + } + + image_2d = (reshape_tensors[2]->attr.dim_num == 2); + status = _query_kernel( reshape_tensors, &reshape_tensors[2], image_2d, kernel ); if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); @@ -275,19 +296,19 @@ static vsi_nn_kernel_node_t _setup if ( node ) { vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM, - inputs, 2, outputs, 1 ); + reshape_tensors, 2, &reshape_tensors[2], 1 ); node_params[SCALAR_INPUT0_SCALE] = vsi_nn_kernel_scalar_create( - graph, F32, &input0Scale ); + graph, F32, &input0_scale ); node_params[SCALAR_INPUT0_TAIL] = vsi_nn_kernel_scalar_create( - graph, F32, &input0Tail ); + graph, F32, &input0_tail ); node_params[SCALAR_INPUT1_SCALE] = vsi_nn_kernel_scalar_create( - graph, F32, &input1Scale ); + graph, F32, &input1_scale ); node_params[SCALAR_INPUT1_TAIL] = vsi_nn_kernel_scalar_create( - graph, F32, &input1Tail ); + graph, F32, &input1_tail ); node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( - graph, F32, &outputScale ); + graph, F32, &output_scale ); node_params[SCALAR_OUTPUT_ZP] = vsi_nn_kernel_scalar_create( - graph, F32, &outputZP ); + graph, F32, &output_zp ); /* Pass parameters to node. 
*/ status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM ); @@ -300,6 +321,12 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_ZP] ); } } + +final: + vsi_safe_release_tensor(reshape_tensors[0]); + vsi_safe_release_tensor(reshape_tensors[1]); + vsi_safe_release_tensor(reshape_tensors[2]); + return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/cl/minimum_cl.c b/src/tim/vx/internal/src/kernel/cl/minimum_cl.c index 5d85656..4d607b6 100644 --- a/src/tim/vx/internal/src/kernel/cl/minimum_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/minimum_cl.c @@ -246,29 +246,49 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t node_params[_CL_PARAM_NUM] = {NULL}; vsi_bool image_2d = FALSE; vsi_nn_kernel_node_t node = NULL; - - float input0Scale = vsi_nn_get_tensor_scale(inputs[0]); - float input0Tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]) * input0Scale; - float input1Scale = vsi_nn_get_tensor_scale(inputs[1]); - float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]) * input1Scale; - float outputScale = vsi_nn_get_tensor_scale(outputs[0]); - float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float input0_scale = vsi_nn_get_tensor_scale(inputs[0]); + float input0_tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]) * input0_scale; + float input1_scale = vsi_nn_get_tensor_scale(inputs[1]); + float input1_tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]) * input1_scale; + float output_scale = vsi_nn_get_tensor_scale(outputs[0]); + float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; + vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = { { 0 } }; + vsi_size_t new_rank = 0; + vsi_bool ret = TRUE; VSI_UNREFERENCED(input_num); VSI_UNREFERENCED(output_num); VSI_UNREFERENCED(params); + output_scale = vsi_abs(output_scale) < 1e-5 ? 0.0f : 1.0f / output_scale; - outputScale = vsi_abs(outputScale) < 1e-5 ? 
0.0f : 1.0f / outputScale; + ret = vsi_nn_kernel_optimize_eltwise_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, + inputs[1]->attr.size, inputs[1]->attr.dim_num, + outputs[0]->attr.size, outputs[0]->attr.dim_num, + shapes[0], shapes[1], shapes[2], &new_rank ); - if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, - outputs[0]->attr.dim_num ) ) + if (ret == FALSE) { - return NULL; + goto final; } - image_2d = (outputs[0]->attr.dim_num == 2); - status = _query_kernel( inputs, outputs, image_2d, kernel ); + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], shapes[0], new_rank ); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + inputs[1], shapes[1], new_rank ); + reshape_tensors[2] = vsi_nn_reshape_tensor( graph, + outputs[0], shapes[2], new_rank ); + + if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[2]->attr.size, + reshape_tensors[2]->attr.dim_num ) ) + { + goto final; + } + + image_2d = (reshape_tensors[2]->attr.dim_num == 2); + status = _query_kernel( reshape_tensors, &reshape_tensors[2], image_2d, kernel ); if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); @@ -276,19 +296,19 @@ static vsi_nn_kernel_node_t _setup if ( node ) { vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM, - inputs, 2, outputs, 1 ); + reshape_tensors, 2, &reshape_tensors[2], 1 ); node_params[SCALAR_INPUT0_SCALE] = vsi_nn_kernel_scalar_create( - graph, F32, &input0Scale ); + graph, F32, &input0_scale ); node_params[SCALAR_INPUT0_TAIL] = vsi_nn_kernel_scalar_create( - graph, F32, &input0Tail ); + graph, F32, &input0_tail ); node_params[SCALAR_INPUT1_SCALE] = vsi_nn_kernel_scalar_create( - graph, F32, &input1Scale ); + graph, F32, &input1_scale ); node_params[SCALAR_INPUT1_TAIL] = vsi_nn_kernel_scalar_create( - graph, F32, &input1Tail ); + graph, F32, &input1_tail ); node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( - graph, F32, &outputScale ); + graph, F32, &output_scale ); node_params[SCALAR_OUTPUT_ZP] = vsi_nn_kernel_scalar_create( - graph, F32, &outputZP ); + graph, F32, &output_zp ); /* Pass parameters to node. 
*/ status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM ); @@ -301,6 +321,12 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_ZP] ); } } + +final: + vsi_safe_release_tensor(reshape_tensors[0]); + vsi_safe_release_tensor(reshape_tensors[1]); + vsi_safe_release_tensor(reshape_tensors[2]); + return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/cl/pow_cl.c b/src/tim/vx/internal/src/kernel/cl/pow_cl.c index 6a38b4e..06e3652 100644 --- a/src/tim/vx/internal/src/kernel/cl/pow_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/pow_cl.c @@ -22,6 +22,7 @@ * *****************************************************************************/ +#if !(VX_TENSOR_POW_API_SUPPORT) #include #include #include @@ -294,4 +295,4 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( pow, _setup ) - +#endif diff --git a/src/tim/vx/internal/src/kernel/cl/resize_cubic_cl.c b/src/tim/vx/internal/src/kernel/cl/resize_cubic_cl.c new file mode 100644 index 0000000..46d2977 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/resize_cubic_cl.c @@ -0,0 +1,320 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +#define _RESIZE_CUBIC_KERNEL_SOURCE() "resize_cubic" + +#define STR(a) #a +// Add kernel hashtable here +#define RESIZE_CUBIC_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \ + (( IN_DTYPE << 20 ) | ( OUT_DTYPE << 8) ) + +#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \ + { RESIZE_CUBIC_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \ + CVIVANTE_NAMESPACE("cl.resize_cubic_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ + _RESIZE_CUBIC_KERNEL_SOURCE() } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _resize_cubic_kernel_map[] = +{ + PACK_KERNEL_MAP( F32, F32), + PACK_KERNEL_MAP( U8, U8), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _resize_cubic_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + +#define SCALAR_SCALE_X (2) +#define SCALAR_SCALE_Y (3) +#define SCALAR_HALF_PIXEL (4) +#define SCALAR_INPUT_SCALE (5) +#define SCALAR_INPUT_TAIL (6) +#define SCALAR_OUTPUT_SCALE (7) +#define SCALAR_OUTPUT_TAIL (8) + + +#define RESIZE_CUBIC_NUM 5 +#define RESIZE_CUBIC_QUANT_NUM _cnt_of_array( _resize_cubic_kernel_param_def ) + + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_resize_cubic_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * output_attr = NULL; + vsi_size_array_t * out_shape = NULL; + + VSI_UNREFERENCED(param_size); + + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + out_shape = output_attr->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.dim = (out_shape->size < 3 || 1 == out_shape->data[2]) ? 2 : 3; + gpu_param.global_size[0] = gpu_align_p2( + (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = out_shape->size > 2 ? 
out_shape->data[2] : 1; + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if ( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(output_attr); + return status; +} /* _resize_cubic_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool *is_use_u8_kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _resize_cubic_kernel_map; + size_t kernel_map_size = _cnt_of_array( _resize_cubic_kernel_map ); + vx_param_description_t * param_def = _resize_cubic_kernel_param_def; + size_t param_def_size = _cnt_of_array( _resize_cubic_kernel_param_def ); + vx_kernel_initialize_f initializer = _resize_cubic_initializer; + + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (F16 == in_dtype) + { + in_dtype = F32; + } + if (F16 == out_dtype) + { + out_dtype = F32; + } + + if ((U8 == in_dtype) || (U8 == out_dtype)) + { + param_def_size = RESIZE_CUBIC_QUANT_NUM; + *is_use_u8_kernel = TRUE; + } + else + { + param_def_size = RESIZE_CUBIC_NUM; + *is_use_u8_kernel = FALSE; + } + + key = RESIZE_CUBIC_HASH_KEY( in_dtype, out_dtype ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[RESIZE_CUBIC_QUANT_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" ); + int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" ); + vsi_size_t in_width = inputs[0]->attr.size[0]; + vsi_size_t in_height = inputs[0]->attr.size[1]; + vsi_size_t out_width = outputs[0]->attr.size[0]; + vsi_size_t out_height = outputs[0]->attr.size[1]; + float input_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float input_scale = vsi_nn_get_tensor_scale(inputs[0]); + float input_tail = -(input_zp * input_scale); + float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]); + float half_pixel_value = 0.0f; + float scale_factor_x = 0.0f; + float scale_factor_y = 0.0f; + vsi_bool is_use_u8_kernel = FALSE; + + if (align_corners && out_width > 1) + { + scale_factor_x = ((vx_float32)(in_width - 1) * 1.0f) / (vx_float32)(out_width - 1); + } + else 
+ { + scale_factor_x = ((vx_float32)in_width * 1.0f) / (vx_float32)out_width; + } + + if (align_corners && out_height > 1) + { + scale_factor_y = ((vx_float32)(in_height - 1) * 1.0f) / (vx_float32)(out_height - 1); + } + else + { + scale_factor_y = ((vx_float32)in_height * 1.0f) / (vx_float32)out_height; + } + + if (half_pixel_centers) + { + half_pixel_value = 0.5f; + } + else + { + half_pixel_value = 0.0f; + } + + + status = _query_kernel( kernel, inputs, outputs, &is_use_u8_kernel ); + if (VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + size_t node_params_num = RESIZE_CUBIC_NUM; + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, RESIZE_CUBIC_QUANT_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_SCALE_X] = vsi_nn_kernel_scalar_create( graph, F32, &scale_factor_x ); + node_params[SCALAR_SCALE_Y] = vsi_nn_kernel_scalar_create(graph, F32, &scale_factor_y ); + node_params[SCALAR_HALF_PIXEL] = vsi_nn_kernel_scalar_create( graph, F32, &half_pixel_value ); + if (is_use_u8_kernel) + { + node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale ); + node_params[SCALAR_INPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &input_tail ); + node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale ); + node_params[SCALAR_OUTPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &output_zp ); + node_params_num = RESIZE_CUBIC_QUANT_NUM; + } + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_X] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE_Y] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_HALF_PIXEL] ); + if (is_use_u8_kernel) + { + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TAIL] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_TAIL] ); + } + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( resize_cubic, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cl/scatter_nd_update_reduction_cl.c b/src/tim/vx/internal/src/kernel/cl/scatter_nd_update_reduction_cl.c new file mode 100644 index 0000000..299a2f6 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/scatter_nd_update_reduction_cl.c @@ -0,0 +1,727 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" + +__BEGIN_DECLS + +typedef enum +{ + NONE = 0, + Add, + Mul, + Max, + Min +} vsi_scatter_nd_update_type_e; + +/* + * Define kernel meta. + */ +#define KERNEL_SOURCE_1 "scatter_nd_update_reduction" +#define KERNEL_SOURCE_2 "scatter_nd_update_reduction_conv" + +#define HASH_SCATTER_ND_UPDATE_KEY(_input0_type, _input2_type, _output_type, _stage, _op) \ + ((_input0_type << 24) | (_input2_type << 16) | (_output_type << 8) | (_stage << 4) | (_op)) + +#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_PREPROCESS_NAME(SRC0_TYPE) \ + CVIVANTE_NAMESPACE("cl.scatter_nd_update_reduction_preprocess_"#SRC0_TYPE) + +#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_PROCESS_NAME(REDUCTION_TYPE, SRC2_TYPE) \ + CVIVANTE_NAMESPACE("cl.scatter_nd_update_reduction_"#REDUCTION_TYPE"_"#SRC2_TYPE) + +#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_CONV_NAME(DST_TYPE) \ + CVIVANTE_NAMESPACE("cl.scatter_nd_update_reduction_conv_"#DST_TYPE) + +#define TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(IN0_TYPE, SOURCE) \ + { HASH_SCATTER_ND_UPDATE_KEY(IN0_TYPE, 0, 0, 0, 0), \ + HASH_SCATTER_ND_UPDATE_SH_KERNEL_PREPROCESS_NAME(IN0_TYPE), \ + SOURCE }, + +#define TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(REDUCTION_TYPE, IN2_TYPE, SOURCE) \ + { HASH_SCATTER_ND_UPDATE_KEY(0, IN2_TYPE, 0, 1, REDUCTION_TYPE), \ + HASH_SCATTER_ND_UPDATE_SH_KERNEL_PROCESS_NAME(REDUCTION_TYPE, IN2_TYPE), \ + SOURCE }, + +#define TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(OUT_TYPE, SOURCE) \ + { HASH_SCATTER_ND_UPDATE_KEY(0, 0, OUT_TYPE, 2, 0), \ + HASH_SCATTER_ND_UPDATE_SH_KERNEL_CONV_NAME(OUT_TYPE), \ + SOURCE }, + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type scatter_nd_update_reduction_preprocess_map[] = +{ + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(U8, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(I8, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(I16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(F16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(F32, KERNEL_SOURCE_1) +}; + +static const _kernel_map_type scatter_nd_update_reduction_process_map[] = +{ + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Add, U8, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Mul, U8, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Max, U8, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Min, U8, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Add, I8, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Mul, I8, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Max, I8, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Min, I8, 
KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Add, I16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Mul, I16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Max, I16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Min, I16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Add, F16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Mul, F16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Max, F16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Min, F16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Add, F32, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Mul, F32, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Max, F32, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Min, F32, KERNEL_SOURCE_1) +}; + +static const _kernel_map_type scatter_nd_update_reduction_conv_map[] = +{ + TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(U8, KERNEL_SOURCE_2) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(I8, KERNEL_SOURCE_2) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(I16, KERNEL_SOURCE_2) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(F16, KERNEL_SOURCE_2) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(F32, KERNEL_SOURCE_2) +}; + +/* + * Kernel params + */ +static vx_param_description_t _scatter_nd_update_preprocess_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; + +static vx_param_description_t _scatter_nd_update_process_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; + +static vx_param_description_t _scatter_nd_update_conv_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; + +#define 
_SCATTER_ND_UPDATE_PREPROCESS_PARAM_NUM _cnt_of_array(_scatter_nd_update_preprocess_kernel_param_def) +#define _SCATTER_ND_UPDATE_PROCESS_PARAM_NUM _cnt_of_array(_scatter_nd_update_process_kernel_param_def) +#define _SCATTER_ND_UPDATE_CONV_PARAM_NUM _cnt_of_array(_scatter_nd_update_conv_kernel_param_def) + +static vsi_status cal_scatter_nd_update_tensor_reshape_size + ( + vsi_nn_tensor_t ** inputs, + vsi_size_t sizes[VSI_NN_MAX_DIM_NUM], + uint32_t block_size, + uint32_t coordDim, + vsi_size_t strides[VSI_NN_MAX_DIM_NUM], + int32_t* newDim + ) +{ + vsi_status status = VSI_SUCCESS; + uint32_t dims_num = inputs[0]->attr.dim_num; + vsi_size_t *input_size = inputs[0]->attr.size; + uint32_t i = 0; + vsi_size_t elementCnt = 1; + +#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH + + newDim[0] = 0; + for (i = 0; i < dims_num; ++i) + { + elementCnt *= input_size[i]; + } + + for (i = 0; i < VSI_NN_MAX_DIM_NUM; ++i) + { + sizes[i] = 1; + } + + sizes[0] = block_size; + sizes[1] = elementCnt / block_size; + newDim[0] = 2; + + if (coordDim == 1 && strides) // index shape + { + for (i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + strides[i] = 0; + } + } + else if (coordDim >= 2 && coordDim <= VSI_NN_MAX_DIM_NUM && strides) + { + for (i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + strides[i] = 0; + } + + strides[0] = input_size[dims_num - coordDim]; + for (i = 1; i < coordDim - 1; i++) + { + strides[i] = strides[i - 1] * input_size[dims_num - coordDim + i]; + } + } + +#undef VSI_NN_MAX_IMAGE_WIDTH + + return status; +} /* cal_scatter_nd_update_tensor_reshape_size */ + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_scatter_nd_update_reduction_preprocess_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 1, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; + int32_t width = 0; + int32_t element_size = 1; + int32_t i = 0; + + VSI_UNREFERENCED(param_size); + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + + for (i = 0; i < (int32_t)attr[0]->shape->size; i++) + { + element_size *= (int32_t)attr[0]->shape->data[i]; + } + width = element_size / 8; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + if (element_size < 8) + { + gpu_param.global_size[0] = element_size; + } + else + { + gpu_param.global_size[0] = width; + } + gpu_param.global_size[1] = 1; + gpu_param.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + return status; +} /* _scatter_nd_update_reduction_preprocess_initializer() */ + +DEF_KERNEL_INITIALIZER(_scatter_nd_update_process_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 2, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; + int32_t block_size = 1; + int32_t index_num = 1; + + VSI_UNREFERENCED(param_size); + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + attr[1] = 
vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); + + block_size = (int32_t)(attr[1]->shape->data[0]); + index_num = (int32_t)(attr[0]->shape->data[1]); + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = block_size; + gpu_param.global_size[1] = index_num; + gpu_param.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + return status; +} /* _scatter_nd_update_process_initializer() */ + +DEF_KERNEL_INITIALIZER(_scatter_nd_update_conv_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 1, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; + int32_t width = 0; + int32_t element_size = 1; + int32_t i = 0; + + VSI_UNREFERENCED(param_size); + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + + for (i = 0; i < (int32_t)attr[0]->shape->size; i++) + { + element_size *= (int32_t)attr[0]->shape->data[i]; + } + width = element_size / 8; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + if (element_size < 8) + { + gpu_param.global_size[0] = element_size; + } + else + { + gpu_param.global_size[0] = width; + } + gpu_param.global_size[1] = 1; + gpu_param.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + return status; +} /* _scatter_nd_update_conv_initializer() */ + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel_preprocess, + vsi_nn_kernel_t* kernel_process, + vsi_nn_kernel_t* kernel_conv, + int32_t reduction_flg + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e input2_dtype = U8; + vsi_nn_kernel_dtype_e output_dtype = U8; + uint32_t key = 0; + size_t i = 0; + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + input2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = HASH_SCATTER_ND_UPDATE_KEY( input0_dtype, 0, 0, 0, 0 ); + + for ( i = 0; i < _cnt_of_array(scatter_nd_update_reduction_preprocess_map); i ++ ) + { + if ( scatter_nd_update_reduction_preprocess_map[i].key == key ) + { + break; + } + } + + if ( i < _cnt_of_array(scatter_nd_update_reduction_preprocess_map) ) + { + snprintf( kernel_preprocess->info.name, VX_MAX_KERNEL_NAME, "%s", + scatter_nd_update_reduction_preprocess_map[i].function_name ); + kernel_preprocess->info.parameters = _scatter_nd_update_preprocess_kernel_param_def; + kernel_preprocess->info.numParams = _cnt_of_array( _scatter_nd_update_preprocess_kernel_param_def ); + kernel_preprocess->info.initialize = 
_scatter_nd_update_reduction_preprocess_initializer; + // Register code source + vsi_nn_kernel_add_source( kernel_preprocess, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + scatter_nd_update_reduction_preprocess_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel_preprocess, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + scatter_nd_update_reduction_preprocess_map[i].source_name ); + status = VSI_SUCCESS; + } + + key = HASH_SCATTER_ND_UPDATE_KEY( 0, input2_dtype, 0, 1, reduction_flg ); + + for ( i = 0; i < _cnt_of_array(scatter_nd_update_reduction_process_map); i ++ ) + { + if ( scatter_nd_update_reduction_process_map[i].key == key ) + { + break; + } + } + + if ( i < _cnt_of_array(scatter_nd_update_reduction_process_map) ) + { + snprintf( kernel_process->info.name, VX_MAX_KERNEL_NAME, "%s", + scatter_nd_update_reduction_process_map[i].function_name ); + kernel_process->info.parameters = _scatter_nd_update_process_kernel_param_def; + kernel_process->info.numParams = _cnt_of_array( _scatter_nd_update_process_kernel_param_def ); + kernel_process->info.initialize = _scatter_nd_update_process_initializer; + // Register code source + vsi_nn_kernel_add_source( kernel_process, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + scatter_nd_update_reduction_process_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel_process, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + scatter_nd_update_reduction_process_map[i].source_name ); + status = VSI_SUCCESS; + } + + key = HASH_SCATTER_ND_UPDATE_KEY( 0, 0, output_dtype, 2, 0 ); + + for ( i = 0; i < _cnt_of_array(scatter_nd_update_reduction_conv_map); i ++ ) + { + if ( scatter_nd_update_reduction_conv_map[i].key == key ) + { + break; + } + } + + if ( i < _cnt_of_array(scatter_nd_update_reduction_conv_map) ) + { + snprintf( kernel_conv->info.name, VX_MAX_KERNEL_NAME, "%s", + scatter_nd_update_reduction_conv_map[i].function_name ); + kernel_conv->info.parameters = _scatter_nd_update_conv_kernel_param_def; + kernel_conv->info.numParams = _cnt_of_array( _scatter_nd_update_conv_kernel_param_def ); + kernel_conv->info.initialize = _scatter_nd_update_conv_initializer; + // Register code source + vsi_nn_kernel_add_source( kernel_conv, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + scatter_nd_update_reduction_conv_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel_conv, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + scatter_nd_update_reduction_conv_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_t node = NULL; + vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}}; + vsi_size_t strides[VSI_NN_MAX_DIM_NUM] = {0}; + int32_t coord_strides[VSI_NN_MAX_DIM_NUM] = {0}; + int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); + int32_t coord_dim = vsi_nn_kernel_param_get_int32( params, "coord_dim" ); + int32_t reduction = vsi_nn_kernel_param_get_int32( params, "reduction" ); + int32_t rs_in_dim = 0, rs_idx_dim = 0, rs_out_dim = 0; + float input_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float input_scale = vsi_nn_get_tensor_scale(inputs[0]); + float input_zp_scale = 0 - input_zp * input_scale; + float update_zp 
= (float)vsi_nn_get_tensor_zero_point(inputs[2]); + float update_scale = vsi_nn_get_tensor_scale(inputs[2]); + float update_zp_scale = 0 - update_zp * update_scale; + float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]); + vsi_nn_tensor_t * tensors[2] = { NULL }; + vsi_nn_kernel_t * ikernels[2] = { NULL }; + int32_t i = 0; + + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + status = cal_scatter_nd_update_tensor_reshape_size(&inputs[1], shapes[0], coord_dim, 0, + NULL, &rs_idx_dim); + status |= cal_scatter_nd_update_tensor_reshape_size(&inputs[2], shapes[1], block_size, 0, + NULL, &rs_in_dim); + status |= cal_scatter_nd_update_tensor_reshape_size(&outputs[0], shapes[2], block_size, coord_dim, + strides, &rs_out_dim); + CHECK_STATUS_FAIL_GOTO( status, final ); + + coord_strides[coord_dim - 1] = 1; + for (i = 0; i < coord_dim - 1; i++) + { + coord_strides[i] = (int32_t)strides[coord_dim - 2 - i]; + } + + { + vsi_nn_tensor_attr_t attr; + vsi_nn_kernel_node_t preprocess_node = NULL; + vsi_nn_kernel_node_t process_node = NULL; + vsi_nn_kernel_node_param_t preprocess_params[_SCATTER_ND_UPDATE_PREPROCESS_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t process_params[_SCATTER_ND_UPDATE_PROCESS_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t conv_params[_SCATTER_ND_UPDATE_CONV_PARAM_NUM] = { NULL }; + int32_t width = 1; + int32_t res = 0; + int32_t update_width = (int32_t)shapes[1][0]; + int32_t output_width = (int32_t)shapes[2][0]; + + ikernels[0] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_CL ); + ikernels[0]->unique_id = kernel->unique_id; + ikernels[1] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_CL ); + ikernels[1]->unique_id = kernel->unique_id; + + memset( &attr, 0, sizeof(vsi_nn_tensor_attr_t) ); + attr.dtype = outputs[0]->attr.dtype; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + attr.is_const = FALSE; + attr.vtl = TRUE; + + for (i = 0; i < rs_out_dim; i++) + { + attr.size[i] = shapes[2][i]; + width *= (int32_t)shapes[2][i]; + } + attr.dim_num = rs_out_dim; + + res = width % 8; + width = (width >> 3) << 3; + + tensors[0] = vsi_nn_CreateTensor( graph, &attr ); // ref' + attr.size[0] = 1; + attr.size[1] = 1; + attr.dim_num = rs_out_dim; + tensors[1] = vsi_nn_CreateTensor( graph, &attr ); // link_buffer0 + + status = _query_kernel( inputs, outputs, ikernels[0], ikernels[1], kernel, reduction); + if ( VSI_SUCCESS == status) + { + // convert ref to float + preprocess_node = vsi_nn_kernel_create_node( graph, ikernels[0] ); + if (preprocess_node) + { + uint32_t index = 0; + /* Pass parameters to node. 
*/ + preprocess_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[2], rs_out_dim ); + preprocess_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t; + preprocess_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width ); + preprocess_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &res ); + preprocess_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_scale ); + preprocess_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &input_zp_scale ); + status = vsi_nn_kernel_node_pass_param( preprocess_node, preprocess_params, + _SCATTER_ND_UPDATE_PREPROCESS_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_tensor_release( &preprocess_params[0] ); + vsi_nn_kernel_scalar_release( &preprocess_params[2] ); + vsi_nn_kernel_scalar_release( &preprocess_params[3] ); + vsi_nn_kernel_scalar_release( &preprocess_params[4] ); + vsi_nn_kernel_scalar_release( &preprocess_params[5] ); + } + + // update + process_node = vsi_nn_kernel_create_node( graph, ikernels[1] ); + if (process_node) + { + uint32_t index = 0; + /* Pass parameters to node. */ + process_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shapes[0], rs_idx_dim ); + process_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[2]->t, shapes[1], rs_in_dim ); + process_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t; + process_params[index++] = (vsi_nn_kernel_node_param_t)tensors[1]->t; + process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_strides[0] ); + process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_strides[1] ); + process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_strides[2] ); + process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_strides[3] ); + process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_strides[4] ); + process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_strides[5] ); + process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_strides[6] ); + process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_dim ); + process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &update_width ); + process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &output_width ); + process_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &update_scale ); + process_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &update_zp_scale ); + status = vsi_nn_kernel_node_pass_param( process_node, process_params, + _SCATTER_ND_UPDATE_PROCESS_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_tensor_release( &process_params[0] ); + vsi_nn_kernel_tensor_release( &process_params[1] ); + vsi_nn_kernel_scalar_release( &process_params[4] ); + vsi_nn_kernel_scalar_release( &process_params[5] ); + vsi_nn_kernel_scalar_release( &process_params[6] ); + vsi_nn_kernel_scalar_release( &process_params[7] ); + vsi_nn_kernel_scalar_release( &process_params[8] ); + vsi_nn_kernel_scalar_release( &process_params[9] ); + vsi_nn_kernel_scalar_release( &process_params[10] ); + vsi_nn_kernel_scalar_release( &process_params[11] ); + vsi_nn_kernel_scalar_release( &process_params[12] ); + vsi_nn_kernel_scalar_release( &process_params[13] ); + vsi_nn_kernel_scalar_release( &process_params[14] ); + vsi_nn_kernel_scalar_release( &process_params[15] ); + } + + // convert float to output + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + uint32_t index = 0; + /* Pass 
parameters to node. */ + conv_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t; + conv_params[index++] = (vsi_nn_kernel_node_param_t)tensors[1]->t; + conv_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[2], rs_out_dim ); + conv_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width ); + conv_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &res ); + conv_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale ); + conv_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &output_zp ); + status = vsi_nn_kernel_node_pass_param( node, conv_params, _SCATTER_ND_UPDATE_CONV_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_tensor_release( &conv_params[2] ); + vsi_nn_kernel_scalar_release( &conv_params[3] ); + vsi_nn_kernel_scalar_release( &conv_params[4] ); + vsi_nn_kernel_scalar_release( &conv_params[5] ); + vsi_nn_kernel_scalar_release( &conv_params[6] ); + } + } + + if (preprocess_node) {vsi_nn_kernel_node_release( &preprocess_node );} + if (process_node) {vsi_nn_kernel_node_release( &process_node );} + } + +final: + if (ikernels[0]) + { + vsi_nn_kernel_release(&ikernels[0]); + } + if (ikernels[1]) + { + vsi_nn_kernel_release(&ikernels[1]); + } + vsi_safe_release_tensor(tensors[0]); + vsi_safe_release_tensor(tensors[1]); + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( scatter_nd_update_reduction, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cl/select_cl.c b/src/tim/vx/internal/src/kernel/cl/select_cl.c index ab44901..d2634c4 100644 --- a/src/tim/vx/internal/src/kernel/cl/select_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/select_cl.c @@ -22,6 +22,7 @@ * *****************************************************************************/ +#if !(VX_TENSOR_SELECT_VX_SUPPORT) #include #include #include @@ -359,3 +360,4 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( select, _setup ) +#endif diff --git a/src/tim/vx/internal/src/kernel/cl/tile_cl.c b/src/tim/vx/internal/src/kernel/cl/tile_cl.c index 266b8ed..8227a36 100644 --- a/src/tim/vx/internal/src/kernel/cl/tile_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/tile_cl.c @@ -22,6 +22,7 @@ * *****************************************************************************/ +#if !(VX_TENSOR_TILE_API_SUPPORT) #include #include #include @@ -445,3 +446,4 @@ final: __END_DECLS REGISTER_BACKEND_CL( tile, _setup ) +#endif diff --git a/src/tim/vx/internal/src/kernel/cl/topk_cl.c b/src/tim/vx/internal/src/kernel/cl/topk_cl.c index 3d68840..b8cdfd0 100644 --- a/src/tim/vx/internal/src/kernel/cl/topk_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/topk_cl.c @@ -438,7 +438,7 @@ static vsi_nn_kernel_node_t _setup vsi_size_t shape[2][VSI_NN_MAX_DIM_NUM] = {{ 0 }}; int32_t width = (int32_t)block_size; int32_t top_k = vsi_nn_kernel_param_get_int32(params, "top_k"); - int32_t num_stages = (int32_t)ceil(log10(block_size / 2.0f) / log10(2.0f)); + int32_t num_stages = (int32_t)vsi_nn_max(ceil(log10(block_size / 2.0f) / log10(2.0f)), 0); vsi_bool is_odd_even_sort = FALSE; size_t param_num = _TOPK_PARAM_NUM; float inputScale = vsi_nn_get_tensor_scale(inputs[0]); diff --git a/src/tim/vx/internal/src/kernel/evis/add_mean_std_norm_evis.c b/src/tim/vx/internal/src/kernel/evis/add_mean_std_norm_evis.c index e1861a2..1fcd398 100644 --- a/src/tim/vx/internal/src/kernel/evis/add_mean_std_norm_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/add_mean_std_norm_evis.c @@ -106,14 +106,12 @@ DEF_KERNEL_INITIALIZER(_add_mean_std_norm_initializer) 
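+/* The initializer below (like the other EVIS initializers updated in this
+ * patch) reads quantization parameters from the unified scale/zero_point
+ * fields of vsi_nn_kernel_tensor_attr_t instead of branching on
+ * VSI_NN_KERNEL_QUANT_DFP / VSI_NN_KERNEL_QUANT_ASYMM. A minimal sketch of
+ * the mapping those unified fields are assumed to carry (the helper is
+ * illustrative only, not part of ovxlib):
+ *
+ *     static float dfp_to_scale(int32_t fl)
+ *     {
+ *         return (fl >= 0) ? 1.0f / (float)((int64_t)1 << fl)
+ *                          : (float)((int64_t)1 << -fl);
+ *     }
+ *
+ * i.e. real = (q - zero_point) * scale for both DFP (zero_point == 0) and
+ * affine-asymmetric tensors, so expressions such as
+ * input_tail = 0 - scale * zero_point and output_scale = 1.0f / attr->scale
+ * reproduce what the removed per-type branches computed case by case. */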
vsi_nn_kernel_dtype_e output_dtype = F16; vsi_nn_kernel_tensor_attr_t *input0_attr = NULL, *input1_attr = NULL, *output_attr = NULL; vsi_size_array_t *input_shape = NULL; - float scaleIn = 1.0f; - int32_t input_ZP = 0; - float scaleIn1 = 1.0f; - int32_t input_ZP1 = 0; - float scaleOut = 1.0f; - int32_t output_ZP = 0; - int32_t fixpoint = 0, fixpoint1 = 0, fixpoint_out = 0; - float inScale_dfp, inScale_dfp1; + float scaleIn = 1.0f; + int32_t input_ZP = 0; + float scaleIn1 = 1.0f; + int32_t input_ZP1 = 0; + float scaleOut = 1.0f; + int32_t output_ZP = 0; float eps = 0.0f; float rsEps = 0.0f; float dimRatio = 0.0f; @@ -135,80 +133,12 @@ DEF_KERNEL_INITIALIZER(_add_mean_std_norm_initializer) rsEps = (float)(1.0f / sqrtf(eps)); dimRatio = (float)(1.0 / (input_shape->data[0])); - - if ( VSI_NN_KERNEL_QUANT_DFP == input0_attr->quant ) - { - fixpoint = input0_attr->dfp.fl; - } - else if ( VSI_NN_KERNEL_QUANT_ASYMM == input0_attr->quant ) - { - input_ZP = input0_attr->asymm.zero_point; - scaleIn = input0_attr->asymm.scale; - } - else - { - input_ZP = 0; - scaleIn = 1.0f; - } - - //input1 - if ( VSI_NN_KERNEL_QUANT_DFP == input1_attr->quant ) - { - fixpoint1 = input1_attr->dfp.fl; - } - else if ( VSI_NN_KERNEL_QUANT_ASYMM == input1_attr->quant ) - { - input_ZP1 = input1_attr->asymm.zero_point; - scaleIn1 = input1_attr->asymm.scale; - } - else - { - input_ZP1 = 0; - scaleIn1 = 1.0f; - } - - //output - if ( VSI_NN_KERNEL_QUANT_DFP == output_attr->quant ) - { - fixpoint_out = output_attr->dfp.fl; - if (fixpoint_out >= 0) - { - scaleOut = 1.0f / (vx_float32) ((int64_t)1 << fixpoint_out); - } - else - { - scaleOut = (vx_float32) ((int64_t)1 << -fixpoint_out); - } - output_ZP = 0; - } - else if ( VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant ) - { - output_ZP = output_attr->asymm.zero_point; - scaleOut = output_attr->asymm.scale; - } - else - { - output_ZP = 0; - scaleOut = 1.0f; - } - - if (fixpoint >= 0) - { - inScale_dfp = 1.0f / (vx_float32) ((int64_t)1 << fixpoint); - } - else - { - inScale_dfp = (vx_float32) ((int64_t)1 << -fixpoint); - } - - if (fixpoint1 >= 0) - { - inScale_dfp1 = 1.0f / (vx_float32) ((int64_t)1 << fixpoint1); - } - else - { - inScale_dfp1 = (vx_float32) ((int64_t)1 << -fixpoint1); - } + scaleIn = input0_attr->scale; + input_ZP = input0_attr->zero_point; + scaleIn1 = input1_attr->scale; + input_ZP1 = input1_attr->zero_point; + scaleOut = output_attr->scale; + output_ZP = output_attr->zero_point; gpu_param.global_offset[0] = 0; gpu_param.global_offset[1] = 0; @@ -349,8 +279,8 @@ DEF_KERNEL_INITIALIZER(_add_mean_std_norm_initializer) &uniConvertInt16ScaleToFp32Fst_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt16ScaleToFp32Sec_4x4", &uniConvertInt16ScaleToFp32Sec_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "inScale_i16", &inScale_dfp); - status |= vsi_nn_kernel_gpu_add_param(node, "inScale1_i16", &inScale_dfp1); + status |= vsi_nn_kernel_gpu_add_param(node, "inScale_i16", &scaleIn); + status |= vsi_nn_kernel_gpu_add_param(node, "inScale1_i16", &scaleIn1); CHECK_STATUS_FAIL_GOTO(status, final ); } width = (int32_t)input_shape->data[0]; diff --git a/src/tim/vx/internal/src/kernel/evis/batchnorm_single_evis.c b/src/tim/vx/internal/src/kernel/evis/batchnorm_single_evis.c index 80a1b21..e189ce3 100644 --- a/src/tim/vx/internal/src/kernel/evis/batchnorm_single_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/batchnorm_single_evis.c @@ -215,41 +215,11 @@ DEF_KERNEL_INITIALIZER(_batch_norm_initializer) output_attr = vsi_nn_kernel_tensor_attr_create( 
(vsi_nn_kernel_tensor_t)output); CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); - if( input_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - int32_t fl = input_attr->dfp.fl; - if (fl > 0) - { - input_scale = 1.0f / (float) ((int64_t)1 << fl); - } - else - { - input_scale = (float)((int64_t)1 << -fl); - } - } - else if( input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - input_scale = input_attr->asymm.scale; - input_tail = 0 - input_scale * (float)input_attr->asymm.zero_point; - } + input_scale = input_attr->scale; + input_tail = 0 - input_scale * (float)input_attr->zero_point; - if( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - int32_t fl = output_attr->dfp.fl; - if (fl > 0) - { - output_scale = (float) ((int64_t)1 << fl); - } - else - { - output_scale = 1.0f / (float)((int64_t)1 << -fl); - } - } - else if( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - output_scale = 1.0f / output_attr->asymm.scale; - output_zp = (float)output_attr->asymm.zero_point; - } + output_scale = 1.0f / output_attr->scale; + output_zp = (float)output_attr->zero_point; pack_key = _PACK_BATCH_NORM_KEY( input_attr->dtype, output_attr->dtype ); diff --git a/src/tim/vx/internal/src/kernel/evis/bilinear_grid_sample_evis.c b/src/tim/vx/internal/src/kernel/evis/bilinear_grid_sample_evis.c index 553f8b7..a94c93c 100644 --- a/src/tim/vx/internal/src/kernel/evis/bilinear_grid_sample_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/bilinear_grid_sample_evis.c @@ -121,23 +121,20 @@ DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer) vsi_nn_kernel_dtype_e output_dtype = F16; uint32_t depth = 0; - float half_input0_wh[2]; - float add_float_value[2]; - uint32_t in0_width; - uint32_t in0_height; - uint32_t out_width; - uint32_t out_height; - int32_t align_corners; + float half_input0_wh[2] = {0}; + float add_float_value[2] = {0}; + uint32_t in0_width = 0; + uint32_t in0_height = 0; + uint32_t out_width = 0; + uint32_t out_height = 0; + int32_t align_corners = 0; - int32_t src0FixPointPos = 0; - int32_t src1FixPointPos = 0; - int32_t dstFixPointPos = 0; - float input0_scale = 1.0; - int32_t input0ZP = 0; - float input1_scale = 1.0; - int32_t input1ZP = 0; - float output_scale = 1.0; - int32_t outputZP = 0; + float input0_scale = 1.0; + int32_t input0ZP = 0; + float input1_scale = 1.0; + int32_t input1ZP = 0; + float output_scale = 1.0; + int32_t outputZP = 0; VSI_UNREFERENCED(param_size); @@ -165,54 +162,14 @@ DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer) input1_dtype = input_attr[1]->dtype; output_dtype = output_attr->dtype; - if (U8 == input0_dtype && VSI_NN_KERNEL_QUANT_ASYMM == input_attr[0]->quant) { - input0_scale = input_attr[0]->asymm.scale; - input0ZP = input_attr[0]->asymm.zero_point; - } else if (VSI_NN_KERNEL_QUANT_DFP == input_attr[0]->quant) { - src0FixPointPos = input_attr[0]->dfp.fl; - if (src0FixPointPos >= 0) { - input0_scale = 1.0f / (float)((int64_t)1 << src0FixPointPos); - } else if (src0FixPointPos < 0) { - input0_scale = (float)((int64_t)1 << -src0FixPointPos); - } - input0ZP = 0; - } else { - input0_scale = 1.0f; - input0ZP = 0; - } + input0_scale = input_attr[0]->scale; + input0ZP = input_attr[0]->zero_point; - if (U8 == input1_dtype && VSI_NN_KERNEL_QUANT_ASYMM == input_attr[1]->quant) { - input1_scale = input_attr[1]->asymm.scale; - input1ZP = input_attr[1]->asymm.zero_point; - } else if (VSI_NN_KERNEL_QUANT_DFP == input_attr[1]->quant) { - src1FixPointPos = input_attr[1]->dfp.fl; - if (src1FixPointPos >= 0) { - input1_scale = 1.0f 
/ (float)((int64_t)1 << src1FixPointPos); - } else if (src1FixPointPos < 0) { - input1_scale = (float)((int64_t)1 << -src1FixPointPos); - } - input1ZP = 0; - } else { - input1_scale = 1.0f; - input1ZP = 0; - } - - if (U8 == output_dtype && VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant) { - output_scale = output_attr->asymm.scale; - outputZP = output_attr->asymm.zero_point; - } else if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant) { - dstFixPointPos = output_attr->dfp.fl; - if (dstFixPointPos >= 0) { - output_scale = (float)((int64_t)1 << dstFixPointPos); - } else if (dstFixPointPos < 0) { - output_scale = 1.0f / (float)((int64_t)1 << -dstFixPointPos); - } - outputZP = 0; - } else { - output_scale = 1.0; - outputZP = 0; - } + input1_scale = input_attr[1]->scale; + input1ZP = input_attr[1]->zero_point; + output_scale = output_attr->scale; + outputZP = output_attr->zero_point; in0_width = (uint32_t)(in0_shape->data[0]); in0_height = (uint32_t)(in0_shape->data[1]); @@ -496,7 +453,7 @@ DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer) I16 == output_dtype)) || ((I8 == input0_dtype && I8 == input1_dtype && I8 == output_dtype))) { - float dfpScale = input0_scale * output_scale; + float dfpScale = input0_scale / output_scale; gpu_dp_inst_t uniDFPtoFp32_part0_4x4 = {{ 0x01010101, // TCfg 0x00000000, // ASelt diff --git a/src/tim/vx/internal/src/kernel/evis/clip_evis.c b/src/tim/vx/internal/src/kernel/evis/clip_evis.c index 1218322..8aef159 100644 --- a/src/tim/vx/internal/src/kernel/evis/clip_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/clip_evis.c @@ -179,7 +179,6 @@ DEF_KERNEL_INITIALIZER(_clip_initializer) / gpu_param.global_scale[1]); gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1; - if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant) { srcFixPointPos = input_attr->dfp.fl; diff --git a/src/tim/vx/internal/src/kernel/evis/comparisons_evis.c b/src/tim/vx/internal/src/kernel/evis/comparisons_evis.c index bc5e267..52d610c 100644 --- a/src/tim/vx/internal/src/kernel/evis/comparisons_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/comparisons_evis.c @@ -22,6 +22,7 @@ * *****************************************************************************/ +#if !(VX_RELATIONAL_OPS_VX_SUPPORT_EXT) #include #include #include @@ -319,41 +320,10 @@ DEF_KERNEL_INITIALIZER(_comparisons_initializer) out_shape = attr[2]->shape; - if( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - int32_t fl = attr[0]->dfp.fl; - if (fl > 0) - { - input0Scale = 1.0f / (float) ((int64_t)1 << fl); - } - else - { - input0Scale = (float)((int64_t)1 << -fl); - } - } - else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - input0Scale = attr[0]->asymm.scale; - input0Tail = 0 - attr[0]->asymm.zero_point * input0Scale; - } - - if( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - int32_t fl = attr[1]->dfp.fl; - if (fl > 0) - { - input1Scale = 1.0f / (float) ((int64_t)1 << fl); - } - else - { - input1Scale = (float)((int64_t)1 << -fl); - } - } - else if( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - input1Scale = attr[1]->asymm.scale; - input1Tail = 0 - attr[1]->asymm.zero_point * input1Scale; - } + input0Scale = attr[0]->scale; + input0Tail = 0 - attr[0]->zero_point * input0Scale; + input1Scale = attr[1]->scale; + input1Tail = 0 - attr[1]->zero_point * input1Scale; gpu_param.global_scale[0] = 8; gpu_param.global_scale[1] = 1; @@ -616,3 +586,4 @@ final: REGISTER_BACKEND_EVIS( relational_ops, _setup ) __END_DECLS +#endif diff --git a/src/tim/vx/internal/src/kernel/evis/conv1d_ovxlib_evis.c 
b/src/tim/vx/internal/src/kernel/evis/conv1d_ovxlib_evis.c index e5669b0..f4fec0b 100644 --- a/src/tim/vx/internal/src/kernel/evis/conv1d_ovxlib_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/conv1d_ovxlib_evis.c @@ -152,23 +152,12 @@ DEF_KERNEL_INITIALIZER(_conv1d_ovxlib_initializer) out_shape = output_attr->shape; weight_shape = weights_attr->shape; - if ( VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant ) - { - input_ZP = input_attr->asymm.zero_point; - scaleIn = input_attr->asymm.scale; - } - - if ( VSI_NN_KERNEL_QUANT_ASYMM == weights_attr->quant ) - { - weight_ZP = weights_attr->asymm.zero_point; - scaleWights = weights_attr->asymm.scale; - } - - if ( VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant ) - { - output_ZP = (float)output_attr->asymm.zero_point; - scaleOut = output_attr->asymm.scale; - } + input_ZP = input_attr->zero_point; + scaleIn = input_attr->scale; + weight_ZP = weights_attr->zero_point; + scaleWights = weights_attr->scale; + output_ZP = (float)output_attr->zero_point; + scaleOut = output_attr->scale; scaleOut = (scaleIn * scaleWights) / scaleOut; input_height = (int32_t)(in_shape->data[1]); diff --git a/src/tim/vx/internal/src/kernel/evis/crop_and_resize_evis.c b/src/tim/vx/internal/src/kernel/evis/crop_and_resize_evis.c new file mode 100644 index 0000000..012c040 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/crop_and_resize_evis.c @@ -0,0 +1,540 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" +#include "utils/vsi_nn_dtype_util.h" + +__BEGIN_DECLS + +typedef enum _crop_and_resize_type_e +{ + nearest_neighbor = VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR, + bilinear = VSI_NN_INTERPOLATION_BILINEAR, +}crop_and_resize_type_e; + +#define _CROP_AND_RESIZE_KERNEL_SOURCE_NAME "crop_and_resize_" + +// Add kernel hashtable here +#define CROP_AND_RESIZE_HASH_KEY( IN_DTYPE, OUT_DTYPE, RESIZE_METHOD ) \ + (( IN_DTYPE << 16 ) | ( OUT_DTYPE << 8) | (RESIZE_METHOD)) +#define CROP_AND_RESIZE_KERNEL( IN_DTYPE, OUT_DTYPE, RESIZE_METHOD ) \ + { CROP_AND_RESIZE_HASH_KEY( IN_DTYPE, OUT_DTYPE, RESIZE_METHOD ), \ + CVIVANTE_NAMESPACE("evis.crop_and_resize_"#RESIZE_METHOD"_"#IN_DTYPE"to"#OUT_DTYPE), \ + _CROP_AND_RESIZE_KERNEL_SOURCE_NAME#RESIZE_METHOD } + + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _crop_and_resize_kernel_map[] = +{ + // Register kernel here + CROP_AND_RESIZE_KERNEL( U8, U8, nearest_neighbor ), + CROP_AND_RESIZE_KERNEL( U8, F16, nearest_neighbor ), + CROP_AND_RESIZE_KERNEL( F16, F16, nearest_neighbor), + CROP_AND_RESIZE_KERNEL( F16, U8, nearest_neighbor ), + CROP_AND_RESIZE_KERNEL( F16, I8, nearest_neighbor), + CROP_AND_RESIZE_KERNEL( I8, I8, nearest_neighbor ), + CROP_AND_RESIZE_KERNEL( I8, F16, nearest_neighbor), + CROP_AND_RESIZE_KERNEL( I16, I16, nearest_neighbor ), + CROP_AND_RESIZE_KERNEL( I16, F16, nearest_neighbor), + + CROP_AND_RESIZE_KERNEL( U8, U8, bilinear), + CROP_AND_RESIZE_KERNEL( U8, F16, bilinear), + CROP_AND_RESIZE_KERNEL( F16, F16, bilinear), + CROP_AND_RESIZE_KERNEL( F16, U8, bilinear), + CROP_AND_RESIZE_KERNEL( F16, I8, bilinear), + CROP_AND_RESIZE_KERNEL( I8, I8, bilinear), + CROP_AND_RESIZE_KERNEL( I8, F16, bilinear), + CROP_AND_RESIZE_KERNEL( I16, I16, bilinear), + CROP_AND_RESIZE_KERNEL( I16, F16, bilinear), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _crop_and_resize_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _CROP_AND_RESIZE_PARAM_NUM _cnt_of_array( _crop_and_resize_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_crop_and_resize_nearest_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + }; + + vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL}; + int32_t crop_width = 0; + int32_t crop_height = 0; + int32_t image_width = 0; + int32_t image_height = 0; + int32_t batch_out = 0; + float width_scale = 0; + float height_scale = 0; + float src0ZP = 0; + float src0Scale = 1; + float dstZP = 0; + float dstScale = 1; + float inOutScale = 0; + float inOutTile = 0; + + 
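+    /* The requantization below folds the input and output affine parameters
+     * into one multiply-add: inOutScale = src0Scale / dstScale and
+     * inOutTile = dstZP - inOutScale * src0ZP, so q_out = q_in * inOutScale + inOutTile.
+     * Worked example with hypothetical params src0Scale = 0.5, src0ZP = 128,
+     * dstScale = 0.25, dstZP = 0: inOutScale = 2, inOutTile = -256, and
+     * q_in = 130 (real value 1.0) maps to q_out = 4 (real value 1.0).
+     * width_scale / height_scale map a crop pixel index to source coordinates
+     * as (image_dim - 1) / (crop_dim - 1) when crop_dim > 1, else 0. */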
VSI_UNREFERENCED(param_size); + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &batch_out); + CHECK_STATUS_FAIL_GOTO(status, final ); + + src0Scale = attr[0]->scale; + src0ZP = (float)attr[0]->zero_point; + + dstScale = attr[1]->scale; + dstZP = (float)attr[1]->zero_point; + + inOutScale = src0Scale / dstScale; + inOutTile = dstZP - inOutScale * src0ZP; + + image_width = (int32_t)(attr[0]->shape->data[0]); + image_height = (int32_t)(attr[0]->shape->data[1]); + crop_width = (int32_t)(attr[1]->shape->data[0]); + crop_height = (int32_t)(attr[1]->shape->data[1]); + + width_scale = (crop_width > 1) ? (float)(image_width - 1) / (crop_width -1) : 0; + height_scale = (crop_height > 1) ? (float)(image_height - 1) / (crop_height -1) : 0; + + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = gpu_align_p2((crop_width + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 8); + gpu_param.global_size[1] = (crop_height + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + gpu_param.global_size[2] = (batch_out + gpu_param.global_scale[2] - 1) + / gpu_param.global_scale[2]; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + + CHECK_STATUS_FAIL_GOTO(status, final); + { + gpu_dp_inst_t uniExtract8Bit_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniExtractHalf8_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertFstToFp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + gpu_dp_inst_t uniConvertSecToFp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param( node, "uniConvertFstToFp32_4x4", &uniConvertFstToFp32_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, "uniConvertSecToFp32_4x4", &uniConvertSecToFp32_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, "uniExtract8Bit_2x8", &uniExtract8Bit_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, "uniExtractHalf8_2x8", &uniExtractHalf8_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, "inOutScale", 
&inOutScale ); + status |= vsi_nn_kernel_gpu_add_param( node, "inOutTile", &inOutTile ); + status |= vsi_nn_kernel_gpu_add_param( node, "width_scale", &width_scale ); + status |= vsi_nn_kernel_gpu_add_param( node, "height_scale", &height_scale ); + status |= vsi_nn_kernel_gpu_add_param( node, "image_width", &image_width ); + status |= vsi_nn_kernel_gpu_add_param( node, "image_height", &image_height ); + CHECK_STATUS_FAIL_GOTO(status, final); + } + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + + return status; +} /* _crop_and_resize_nearest_initializer() */ + +DEF_KERNEL_INITIALIZER(_crop_and_resize_bilinear_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + }; + + vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL}; + int32_t crop_width = 0; + int32_t crop_height = 0; + int32_t image_width = 0; + int32_t image_height = 0; + int32_t batch_out = 0; + float width_scale = 0; + float height_scale = 0; + float src0ZP = 0; + float src0Scale = 1; + float dstZP = 0; + float dstScale = 1; + float inOutScale = 0; + float inOutTile = 0; + + VSI_UNREFERENCED(param_size); + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &batch_out); + CHECK_STATUS_FAIL_GOTO(status, final ); + + src0Scale = attr[0]->scale; + src0ZP = (float)attr[0]->zero_point; + + dstScale = attr[1]->scale; + dstZP = (float)attr[1]->zero_point; + + inOutScale = src0Scale / dstScale; + inOutTile = dstZP - inOutScale * src0ZP; + + image_width = (int32_t)(attr[0]->shape->data[0]); + image_height = (int32_t)(attr[0]->shape->data[1]); + crop_width = (int32_t)(attr[1]->shape->data[0]); + crop_height = (int32_t)(attr[1]->shape->data[1]); + + width_scale = (crop_width > 1) ? (float)(image_width - 1) / (crop_width -1) : 0; + height_scale = (crop_height > 1) ? 
(float)(image_height - 1) / (crop_height -1) : 0; + + gpu_param.global_scale[0] = 4; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = gpu_align_p2((crop_width + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (crop_height + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + gpu_param.global_size[2] = (batch_out + gpu_param.global_scale[2] - 1) + / gpu_param.global_scale[2]; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + + CHECK_STATUS_FAIL_GOTO(status, final); + { + gpu_dp_inst_t uniExtract8Bit_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniExtractHalf8_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniRightToFp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00030001, 0x00070005, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniLeftToFp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00020000, 0x00060004, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, 0x00000001, + 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param( node, "uniRightToFp32_4x4", &uniRightToFp32_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, "uniLeftToFp32_4x4", &uniLeftToFp32_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, "uniExtract8Bit_2x8", &uniExtract8Bit_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, "uniExtractHalf8_2x8", &uniExtractHalf8_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, "inOutScale", &inOutScale ); + status |= vsi_nn_kernel_gpu_add_param( node, "inOutTile", &inOutTile ); + status |= vsi_nn_kernel_gpu_add_param( node, "width_scale", &width_scale ); + status |= vsi_nn_kernel_gpu_add_param( node, "height_scale", &height_scale ); + status |= vsi_nn_kernel_gpu_add_param( node, "image_width", &image_width ); + status |= vsi_nn_kernel_gpu_add_param( node, "image_height", &image_height ); + CHECK_STATUS_FAIL_GOTO(status, final); + } + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + + return status; +} /* _crop_and_resize_bilinear_initializer() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t resize_method + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = 
_crop_and_resize_kernel_map; + size_t kernel_map_size = _cnt_of_array( _crop_and_resize_kernel_map ); + vx_param_description_t * param_def = _crop_and_resize_kernel_param_def; + vx_kernel_initialize_f initializer = _crop_and_resize_nearest_initializer; + + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (resize_method == bilinear) + { + initializer = _crop_and_resize_bilinear_initializer; + } + key = CROP_AND_RESIZE_HASH_KEY( in_dtype, out_dtype, resize_method ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _crop_and_resize_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_CROP_AND_RESIZE_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL; + vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = {{0}}; + uint32_t ori_depth = (uint32_t)inputs[0]->attr.size[2]; + uint32_t ori_batchout = (uint32_t)outputs[0]->attr.size[3]; + float extrapolation_value = vsi_nn_kernel_param_get_float32( params, "extrapolation_value" ); + int32_t resize_method = vsi_nn_kernel_param_get_int32( params, "resize_method" ); + + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + shapes[0][0] = inputs[0]->attr.size[0]; + shapes[0][1] = inputs[0]->attr.size[1]; + shapes[0][2] = inputs[0]->attr.size[2] * inputs[0]->attr.size[3]; + + shapes[1][0] = outputs[0]->attr.size[0]; + shapes[1][1] = outputs[0]->attr.size[1]; + shapes[1][2] = outputs[0]->attr.size[2] * outputs[0]->attr.size[3]; + + rs_input = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[0], 3 ); + rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[1], 3 ); + + if (rs_input == NULL || rs_output == NULL) + { + goto final; + } + + status = _query_kernel( kernel, inputs, outputs, resize_method ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + node_params[0] = rs_input; + node_params[1] = (vsi_nn_kernel_node_param_t)(inputs[1]->t); + node_params[2] = (vsi_nn_kernel_node_param_t)(inputs[2]->t); + node_params[3] = rs_output; + node_params[4] = vsi_nn_kernel_scalar_create( graph, I32, &ori_depth ); + node_params[5] = vsi_nn_kernel_scalar_create( graph, I32, &ori_batchout ); + status = vsi_nn_kernel_node_pass_param( node, node_params, _CROP_AND_RESIZE_PARAM_NUM ); + CHECK_STATUS(status); + 
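+            /* Only node_params[4] and node_params[5] are scalar handles
+             * created here, so they are the ones released just below;
+             * rs_input / rs_output are released in the final: block. The
+             * border setup that follows makes out-of-image reads return
+             * extrapolation_value converted to the input tensor's dtype. */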
vsi_nn_kernel_scalar_release( &node_params[4] ); + vsi_nn_kernel_scalar_release( &node_params[5] ); + } + { + // Set default border mode. + vx_border_t border; + border.mode = VX_BORDER_CONSTANT; + vsi_nn_Float32ToDtype(extrapolation_value, (uint8_t*)&border.constant_value.U32, &inputs[0]->attr.dtype); + status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); + CHECK_STATUS(status); + } + } +final: + if (rs_input) + { + vsi_nn_kernel_tensor_release( &rs_input ); + } + if (rs_output) + { + vsi_nn_kernel_tensor_release( &rs_output ); + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( crop_and_resize, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/cumsum_evis.c b/src/tim/vx/internal/src/kernel/evis/cumsum_evis.c index 9ed9c08..4660e89 100644 --- a/src/tim/vx/internal/src/kernel/evis/cumsum_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/cumsum_evis.c @@ -204,39 +204,11 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer) status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &reverse); CHECK_STATUS_FAIL_GOTO(status, OnError ); - if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[0]->dfp.fl > 0) - { - input_scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); - } - else - { - input_scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); - } - } - else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - input_scale = attr[0]->asymm.scale; - input_zp = attr[0]->asymm.zero_point; - } + input_scale = attr[0]->scale; + input_zp = attr[0]->zero_point; - if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[1]->dfp.fl > 0) - { - output_scale = (float)((int64_t)1 << attr[1]->dfp.fl); - } - else - { - output_scale = (1.0f / (float)((int64_t)1 << -attr[1]->dfp.fl)); - } - } - else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - output_scale = 1.0f / attr[1]->asymm.scale; - output_zp = (float)attr[1]->asymm.zero_point; - } + output_scale = 1.0f / attr[1]->scale; + output_zp = (float)attr[1]->zero_point; in_out_scale = input_scale * output_scale; in_out_zp_scale = (float)in_out_scale * input_zp * (-1); diff --git a/src/tim/vx/internal/src/kernel/evis/depth2space_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/depth2space_internal_evis.c index 9d46462..b38d63c 100644 --- a/src/tim/vx/internal/src/kernel/evis/depth2space_internal_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/depth2space_internal_evis.c @@ -22,7 +22,7 @@ * *****************************************************************************/ - +#if !(VX_DEPTH2SPACE_CRD_MODE_SUPPORT) #include #include #include @@ -161,51 +161,10 @@ DEF_KERNEL_INITIALIZER(_depth2space_crd_initializer) status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &block_size); CHECK_STATUS_FAIL_GOTO(status, OnError ); - if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - src0ZP = attr[0]->asymm.zero_point; - src0Scale = attr[0]->asymm.scale; - } - else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[0]->dfp.fl > 0) - { - src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); - } - else - { - src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); - } - src0ZP = 0; - } - else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) - { - src0Scale = 1; - src0ZP = 0; - } - - if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - dstZP = attr[1]->asymm.zero_point; - dstScale = attr[1]->asymm.scale; - } - else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[1]->dfp.fl > 0) - { - dstScale 
= (1.0f / (float)((int64_t)1 << attr[1]->dfp.fl)); - } - else - { - dstScale = (float)((int64_t)1 << -attr[1]->dfp.fl); - } - dstZP = 0; - } - else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE ) - { - dstScale = 1; - dstZP = 0; - } + src0ZP = attr[0]->zero_point; + src0Scale = attr[0]->scale; + dstZP = attr[1]->zero_point; + dstScale = attr[1]->scale; output_dims = (uint32_t)attr[1]->shape->size; output_width = (int32_t)(attr[1]->shape->data[0]); @@ -454,4 +413,4 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_EVIS( depth2space_internal, _setup ) - +#endif diff --git a/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c b/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c index a2f10ce..0e4e1fe 100644 --- a/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c @@ -250,12 +250,12 @@ DEF_KERNEL_INITIALIZER(_depthwise_conv1d_initializer) gpu_param.global_size[1] = gpu_align_p2((output_shape->data[1] + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1], gpu_param.local_size[1]); - outputScale = input_attr->asymm.scale; + outputScale = input_attr->scale; - outputScale *= weight_attr->asymm.scale; - weightZP = weight_attr->asymm.zero_point; - outputScale /= output_attr->asymm.scale; - outputZP = (float)output_attr->asymm.zero_point + 0.5f; + outputScale *= weight_attr->scale; + weightZP = weight_attr->zero_point; + outputScale /= output_attr->scale; + outputZP = (float)output_attr->zero_point + 0.5f; #define _PACK_SELECT_KEY( kernel_size, dilation, evis_version ) \ ((uint64_t)kernel_size | ((uint64_t)dilation << 16) | ((uint64_t)evis_version << 32)) diff --git a/src/tim/vx/internal/src/kernel/evis/detect_post_box_evis.c b/src/tim/vx/internal/src/kernel/evis/detect_post_box_evis.c index aa781c8..05115ae 100644 --- a/src/tim/vx/internal/src/kernel/evis/detect_post_box_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/detect_post_box_evis.c @@ -135,17 +135,10 @@ DEF_KERNEL_INITIALIZER(_detect_post_box_initializer) status = vsi_nn_kernel_gpu_add_param( node, "logE", &logE); CHECK_STATUS_FAIL_GOTO(status, final ); - if ( VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant ) - { - input0_ZP = input_attr->asymm.zero_point; - scaleIn0 = input_attr->asymm.scale; - } - - if ( VSI_NN_KERNEL_QUANT_ASYMM == input1_attr->quant ) - { - input1_ZP = input1_attr->asymm.zero_point; - scaleIn1 = input1_attr->asymm.scale; - } + input0_ZP = input_attr->zero_point; + scaleIn0 = input_attr->scale; + input1_ZP = input1_attr->zero_point; + scaleIn1 = input1_attr->scale; if ((F32 == input_attr->dtype) || (F32 == input1_attr->dtype)) { diff --git a/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c b/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c index 5d383a1..be27bdd 100644 --- a/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c @@ -60,6 +60,7 @@ typedef enum UNARY_ATANH, UNARY_ACOSH, UNARY_INVERSE_SIGMOID, + UNARY_TAN, } unary_type_e; /* @@ -108,6 +109,7 @@ typedef enum #define ATANH_OPERATION atanh #define ACOSH_OPERATION acosh #define INVERSE_SIGMOID_OPERATION inverse_sigmoid +#define TAN_OPERATION tan #define ADD_UNARY_SH_KERNELS(name, source) \ TENSOR_UNARY_KERNELS_3D(name##_OPERATION, UNARY_##name, BF16, BF16, source##_3D) \ @@ -153,6 +155,7 @@ static const struct { ADD_UNARY_SH_KERNELS(ATAN, KERNEL_SOURCE1) ADD_UNARY_SH_KERNELS(ATANH, KERNEL_SOURCE1) ADD_UNARY_SH_KERNELS(ACOSH, KERNEL_SOURCE1) + 
ADD_UNARY_SH_KERNELS(TAN, KERNEL_SOURCE1) ADD_UNARY_SH_KERNELS(HSIGMOID, KERNEL_SOURCE0) ADD_UNARY_SH_KERNELS(MISH, KERNEL_SOURCE0) @@ -177,6 +180,7 @@ static const struct { #undef RCP_OPERATION #undef SIGN_OPERATION #undef SOFTSIGN_OPERATION +#undef TAN_OPERATION /* * Kernel params */ @@ -243,41 +247,10 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer) } out_shape = attr[1]->shape; - if( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - int32_t fl = attr[0]->dfp.fl; - if (fl > 0) - { - inputScale = 1.0f / (float) ((int64_t)1 << fl); - } - else - { - inputScale = (float)((int64_t)1 << -fl); - } - } - else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - inputScale = attr[0]->asymm.scale; - inputTail = 0 - attr[0]->asymm.zero_point * inputScale; - } - - if( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - int32_t fl = attr[1]->dfp.fl; - if (fl > 0) - { - outputScale = (float)((int64_t)1 << fl); - } - else - { - outputScale = (float)1.0f / (float) ((int64_t)1 << -fl); - } - } - else if( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - outputScale = (float)1.0f / attr[1]->asymm.scale; - outputZP = (float)attr[1]->asymm.zero_point; - } + inputScale = attr[0]->scale; + inputTail = 0 - attr[0]->zero_point * inputScale; + outputScale = (float)1.0f / attr[1]->scale; + outputZP = (float)attr[1]->zero_point; #define _PACK_SELECT_KEY( TYPE, IN_TYPE, OUT_TYPE ) \ (( TYPE << 24) | ( IN_TYPE << 16) | ( OUT_TYPE << 8)) @@ -298,17 +271,23 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer) switch( pack_key ) { +#if !(VX_ACTIVATION_SIN_COS_VX_SUPPORT_EXT) case _PACK_SELECT_KEY( UNARY_SIN, BF16, BF16 ): case _PACK_SELECT_KEY( UNARY_COS, BF16, BF16 ): +#endif +#if !(VX_ACTIVATION_EXP_VX_SUPPORT_EXT) case _PACK_SELECT_KEY( UNARY_EXP, BF16, BF16 ): +#endif case _PACK_SELECT_KEY( UNARY_LOG, BF16, BF16 ): case _PACK_SELECT_KEY( UNARY_SELU, BF16, BF16 ): case _PACK_SELECT_KEY( UNARY_NEG, BF16, BF16 ): case _PACK_SELECT_KEY( UNARY_HSIGMOID, BF16, BF16 ): case _PACK_SELECT_KEY( UNARY_MISH, BF16, BF16 ): case _PACK_SELECT_KEY( UNARY_ROUND, BF16, BF16 ): +#if !(VX_ACTIVATION_GELU_VX_SUPPORT_EXT) case _PACK_SELECT_KEY( UNARY_GELU, BF16, BF16 ): case _PACK_SELECT_KEY( UNARY_HGELU, BF16, BF16 ): +#endif case _PACK_SELECT_KEY( UNARY_CELU, BF16, BF16 ): case _PACK_SELECT_KEY( UNARY_RCP, BF16, BF16 ): case _PACK_SELECT_KEY( UNARY_SIGN, BF16, BF16 ): @@ -317,6 +296,7 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer) case _PACK_SELECT_KEY( UNARY_ATANH, BF16, BF16 ): case _PACK_SELECT_KEY( UNARY_ACOSH, BF16, BF16 ): case _PACK_SELECT_KEY( UNARY_INVERSE_SIGMOID, BF16, BF16 ): + case _PACK_SELECT_KEY( UNARY_TAN, BF16, BF16 ): { gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ 0x11111111, // TCfg @@ -614,16 +594,22 @@ OnError: } \ REGISTER_BACKEND_EVIS( KERNEL_NAME, _##KERNEL_NAME##_setup ) +#if !(VX_ACTIVATION_SIN_COS_VX_SUPPORT_EXT) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( sin, UNARY_SIN ) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( cos, UNARY_COS ) +#endif +#if !(VX_ACTIVATION_EXP_VX_SUPPORT_EXT) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( exp, UNARY_EXP ) +#endif REGISTER_ELTWISE_UNARY_BACKEND_EVIS( log, UNARY_LOG ) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( neg, UNARY_NEG ) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( hard_sigmoid, UNARY_HSIGMOID ) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( mish, UNARY_MISH ) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( round, UNARY_ROUND ) +#if !(VX_ACTIVATION_GELU_VX_SUPPORT_EXT) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( gelu, UNARY_GELU ) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( hard_gelu, UNARY_HGELU ) +#endif 
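
The same simplification recurs across the initializers in this patch: the per-quantization-type branches (DFP fixed-point versus asymmetric affine) are dropped, and the kernels read unified scale / zero_point fields straight from the tensor attribute. The sketch below shows how such unified fields would presumably be resolved once at attribute-creation time, mirroring the math of the branches being deleted; the struct and helper names are illustrative stand-ins, not the actual ovxlib definitions.

/*
 * Minimal sketch (assumption): how a unified scale/zero_point pair could be
 * derived from the legacy DFP and asymmetric parameters. The real
 * vsi_nn_kernel_tensor_attr_t is more elaborate; this stand-in only captures
 * the conversion the removed branches used to perform inline.
 */
#include <stdint.h>

typedef enum { QUANT_NONE, QUANT_DFP, QUANT_ASYMM } quant_sketch_e;

typedef struct {
    quant_sketch_e quant;
    int32_t dfp_fl;       /* DFP fractional length                   */
    float   asymm_scale;  /* asymmetric-affine scale                 */
    int32_t asymm_zp;     /* asymmetric-affine zero point            */
    float   scale;        /* unified: real = (q - zero_point)*scale  */
    int32_t zero_point;   /* unified zero point                      */
} attr_sketch_t;

static void resolve_unified_quant( attr_sketch_t *a )
{
    switch ( a->quant )
    {
    case QUANT_DFP:
        /* DFP: scale = 2^-fl, zero point is always 0 */
        a->scale = ( a->dfp_fl >= 0 )
                 ? 1.0f / (float)( (int64_t)1 << a->dfp_fl )
                 : (float)( (int64_t)1 << -a->dfp_fl );
        a->zero_point = 0;
        break;
    case QUANT_ASYMM:
        a->scale      = a->asymm_scale;
        a->zero_point = a->asymm_zp;
        break;
    default:
        /* float / unquantized tensors */
        a->scale      = 1.0f;
        a->zero_point = 0;
        break;
    }
}

With the unified fields, dequantization is real = (q - zero_point) * scale, so the expressions in the simplified hunks follow directly: inputTail = -zero_point * scale on the input side, and outputScale = 1.0f / scale with outputZP = (float)zero_point on the output side.
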
REGISTER_ELTWISE_UNARY_BACKEND_EVIS( selu, UNARY_SELU ) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( celu, UNARY_CELU ) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( rcp, UNARY_RCP ) @@ -633,5 +619,6 @@ REGISTER_ELTWISE_UNARY_BACKEND_EVIS( atan, UNARY_ATAN ) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( atanh, UNARY_ATANH ) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( acosh, UNARY_ACOSH ) REGISTER_ELTWISE_UNARY_BACKEND_EVIS( inverse_sigmoid, UNARY_INVERSE_SIGMOID ) +REGISTER_ELTWISE_UNARY_BACKEND_EVIS( tan, UNARY_TAN ) __END_DECLS diff --git a/src/tim/vx/internal/src/kernel/evis/erf_evis.c b/src/tim/vx/internal/src/kernel/evis/erf_evis.c index ebc8ad8..22c8712 100644 --- a/src/tim/vx/internal/src/kernel/evis/erf_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/erf_evis.c @@ -145,41 +145,10 @@ DEF_KERNEL_INITIALIZER(_erf_initializer) out_shape = attr[1]->shape; - if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - int32_t fl = attr[0]->dfp.fl; - if (fl > 0) - { - inputScale = 1.0f / (float) ((int64_t)1 << fl); - } - else - { - inputScale = (float)((int64_t)1 << -fl); - } - } - else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - inputScale = attr[0]->asymm.scale; - inputTail = 0 - attr[0]->asymm.zero_point * inputScale; - } - - if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - int32_t fl = attr[1]->dfp.fl; - if (fl > 0) - { - outputScale = (float)((int64_t)1 << fl); - } - else - { - outputScale = (float)1.0f / (float) ((int64_t)1 << -fl); - } - } - else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - outputScale = (float)1.0f / attr[1]->asymm.scale; - outputZP = (float)attr[1]->asymm.zero_point; - } + inputScale = attr[0]->scale; + inputTail = 0 - (float)attr[0]->zero_point * inputScale; + outputScale = (float)1.0f / attr[1]->scale; + outputZP = (float)attr[1]->zero_point; #define _PACK_SELECT_KEY( IN_TYPE, OUT_TYPE ) \ ( ( IN_TYPE << 16) | ( OUT_TYPE << 8)) diff --git a/src/tim/vx/internal/src/kernel/evis/floordiv_evis.c b/src/tim/vx/internal/src/kernel/evis/floordiv_evis.c index 86d4d58..d12998d 100644 --- a/src/tim/vx/internal/src/kernel/evis/floordiv_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/floordiv_evis.c @@ -129,9 +129,6 @@ DEF_KERNEL_INITIALIZER(_floordiv_initializer) vsi_nn_kernel_tensor_attr_t *output_attr = NULL; vsi_size_array_t *output_shape = NULL; vsi_nn_kernel_dtype_e input0_dtype = F16; - int32_t input0_fl = 0; - int32_t input1_fl = 0; - int32_t output_fl = 0; float inScale0 = 1.0f; float inScale1 = 1.0f; float outScale = 1.0f; @@ -169,59 +166,12 @@ DEF_KERNEL_INITIALIZER(_floordiv_initializer) (output_shape->data[2] + gpu_param.global_scale[2] - 1) / gpu_param.global_scale[2] : 1; - if( input0_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - input0_fl = input0_attr->dfp.fl; - if (input0_fl > 0) - { - inScale0 = 1.0f / (float) ((int64_t)1 << input0_fl); - } - else - { - inScale0 = (float)((int64_t)1 << -input0_fl); - } - } - else if( input0_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - inScale0 = input0_attr->asymm.scale; - in0Tail = -inScale0 * ((float)input0_attr->asymm.zero_point); - } - - if( input1_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - input1_fl = input1_attr->dfp.fl; - if (input1_fl > 0) - { - inScale1 = 1.0f / (float) ((int64_t)1 << input1_fl); - } - else - { - inScale1 = (float)((int64_t)1 << -input1_fl); - } - } - else if( input1_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - inScale1 = input1_attr->asymm.scale; - in1Tail = -inScale1 * ((float)input1_attr->asymm.zero_point); - } - - if( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - output_fl = 
output_attr->dfp.fl; - if (output_fl > 0) - { - outScale = (float) ((int64_t)1 << output_fl); - } - else - { - outScale = 1.0f / (float)((int64_t)1 << -output_fl); - } - } - else if( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - outScale = 1.0f / output_attr->asymm.scale; - outZp = (float)(output_attr->asymm.zero_point); - } + inScale0 = input0_attr->scale; + in0Tail = 0 - inScale0 * ((float)input0_attr->zero_point); + inScale1 = input1_attr->scale; + in1Tail = 0 - inScale1 * ((float)input1_attr->zero_point); + outScale = 1.0f / output_attr->scale; + outZp = (float)(output_attr->zero_point); if (BF16 == input0_dtype) { diff --git a/src/tim/vx/internal/src/kernel/evis/gather_evis.c b/src/tim/vx/internal/src/kernel/evis/gather_evis.c index cf4411e..c61565c 100644 --- a/src/tim/vx/internal/src/kernel/evis/gather_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/gather_evis.c @@ -22,7 +22,7 @@ * *****************************************************************************/ - +#if !(VX_TENSOR_GATHER_API_SUPPORT) #include #include #include @@ -202,6 +202,7 @@ static vx_param_description_t _gather_kernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, // Add kererl parameters here }; #define _GATHER_PARAM_NUM _cnt_of_array( _gather_kernel_param_def ) @@ -285,6 +286,7 @@ DEF_KERNEL_INITIALIZER(_gather_initializer) int32_t indices_num = 1; uint32_t input_dims1 = 0; int32_t batch = 1; + int32_t is_array = 0; vx_uint32 i = 0; vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL}; vsi_size_array_t * input1_shape = NULL; @@ -308,40 +310,13 @@ DEF_KERNEL_INITIALIZER(_gather_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &block_num); CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &is_array); + CHECK_STATUS_FAIL_GOTO(status, OnError ); - if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[0]->dfp.fl > 0) - { - src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); - } - else - { - src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); - } - } - else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - src0Scale = attr[0]->asymm.scale; - src0ZP = attr[0]->asymm.zero_point; - } - - if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[2]->dfp.fl > 0) - { - dstScale = (float)((int64_t)1 << attr[2]->dfp.fl); - } - else - { - dstScale = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl)); - } - } - else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - dstScale = 1.0f / attr[2]->asymm.scale; - dstZP = attr[2]->asymm.zero_point; - } + src0Scale = attr[0]->scale; + src0ZP = attr[0]->zero_point; + dstScale = 1.0f / attr[2]->scale; + dstZP = attr[2]->zero_point; input1_shape = attr[1]->shape; input_dims1 = (uint32_t)input1_shape->size; @@ -358,8 +333,16 @@ DEF_KERNEL_INITIALIZER(_gather_initializer) } shaderParam.global_scale[1] = 1; shaderParam.global_scale[2] = 1; - shaderParam.global_size[0] = gpu_align_p2((block_size + shaderParam.global_scale[0] - 1) - / shaderParam.global_scale[0], 4); + if (is_array) + { + shaderParam.global_size[0] = (block_size + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0]; + } + else + { + shaderParam.global_size[0] = gpu_align_p2((block_size + shaderParam.global_scale[0] - 1) 
+ / shaderParam.global_scale[0], 4); + } shaderParam.global_size[1] = indices_num; shaderParam.global_size[2] = block_num; @@ -508,39 +491,10 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer) status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &block_num); CHECK_STATUS_FAIL_GOTO(status, OnError ); - if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[0]->dfp.fl > 0) - { - src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); - } - else - { - src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); - } - } - else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - src0Scale = attr[0]->asymm.scale; - src0ZP = attr[0]->asymm.zero_point; - } - - if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[2]->dfp.fl > 0) - { - dstScale = (float)((int64_t)1 << attr[2]->dfp.fl); - } - else - { - dstScale = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl)); - } - } - else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - dstScale = 1.0f / attr[2]->asymm.scale; - dstZP = attr[2]->asymm.zero_point; - } + src0Scale = attr[0]->scale; + src0ZP = attr[0]->zero_point; + dstScale = 1.0f / attr[2]->scale; + dstZP = attr[2]->zero_point; input1_shape = attr[1]->shape; input_dims1 = (uint32_t)input1_shape->size; @@ -661,8 +615,11 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer) { status |= vsi_nn_kernel_gpu_add_param(node, "batch", &batch); } - status |= vsi_nn_kernel_gpu_add_param(node, "width", &width); - status |= vsi_nn_kernel_gpu_add_param(node, "remainder", &remainder); + if (indices_num > GPU_TENSOR_MAX_WIDTH || block_num > GPU_TENSOR_MAX_WIDTH) + { + status |= vsi_nn_kernel_gpu_add_param(node, "width", &width); + status |= vsi_nn_kernel_gpu_add_param(node, "remainder", &remainder); + } CHECK_STATUS_FAIL_GOTO(status, OnError ); OnError: @@ -841,6 +798,7 @@ static vsi_nn_kernel_node_t _setup tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_size ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &block_num ); tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &axis_num ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &is_array ); status = vsi_nn_kernel_node_pass_param( node, tmp_params, _GATHER_PARAM_NUM ); vsi_nn_kernel_scalar_release( &tmp_params[3] ); vsi_nn_kernel_scalar_release( &tmp_params[4] ); @@ -859,3 +817,4 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_EVIS( gather, _setup ) +#endif diff --git a/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c b/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c index 91c8f17..1d829fd 100644 --- a/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c @@ -290,39 +290,10 @@ DEF_KERNEL_INITIALIZER(_gather_nd_initializer) status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &block_size); CHECK_STATUS_FAIL_GOTO(status, OnError ); - if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[0]->dfp.fl > 0) - { - src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); - } - else - { - src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); - } - } - else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - src0Scale = attr[0]->asymm.scale; - src0ZP = attr[0]->asymm.zero_point; - } - - if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[2]->dfp.fl > 0) - { - dstScale = (float)((int64_t)1 << attr[2]->dfp.fl); - } - else - { - dstScale = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl)); - } - } - else if ( 
attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - dstScale = 1.0f / attr[2]->asymm.scale; - dstZP = attr[2]->asymm.zero_point; - } + src0Scale = attr[0]->scale; + src0ZP = attr[0]->zero_point; + dstScale = 1.0f / attr[2]->scale; + dstZP = attr[2]->zero_point; indices_num = (int32_t)(attr[1]->shape->data[1]); batch_num = (int32_t)(attr[1]->shape->size > 2 ? attr[1]->shape->data[2] : 1); diff --git a/src/tim/vx/internal/src/kernel/evis/grucell_activation_evis.c b/src/tim/vx/internal/src/kernel/evis/grucell_activation_evis.c index 631cfd9..3cf2829 100644 --- a/src/tim/vx/internal/src/kernel/evis/grucell_activation_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/grucell_activation_evis.c @@ -238,7 +238,7 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_initializer) float tensorZP[4] = {0.0f, 0.0f, 0.0f, 0.0f}; uint32_t i = 0; uint32_t pack_key = 0; - vsi_size_array_t * output_shape = NULL; + vsi_size_array_t * output_shape = NULL; vsi_nn_kernel_tensor_attr_t * attr[4] = { NULL, NULL, NULL, NULL }; VSI_UNREFERENCED(param_size); @@ -254,12 +254,8 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_initializer) for (i = 0; i < 4; i++) { - if( attr[i]->quant == VSI_NN_KERNEL_QUANT_ASYMM - || attr[i]->quant == VSI_NN_KERNEL_QUANT_SYMM) - { - tensorZP[i] = (float)attr[i]->asymm.zero_point; - tensorScale[i] = attr[i]->asymm.scale; - } + tensorZP[i] = (float)attr[i]->zero_point; + tensorScale[i] = attr[i]->scale; } tensorZP[0] = tensorScale[0] * tensorZP[0]; @@ -459,63 +455,31 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_cdnn_initializer) output_shape = attr[3]->shape; - if( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM - || attr[0]->quant == VSI_NN_KERNEL_QUANT_SYMM ) - { - input_scale = attr[0]->asymm.scale; - input_tail = 0 - input_scale * (float)attr[0]->asymm.zero_point; - } + input_scale = attr[0]->scale; + input_tail = 0 - input_scale * (float)attr[0]->zero_point; - if( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM - || attr[1]->quant == VSI_NN_KERNEL_QUANT_SYMM ) - { - input_r_scale = attr[1]->asymm.scale; - input_r_tail = 0 - input_r_scale * (float)attr[1]->asymm.zero_point; - } + input_r_scale = attr[1]->scale; + input_r_tail = 0 - input_r_scale * (float)attr[1]->zero_point; - if( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM - || attr[2]->quant == VSI_NN_KERNEL_QUANT_SYMM ) - { - recur_r_scale = attr[2]->asymm.scale; - recur_r_tail = 0 - recur_r_scale * (float)attr[2]->asymm.zero_point; - } + recur_r_scale = attr[2]->scale; + recur_r_tail = 0 - recur_r_scale * (float)attr[2]->zero_point; - if( attr[3]->quant == VSI_NN_KERNEL_QUANT_ASYMM - || attr[3]->quant == VSI_NN_KERNEL_QUANT_SYMM ) - { - output_scale = 1.0f / attr[3]->asymm.scale; - output_zp = (float)attr[3]->asymm.zero_point; - } + output_scale = 1.0f / attr[3]->scale; + output_zp = (float)attr[3]->zero_point; if ( param_size == _GRUCELL_CDNN_SEP_ACTIVATION_PARAM_NUM ) { - if( attr[4]->quant == VSI_NN_KERNEL_QUANT_ASYMM - || attr[4]->quant == VSI_NN_KERNEL_QUANT_SYMM ) - { - input_z_scale = attr[4]->asymm.scale; - input_z_tail = 0 - input_z_scale * (float)attr[4]->asymm.zero_point; - } + input_z_scale = attr[4]->scale; + input_z_tail = 0 - input_z_scale * (float)attr[4]->zero_point; - if( attr[5]->quant == VSI_NN_KERNEL_QUANT_ASYMM - || attr[5]->quant == VSI_NN_KERNEL_QUANT_SYMM ) - { - recur_z_scale = attr[5]->asymm.scale; - recur_z_tail = 0 - recur_z_scale * (float)attr[5]->asymm.zero_point; - } + recur_z_scale = attr[5]->scale; + recur_z_tail = 0 - recur_z_scale * (float)attr[5]->zero_point; - if( attr[6]->quant == VSI_NN_KERNEL_QUANT_ASYMM - 
|| attr[6]->quant == VSI_NN_KERNEL_QUANT_SYMM ) - { - input_c_scale = attr[6]->asymm.scale; - input_c_tail = 0 - input_c_scale * (float)attr[6]->asymm.zero_point; - } + input_c_scale = attr[6]->scale; + input_c_tail = 0 - input_c_scale * (float)attr[6]->zero_point; - if( attr[5]->quant == VSI_NN_KERNEL_QUANT_ASYMM - || attr[5]->quant == VSI_NN_KERNEL_QUANT_SYMM ) - { - recur_c_scale = attr[7]->asymm.scale; - recur_c_tail = 0 - recur_c_scale * (float)attr[7]->asymm.zero_point; - } + recur_c_scale = attr[7]->scale; + recur_c_tail = 0 - recur_c_scale * (float)attr[7]->zero_point; } if (layer_out == 1 || layer_out == 2) diff --git a/src/tim/vx/internal/src/kernel/evis/grucell_activation_z_h_evis.c b/src/tim/vx/internal/src/kernel/evis/grucell_activation_z_h_evis.c index 63360b4..b4b7b61 100644 --- a/src/tim/vx/internal/src/kernel/evis/grucell_activation_z_h_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/grucell_activation_z_h_evis.c @@ -119,6 +119,8 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_z_h_initializer) float hstate_in_tail = 0; float output_scale = 1.0f; float output_zp = 0; + float output_scale1 = 1.0f; + float output_zp1 = 0; uint32_t i = 0; uint32_t pack_key = 0; vsi_nn_kernel_tensor_attr_t* input_attr[GRUCELL_ACT_Z_H_IN_CNT] = {NULL}; @@ -142,33 +144,14 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_z_h_initializer) output_attr[1] = vsi_nn_kernel_tensor_attr_create( hstate_out ); CHECK_PTR_FAIL_GOTO( output_attr[1], "Create tensor attr buffer fail.", final ); - if ( VSI_NN_KERNEL_QUANT_DFP == input_attr[0]->quant ) - { - int8_t srcFixPointPos = (int8_t)input_attr[0]->dfp.fl; - if (srcFixPointPos >= 0) - hstate_in_scale *= 1.0f / (vx_float32) ((int64_t)1 << srcFixPointPos); - else if (srcFixPointPos < 0) - hstate_in_scale *= (vx_float32)((int64_t)1 << -srcFixPointPos); - } - else if ( VSI_NN_KERNEL_QUANT_ASYMM == input_attr[0]->quant ) - { - hstate_in_scale = input_attr[0]->asymm.scale; - hstate_in_tail = -(float)input_attr[0]->asymm.zero_point * hstate_in_scale; - } + hstate_in_scale = input_attr[0]->scale; + hstate_in_tail = -(float)input_attr[0]->zero_point * hstate_in_scale; - if ( VSI_NN_KERNEL_QUANT_DFP == output_attr[0]->quant ) - { - int8_t dstFixPointPos = (int8_t)output_attr[0]->dfp.fl; - if (dstFixPointPos >= 0) - output_scale *= (vx_float32)((int64_t)1 << dstFixPointPos); - else if (dstFixPointPos < 0) - output_scale *= 1.0f / (vx_float32) ((int64_t)1 << - dstFixPointPos); - } - else if ( VSI_NN_KERNEL_QUANT_ASYMM == output_attr[0]->quant ) - { - output_scale = 1.0f / output_attr[0]->asymm.scale; - output_zp = (float)output_attr[0]->asymm.zero_point; - } + output_scale = 1.0f / output_attr[0]->scale; + output_zp = (float)output_attr[0]->zero_point; + + output_scale1 = 1.0f / output_attr[1]->scale; + output_zp1 = (float)output_attr[1]->zero_point; pack_key = _PACK_SELECT_KEY( input_attr[0]->dtype, input_attr[1]->dtype, output_attr[0]->dtype); @@ -290,6 +273,8 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_z_h_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "hstate_in_tail", &hstate_in_tail); status |= vsi_nn_kernel_gpu_add_param(node, "output_scale", &output_scale); status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "output_scale1", &output_scale1); + status |= vsi_nn_kernel_gpu_add_param(node, "output_zp1", &output_zp1); CHECK_STATUS_FAIL_GOTO(status, final ); } break; diff --git a/src/tim/vx/internal/src/kernel/evis/grucell_h_times_activation_r_evis.c 
b/src/tim/vx/internal/src/kernel/evis/grucell_h_times_activation_r_evis.c index e3a2899..a4e885a 100644 --- a/src/tim/vx/internal/src/kernel/evis/grucell_h_times_activation_r_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/grucell_h_times_activation_r_evis.c @@ -132,19 +132,8 @@ DEF_KERNEL_INITIALIZER(_grucell_h_times_activation_r_initializer) output_attr[0] = vsi_nn_kernel_tensor_attr_create( output ); CHECK_PTR_FAIL_GOTO( output_attr[0], "Create tensor attr buffer fail.", final ); - if ( VSI_NN_KERNEL_QUANT_DFP == input_attr[0]->quant ) - { - int8_t srcFixPointPos = (int8_t)input_attr[0]->dfp.fl; - if (srcFixPointPos >= 0) - hstate_in_scale *= 1.0f / (vx_float32) ((int64_t)1 << srcFixPointPos); - else if (srcFixPointPos < 0) - hstate_in_scale *= (vx_float32)((int64_t)1 << -srcFixPointPos); - } - else if ( VSI_NN_KERNEL_QUANT_ASYMM == input_attr[0]->quant ) - { - hstate_in_scale = input_attr[0]->asymm.scale; - hstate_in_tail = -(float)input_attr[0]->asymm.zero_point * hstate_in_scale; - } + hstate_in_scale = input_attr[0]->scale; + hstate_in_tail = 0 - (float)input_attr[0]->zero_point * hstate_in_scale; pack_key = _PACK_SELECT_KEY( input_attr[0]->dtype, input_attr[1]->dtype, output_attr[0]->dtype); diff --git a/src/tim/vx/internal/src/kernel/evis/grucell_reset_after_activation_evis.c b/src/tim/vx/internal/src/kernel/evis/grucell_reset_after_activation_evis.c index f53a56a..e281d9a 100644 --- a/src/tim/vx/internal/src/kernel/evis/grucell_reset_after_activation_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/grucell_reset_after_activation_evis.c @@ -47,6 +47,7 @@ typedef enum _grucell_nn_activation_type_e SIGMOID = VSI_NN_ACT_SIGMOID, HARD_SIGMOID = VSI_NN_ACT_HARD_SIGMOID, TANH = VSI_NN_ACT_TANH, + RELU = VSI_NN_ACT_RELU, }grucell_nn_activation_type_e; #define _GRUCELL_RESET_AFTER_ACTIVATION_KERNEL_SOURCE "grucell_reset_after_activation" @@ -80,6 +81,11 @@ static const _kernel_map_type _grucell_reset_after_activation_kernel_map[] = PACK_KERNEL_MAP( I16, F16, I16, SIGMOID, SIGMOID ), PACK_KERNEL_MAP( F16, F16, F16, SIGMOID, SIGMOID ), PACK_KERNEL_MAP( BF16, BF16, BF16, SIGMOID, SIGMOID ), + PACK_KERNEL_MAP( U8, F16, U8, SIGMOID, RELU ), + PACK_KERNEL_MAP( I8, F16, I8, SIGMOID, RELU ), + PACK_KERNEL_MAP( I16, F16, I16, SIGMOID, RELU ), + PACK_KERNEL_MAP( F16, F16, F16, SIGMOID, RELU ), + PACK_KERNEL_MAP( BF16, BF16, BF16, SIGMOID, RELU ), }; @@ -148,33 +154,11 @@ DEF_KERNEL_INITIALIZER(_grucell_reset_after_activation_initializer) output_attr[1] = vsi_nn_kernel_tensor_attr_create( hstate_out ); CHECK_PTR_FAIL_GOTO( output_attr[1], "Create tensor attr buffer fail.", final ); - if ( VSI_NN_KERNEL_QUANT_DFP == input_attr[0]->quant ) - { - int8_t srcFixPointPos = (int8_t)input_attr[0]->dfp.fl; - if (srcFixPointPos >= 0) - hstate_in_scale *= 1.0f / (vx_float32) ((int64_t)1 << srcFixPointPos); - else if (srcFixPointPos < 0) - hstate_in_scale *= (vx_float32)((int64_t)1 << -srcFixPointPos); - } - else if ( VSI_NN_KERNEL_QUANT_ASYMM == input_attr[0]->quant ) - { - hstate_in_scale = input_attr[0]->asymm.scale; - hstate_in_tail = -(float)input_attr[0]->asymm.zero_point * hstate_in_scale; - } + hstate_in_scale = input_attr[0]->scale; + hstate_in_tail = -(float)input_attr[0]->zero_point * hstate_in_scale; - if ( VSI_NN_KERNEL_QUANT_DFP == output_attr[0]->quant ) - { - int8_t srcFixPointPos = (int8_t)input_attr[0]->dfp.fl; - if (srcFixPointPos >= 0) - output_scale *= (vx_float32)((int64_t)1 << srcFixPointPos); - else if (srcFixPointPos < 0) - output_scale *= 1.0f / (vx_float32) ((int64_t)1 << 
-srcFixPointPos); - } - else if ( VSI_NN_KERNEL_QUANT_ASYMM == output_attr[0]->quant ) - { - output_scale = 1.0f / output_attr[0]->asymm.scale; - output_zp = (float)output_attr[0]->asymm.zero_point; - } + output_scale = 1.0f / output_attr[0]->scale; + output_zp = (float)output_attr[0]->zero_point; pack_key = _PACK_SELECT_KEY( input_attr[0]->dtype, input_attr[1]->dtype, output_attr[0]->dtype); diff --git a/src/tim/vx/internal/src/kernel/evis/l2normalizescale_evis.c b/src/tim/vx/internal/src/kernel/evis/l2normalizescale_evis.c index 068257c..f55891f 100644 --- a/src/tim/vx/internal/src/kernel/evis/l2normalizescale_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/l2normalizescale_evis.c @@ -127,10 +127,8 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer) vsi_size_array_t * output_shape = NULL; vsi_nn_kernel_dtype_e input_dtype = F16; vsi_nn_kernel_dtype_e output_dtype = F16; - int32_t input_fl = 0; int32_t inputZP = 0; float inputScale = 1.0f; - int32_t output_fl = 0; int32_t outputZP = 0; float outputScale = 1.0f; float r_inputScale = 1.0f; @@ -153,41 +151,11 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer) input_dtype = input_attr->dtype; output_dtype = output_attr->dtype; - if ( VSI_NN_KERNEL_QUANT_DFP == input_attr->quant ) - { - input_fl = input_attr->dfp.fl; - if (input_fl >= 0) - { - inputScale = 1.0f / (float) ((int64_t)1 << input_fl); - } - else - { - inputScale = (float) ((int64_t)1 << -input_fl); - } - } - else if ( VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant ) - { - inputZP = input_attr->asymm.zero_point; - inputScale = input_attr->asymm.scale; - } + inputZP = input_attr->zero_point; + inputScale = input_attr->scale; - if ( VSI_NN_KERNEL_QUANT_DFP == output_attr->quant ) - { - output_fl = output_attr->dfp.fl; - if (output_fl >= 0) - { - outputScale = (float) ((int64_t)1 << output_fl); - } - else - { - outputScale = 1.0f / (float) ((int64_t)1 << -output_fl); - } - } - else if ( VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant ) - { - outputZP = output_attr->asymm.zero_point; - outputScale = 1.0f / output_attr->asymm.scale; - } + outputZP = output_attr->zero_point; + outputScale = 1.0f / output_attr->scale; e2InScale = inputScale * inputScale; r_inputScale = 1.0f / inputScale; diff --git a/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c b/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c index 0a477c5..5ecb4b7 100644 --- a/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c @@ -22,6 +22,7 @@ * *****************************************************************************/ +#if !(VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT) #include #include #include @@ -42,7 +43,11 @@ __BEGIN_DECLS #define SOURCE_AXIS0_1 "layer_normalization_1" #define SOURCE_AXIS0_2 "layer_normalization_2" #define SOURCE_AXIS0_3 "layer_normalization_3" -#define SOURCE_AXIS01 "layer_normalization_axis01" +#define SOURCE_AXIS01_SUM "layer_normalization_axis01_sum" +#define SOURCE_AXIS01_0 "layer_normalization_axis01_0" +#define SOURCE_AXIS01_1 "layer_normalization_axis01_1" +#define SOURCE_AXIS01_2 "layer_normalization_axis01_2" +#define SOURCE_AXIS01_3 "layer_normalization_axis01_3" #define HASH_LAYERNORM_SH_KERNEL_NAME(SRC0_TYPE, SCALE_TYPE, DST_TYPE) \ CVIVANTE_NAMESPACE("evis.layer_norm_axis0_"#SRC0_TYPE"_"#SCALE_TYPE"to"#DST_TYPE) @@ -88,15 +93,15 @@ __BEGIN_DECLS #define HASH_LN_AXIS01_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \ 
CVIVANTE_NAMESPACE("evis.layernorm_axis01_"#SRC0_TYPE"_"#SRC1_TYPE"to"#DST_TYPE) -#define LN_AXIS01_SUMS_KERNELS(IN0_TYPE, OUT_TYPE) \ +#define LN_AXIS01_SUMS_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ { HASH_LAYERNORM_KEY(IN0_TYPE, U4, OUT_TYPE, 0), \ HASH_LN_AXIS01_SUMS_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \ - SOURCE_AXIS01 }, + SOURCE }, -#define LAYERNORM_AXIS01_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE) \ +#define LAYERNORM_AXIS01_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \ { HASH_LAYERNORM_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0), \ HASH_LN_AXIS01_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE), \ - SOURCE_AXIS01 }, + SOURCE }, typedef struct { @@ -159,32 +164,32 @@ static const _kernel_map_type _layernorm_kernel_map[] = static const _kernel_map_type _layernorm_axis01_kernel_map[] = { // Register kernel here - LN_AXIS01_SUMS_KERNELS( I8, F32 ) - LN_AXIS01_SUMS_KERNELS( U8, F32 ) - LN_AXIS01_SUMS_KERNELS( F16, F32 ) - LN_AXIS01_SUMS_KERNELS( I16, F32 ) + LN_AXIS01_SUMS_KERNELS( I8, F32, SOURCE_AXIS01_SUM ) + LN_AXIS01_SUMS_KERNELS( U8, F32, SOURCE_AXIS01_SUM ) + LN_AXIS01_SUMS_KERNELS( F16, F32, SOURCE_AXIS01_SUM ) + LN_AXIS01_SUMS_KERNELS( I16, F32, SOURCE_AXIS01_SUM ) - LAYERNORM_AXIS01_KERNELS( U8, F16, U8 ) - LAYERNORM_AXIS01_KERNELS( U8, F16, F16 ) - LAYERNORM_AXIS01_KERNELS( I8, F16, I8 ) - LAYERNORM_AXIS01_KERNELS( I8, F16, F16 ) - LAYERNORM_AXIS01_KERNELS( F16, F16, F16 ) - LAYERNORM_AXIS01_KERNELS( F16, F16, I16 ) - LAYERNORM_AXIS01_KERNELS( F16, F16, I8 ) - LAYERNORM_AXIS01_KERNELS( F16, F16, U8 ) - LAYERNORM_AXIS01_KERNELS( I16, F16, I16 ) - LAYERNORM_AXIS01_KERNELS( I16, F16, F16 ) + LAYERNORM_AXIS01_KERNELS( U8, F16, U8, SOURCE_AXIS01_0 ) + LAYERNORM_AXIS01_KERNELS( U8, F16, F16, SOURCE_AXIS01_0 ) + LAYERNORM_AXIS01_KERNELS( I8, F16, I8, SOURCE_AXIS01_1 ) + LAYERNORM_AXIS01_KERNELS( I8, F16, F16, SOURCE_AXIS01_1 ) + LAYERNORM_AXIS01_KERNELS( F16, F16, F16, SOURCE_AXIS01_2 ) + LAYERNORM_AXIS01_KERNELS( F16, F16, I16, SOURCE_AXIS01_2 ) + LAYERNORM_AXIS01_KERNELS( F16, F16, I8, SOURCE_AXIS01_2 ) + LAYERNORM_AXIS01_KERNELS( F16, F16, U8, SOURCE_AXIS01_2 ) + LAYERNORM_AXIS01_KERNELS( I16, F16, I16, SOURCE_AXIS01_3 ) + LAYERNORM_AXIS01_KERNELS( I16, F16, F16, SOURCE_AXIS01_3 ) - LAYERNORM_AXIS01_KERNELS( U8, F32, U8 ) - LAYERNORM_AXIS01_KERNELS( U8, F32, F16 ) - LAYERNORM_AXIS01_KERNELS( I8, F32, I8 ) - LAYERNORM_AXIS01_KERNELS( I8, F32, F16 ) - LAYERNORM_AXIS01_KERNELS( F16, F32, F16 ) - LAYERNORM_AXIS01_KERNELS( F16, F32, I16 ) - LAYERNORM_AXIS01_KERNELS( F16, F32, I8 ) - LAYERNORM_AXIS01_KERNELS( F16, F32, U8 ) - LAYERNORM_AXIS01_KERNELS( I16, F32, I16 ) - LAYERNORM_AXIS01_KERNELS( I16, F32, F16 ) + LAYERNORM_AXIS01_KERNELS( U8, F32, U8, SOURCE_AXIS01_0 ) + LAYERNORM_AXIS01_KERNELS( U8, F32, F16, SOURCE_AXIS01_0 ) + LAYERNORM_AXIS01_KERNELS( I8, F32, I8, SOURCE_AXIS01_1 ) + LAYERNORM_AXIS01_KERNELS( I8, F32, F16, SOURCE_AXIS01_1 ) + LAYERNORM_AXIS01_KERNELS( F16, F32, F16, SOURCE_AXIS01_2 ) + LAYERNORM_AXIS01_KERNELS( F16, F32, I16, SOURCE_AXIS01_2 ) + LAYERNORM_AXIS01_KERNELS( F16, F32, I8, SOURCE_AXIS01_2 ) + LAYERNORM_AXIS01_KERNELS( F16, F32, U8, SOURCE_AXIS01_2 ) + LAYERNORM_AXIS01_KERNELS( I16, F32, I16, SOURCE_AXIS01_3 ) + LAYERNORM_AXIS01_KERNELS( I16, F32, F16, SOURCE_AXIS01_3 ) }; @@ -1165,3 +1170,4 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_EVIS( layer_norm, _setup ) +#endif diff --git a/src/tim/vx/internal/src/kernel/evis/log_softmax_evis.c b/src/tim/vx/internal/src/kernel/evis/log_softmax_evis.c index 4e7b8a0..37fddea 100644 --- 
a/src/tim/vx/internal/src/kernel/evis/log_softmax_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/log_softmax_evis.c @@ -22,6 +22,7 @@ * *****************************************************************************/ +#if !(VX_LOGSOFTMAX_VX_SUPPORT) #include #include #include @@ -34,15 +35,21 @@ #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" __BEGIN_DECLS #define HASH_LOG_SOFTMAX_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d) \ ((AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d)) + #define HASH_LOG_SOFTMAX_KERNEL_SOURCE_NAME(_suffix) \ "log_softmax_axis"#_suffix + #define HASH_LOG_SOFTMAX_EXCEED_KERNEL_SOURCE_NAME(_suffix) \ + "log_softmax_exceed_axis"#_suffix + + #define HASH_LOG_SOFTMAX_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE, _suffix) \ { HASH_LOG_SOFTMAX_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0), \ CVIVANTE_NAMESPACE("evis.log_softmax_axis"#AXIS"_"#IN_DTYPE"to"#OUT_DTYPE), \ @@ -53,11 +60,18 @@ __BEGIN_DECLS CVIVANTE_NAMESPACE("evis.log_softmax_axis"#AXIS"_"#IN_DTYPE"to"#OUT_DTYPE"_2D"), \ HASH_LOG_SOFTMAX_KERNEL_SOURCE_NAME(_suffix) }, -static const struct { +#define HASH_LOG_SOFTMAX_EXCEED_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE, _suffix) \ + { HASH_LOG_SOFTMAX_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0), \ + CVIVANTE_NAMESPACE("evis.log_softmax_exceed_axis"#AXIS"_"#IN_DTYPE"to"#OUT_DTYPE), \ + HASH_LOG_SOFTMAX_EXCEED_KERNEL_SOURCE_NAME(_suffix) }, + +typedef struct { uint32_t key; char* function_name; const char* source_name; - } _log_softmax_evis_kernel_map[] = + } _kernel_map_type; + +static const _kernel_map_type _log_softmax_evis_kernel_map[] = { HASH_LOG_SOFTMAX_KERNELS(0, F16, F16, 0) HASH_LOG_SOFTMAX_KERNELS(0, F16, I16, 0) @@ -126,6 +140,49 @@ static const struct { }; +static const _kernel_map_type _log_softmax_exceed_evis_kernel_map[] = +{ + + HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, F16, F16, 0) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, F16, I16, 0) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, F16, U8, 0) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, F16, I8, 0) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, I16, I16, 0) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, I16, F16, 0) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, BF16, BF16, 0_BF16) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, BF16, F32, 0_BF16) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, BF16, F16, 0_BF16) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, U8, U8, 0) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, U8, F16, 0) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, I8, I8, 0) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(0, I8, F16, 0) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, F16, F16, 1) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, F16, I16, 1) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, F16, U8, 1) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, F16, I8, 1) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, I16, I16, 1) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, I16, F16, 1) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, BF16, BF16, 1_BF16) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, BF16, F32, 1_BF16) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, BF16, F16, 1_BF16) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, U8, U8, 1) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, U8, F16, 1) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, I8, I8, 1) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(1, I8, F16, 1) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, F16, F16, 2) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, F16, I16, 2) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, F16, U8, 2) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, F16, I8, 2) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, I16, I16, 2) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, I16, F16, 2) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, BF16, 
BF16, 2) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, U8, U8, 2) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, U8, F16, 2) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, I8, I8, 2) + HASH_LOG_SOFTMAX_EXCEED_KERNELS(2, I8, F16, 2) + +}; + static vx_param_description_t kernel_param_def[] = { {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, @@ -133,7 +190,9 @@ static vx_param_description_t kernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, }; -#define _EVIS_PARAM_NUM _cnt_of_array(kernel_param_def) + + +#define _EVIS_PARAM_NUM _cnt_of_array(kernel_param_def) #define SCALAR_INPUT_AXIS (2) #define SCALAR_INPUT_BETA (3) @@ -157,7 +216,7 @@ DEF_KERNEL_INITIALIZER(_log_softmax_initializer) float beta = 0; float input_scale = 0; float output_scale = 0; - int32_t outputZP = 0; + float outputZP = 0; uint32_t inputWidth = 0; uint32_t inputWidthRemain4 = 0; vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL, NULL }; @@ -385,62 +444,25 @@ DEF_KERNEL_INITIALIZER(_log_softmax_initializer) } } + outputZP = (float)attr[1]->zero_point; + output_scale = 1.0f / (float)(attr[1]->scale); + if( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) { - int32_t fl = attr[1]->dfp.fl; - - if (fl > 0) - { - output_scale = (float)((int64_t)1 << fl); - } - else - { - output_scale = (float)1.0f / (float) ((int64_t)1 << -fl); - } - status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &output_scale ); CHECK_STATUS_FAIL_GOTO(status, final ); } else if (attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM) { - float output_offset_asymmetric = 0; - outputZP = attr[1]->asymm.zero_point; - output_scale = 1.0f / (float)(attr[1]->asymm.scale); - output_offset_asymmetric = (float)outputZP; - status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &output_scale ); status |= vsi_nn_kernel_gpu_add_param( node, - "output_offset_asymmetric", &output_offset_asymmetric ); + "output_offset_asymmetric", &outputZP ); CHECK_STATUS_FAIL_GOTO(status, final ); } - else - { - output_scale = 1; - outputZP = 0; - } - if( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - int32_t fl = attr[0]->dfp.fl; - if (fl > 0) - { - input_scale = 1.0f / (float) ((int64_t)1 << fl); - } - else - { - input_scale = (float)((int64_t)1 << -fl); - } - } - else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - input_scale = attr[0]->asymm.scale; - } - else - { - input_scale = 1.0f; - } + input_scale = attr[0]->scale; scaleLogE = scaleLogE * input_scale; beta = beta * input_scale; @@ -471,6 +493,296 @@ final: return status; } /* _log_softmax_initializer() */ +DEF_KERNEL_INITIALIZER(_log_softmax_exceed_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + int32_t axis = 0; + float beta = 0; + float input_scale = 0; + float output_scale = 0; + float outputZP = 0; + uint32_t inputWidth = 0; + uint32_t inputWidthRemain4 = 0; + int32_t width = 0; + int32_t height = 0; + int32_t depth = 0; + vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL, NULL }; + vsi_size_array_t * output_shape = NULL; + float logE = (float)(log10(exp(1.0f)) / log10(2.0f)); + float rlogE = (float)(log10(2.0f) / log10(exp(1.0f))); + float scaleLogE = 0; + + VSI_UNREFERENCED(param_size); + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = 
vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &axis); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[3], &beta); + CHECK_STATUS_FAIL_GOTO(status, final ); + + scaleLogE = logE * beta; + + output_shape = attr[1]->shape; + width = (int32_t)output_shape->data[0]; + height = (int32_t)output_shape->data[1]; + depth = output_shape->size > 2 ? (int32_t)output_shape->data[2] : 1; + gpu_param.dim = 2; + switch (axis) + { + case 0: + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_size[0] = 1; + gpu_param.global_size[1] = depth; + break; + case 1: + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + gpu_param.global_size[0] = + gpu_align_p2((output_shape->data[0] + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = 1; + break; + default: + break; + } + + { + gpu_dp_inst_t uniGetSubData0to3_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniGetSubData4to7_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00050004, 0x00070006, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniPackMaxData_2x8 = {{ + 0x00000111, // TCfg + 0x00000000, // ASelt + 0x00050300, 0x00000000, // ABin + 0x00000222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00004400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvBF16toF32_Part1_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x05050404, 0x07070606, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractHalf4_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00020000, 0x00060004, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and 
PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniGetSubLoData_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00110000, 0x00330022, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniGetSubHiData_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00550044, 0x00770066, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractHalf8_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractOddData_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x07050301, 0x07050301, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + switch( axis ) + { + case 0: + { + inputWidth = (uint32_t)(output_shape->data[axis] / 4 * 4); + inputWidthRemain4 = (uint32_t)(output_shape->data[axis] % 4); + + status = vsi_nn_kernel_gpu_add_param( node, + "inputWidth", &inputWidth ); + status |= vsi_nn_kernel_gpu_add_param( node, + "inputWidthRemain4", &inputWidthRemain4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniPackMaxData_2x8", &uniPackMaxData_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, "axisSize", &width ); + status |= vsi_nn_kernel_gpu_add_param( node, "height", &height); + if (attr[0]->dtype == BF16) + { + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtractHalf4_4x4", &uniExtractHalf4_4x4 ); + } + else + { + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniGetSubData0to3_4x4", &uniGetSubData0to3_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniGetSubData4to7_4x4", &uniGetSubData4to7_4x4 ); + } + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + case 1: + { + if (attr[0]->dtype == BF16) + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniExtractHalf8_2x8", &uniExtractHalf8_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvBF16toF32_Part1_2x8", &uniConvBF16toF32_Part1_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtractOddData_2x8", &uniExtractOddData_2x8 ); + } + else + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniGetSubLoData_4x4", &uniGetSubLoData_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + 
"uniGetSubHiData_4x4", &uniGetSubHiData_4x4 ); + } + status |= vsi_nn_kernel_gpu_add_param( node, "axisSize", &height ); + status |= vsi_nn_kernel_gpu_add_param( node, "depth", &depth); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + default: + break; + } + } + + outputZP = (float)attr[1]->zero_point; + output_scale = 1.0f / attr[1]->scale; + + if (attr[0]->dtype != BF16) + { + status = vsi_nn_kernel_gpu_add_param( node, + "outputScale", &output_scale ); + status |= vsi_nn_kernel_gpu_add_param( node, + "output_offset_asymmetric", &outputZP ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + input_scale = attr[0]->scale; + + scaleLogE = scaleLogE * input_scale; + beta = beta * input_scale; + + status |= vsi_nn_kernel_gpu_add_param( node, + "rlogE", &rlogE ); + status |= vsi_nn_kernel_gpu_add_param( node, + "betaValue", &beta ); + status |= vsi_nn_kernel_gpu_add_param( node, + "scaleLogE", &scaleLogE ); + + status |= vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final ); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + } + + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + } + + return status; + +} + static vsi_status _query_kernel ( vsi_nn_tensor_t* const* const inputs, @@ -513,7 +825,51 @@ static vsi_status _query_kernel return status; } /* _query_kernel() */ -static vsi_nn_kernel_node_t _setup +static vsi_status _query_kernel_exceed + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + int32_t axis, + vsi_nn_kernel_t* kernel + ) +{ + vsi_nn_kernel_dtype_e input_dtype; + vsi_nn_kernel_dtype_e output_dtype; + vsi_status status = VSI_FAILURE; + uint32_t key; + size_t i; + + input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = HASH_LOG_SOFTMAX_HASH_KEY( axis, input_dtype, output_dtype, 0 ); + + for( i = 0; i < _cnt_of_array(_log_softmax_exceed_evis_kernel_map); i ++ ) + { + if( _log_softmax_exceed_evis_kernel_map[i].key == key ) + { + break; + } + } + + if( i < _cnt_of_array(_log_softmax_exceed_evis_kernel_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _log_softmax_exceed_evis_kernel_map[i].function_name ); + kernel->info.parameters = kernel_param_def; + kernel->info.numParams = _cnt_of_array( kernel_param_def ); + kernel->info.initialize = _log_softmax_exceed_initializer; + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + _log_softmax_exceed_evis_kernel_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + _log_softmax_exceed_evis_kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} + +static vsi_nn_kernel_node_t _setup_not_exceed ( vsi_nn_graph_t * graph, vsi_nn_tensor_t ** inputs, @@ -528,7 +884,13 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t node_params[_EVIS_PARAM_NUM] = {NULL}; vsi_bool image_2d = FALSE; vsi_nn_kernel_node_t node = NULL; + vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; + vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 0 } }; + uint32_t rank_in = 0; int32_t axis = 0; + int32_t new_axis = 0; + vsi_bool ret = vx_false_e; + uint32_t i = 0; float beta = 1.0f; VSI_UNREFERENCED(input_num); @@ -537,15 +899,31 @@ static vsi_nn_kernel_node_t _setup axis = vsi_nn_kernel_param_get_int32(params, "axis"); beta = vsi_nn_kernel_param_get_float32(params, "beta"); - if( !vsi_nn_kernel_gpu_check_shape( 
inputs[0]->attr.size, - inputs[0]->attr.dim_num ) - || axis > 2) + ret = vsi_nn_kernel_optimize_softmax_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, axis, + shapes[0], &rank_in, &new_axis); + + if (ret) + { + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], shapes[0], rank_in ); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + outputs[0], shapes[0], rank_in ); + } + else { return NULL; } - image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1); - status = _query_kernel( inputs, outputs, axis, image_2d, kernel ); + if( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[0]->attr.size, + reshape_tensors[0]->attr.dim_num ) + || new_axis > 2) + { + return NULL; + } + + image_2d = (reshape_tensors[0]->attr.dim_num == 2 || reshape_tensors[0]->attr.size[2] == 1); + status = _query_kernel( inputs, outputs, new_axis, image_2d, kernel ); if( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); @@ -553,9 +931,9 @@ static vsi_nn_kernel_node_t _setup { /* Pass parameters to node. */ vsi_nn_kernel_node_pack_io( node_params, _EVIS_PARAM_NUM, - inputs, 1, outputs, 1 ); + reshape_tensors, 1, &reshape_tensors[1], 1 ); node_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create( - graph, I32, &axis ); + graph, I32, &new_axis ); node_params[SCALAR_INPUT_BETA] = vsi_nn_kernel_scalar_create( graph, F32, &beta ); @@ -565,10 +943,132 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_BETA] ); } } + + for (i = 0; i < 2; i++) + { + vsi_safe_release_tensor(reshape_tensors[i]); + } + return node; } /* _setup() */ +static vsi_nn_kernel_node_t _setup_exceed + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_EVIS_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + + vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; + vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 0 } }; + uint32_t rank_in = 0; + int32_t axis = 0; + int32_t new_axis = 0; + vsi_bool ret = vx_false_e; + uint32_t i = 0; + float beta = 1.0f; + + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + + axis = vsi_nn_kernel_param_get_int32(params, "axis"); + beta = vsi_nn_kernel_param_get_float32(params, "beta"); + + ret = vsi_nn_kernel_optimize_softmax_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, axis, + shapes[0], &rank_in, &new_axis); + + if (ret) + { + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], shapes[0], rank_in ); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + outputs[0], shapes[0], rank_in ); + } + else + { + return NULL; + } + + if( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[0]->attr.size, + reshape_tensors[0]->attr.dim_num ) + || new_axis > 1) + { + return NULL; + } + + status = _query_kernel_exceed(inputs, outputs, new_axis, kernel); + if( VSI_SUCCESS != status) + { + goto final; + } + + node = vsi_nn_kernel_create_node( graph, kernel ); + CHECK_PTR_FAIL_GOTO( node, "Create kernel fail.", final ); + if (node) + { + vsi_nn_kernel_node_pack_io(node_params, _EVIS_PARAM_NUM, + reshape_tensors, + input_num, + &reshape_tensors[1], + output_num); + node_params[2] = vsi_nn_kernel_scalar_create(graph, I32, &new_axis ); + node_params[3] = vsi_nn_kernel_scalar_create(graph, F32, &beta ); + + status = vsi_nn_kernel_node_pass_param( + node, node_params, 
_EVIS_PARAM_NUM); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &node_params[2] ); + vsi_nn_kernel_scalar_release( &node_params[3] ); + } + +final: + for (i = 0; i < 2; i++) + { + vsi_safe_release_tensor(reshape_tensors[i]); + } + + return node; +} + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_nn_kernel_node_t node = NULL; + vsi_size_t *input_size = inputs[0]->attr.size; + int32_t axis = 0; + axis = vsi_nn_kernel_param_get_int32(params, "axis"); + + if (input_size[axis] >= GPU_TENSOR_MAX_WIDTH) + { + node = _setup_exceed(graph, inputs, input_num, outputs, output_num, params, kernel); + } + else + { + node = _setup_not_exceed(graph, inputs, input_num, outputs, output_num, params, kernel); + } + + return node; +} + + __END_DECLS REGISTER_BACKEND_EVIS( log_softmax, _setup ) - +#endif diff --git a/src/tim/vx/internal/src/kernel/evis/lstmunit_activation_evis.c b/src/tim/vx/internal/src/kernel/evis/lstmunit_activation_evis.c index 46ab93f..2ec1b1a 100644 --- a/src/tim/vx/internal/src/kernel/evis/lstmunit_activation_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/lstmunit_activation_evis.c @@ -996,18 +996,14 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_initializer) float forget_bias = 0.0f; float outputScale = 1.0f; float outputZP = 0; - int32_t dstZP = 0; - float dstScale = 1.0f; vsi_nn_kernel_dtype_e cellFormat = F16; vsi_nn_kernel_dtype_e dstFormat = F16; - vsi_nn_kernel_quant_type_e dstQuantType = VSI_NN_KERNEL_QUANT_NONE; - int32_t dstFixPointPos = 0; - float logE = (vx_float32)(log10(exp(1.0f)) / log10(2.0f)); + float logE = (float)(log10(exp(1.0f)) / log10(2.0f)); float twoLogE = 2 * logE; uint32_t uint_min = 0xFBFFFFFF; uint32_t uint_max = 0x7BFFFFFF; - float float_min = *(vx_float32 *)&uint_min; - float float_max = *(vx_float32 *)&uint_max; + float float_min = *(float *)&uint_min; + float float_max = *(float *)&uint_max; float clip_Min_F[4] = {0}; float clip_Max_F[4] = {0}; uint32_t i = 0; @@ -1063,22 +1059,11 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_initializer) status = vsi_nn_kernel_scalar_read_float32( (vsi_nn_kernel_scalar_t)param[param_size - 1], &forget_bias ); CHECK_STATUS_FAIL_GOTO(status, final ); - cellFormat = attr[0]->dtype; - dstFormat = attr[1]->dtype; + cellFormat = attr[0]->dtype; + dstFormat = attr[1]->dtype; - dstQuantType = attr[1]->quant; - - if ( VSI_NN_KERNEL_QUANT_DFP == dstQuantType ) - { - dstFixPointPos = (int8_t)attr[1]->dfp.fl; - } - else if ( VSI_NN_KERNEL_QUANT_ASYMM == dstQuantType ) - { - dstZP = attr[1]->asymm.zero_point; - dstScale = attr[1]->asymm.scale; - } - - outputZP = (vx_float32)dstZP; + outputScale = 1.0f / attr[1]->scale; + outputZP = (float)attr[1]->zero_point; gpu_param.global_scale[0] = 4; gpu_param.global_scale[1] = 1; @@ -1182,20 +1167,6 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_initializer) 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant }, GPU_DP_TYPE_16}; - if (dstQuantType == VSI_NN_KERNEL_QUANT_DFP) - { - if (dstFixPointPos >= 0) - outputScale *= (vx_float32)((int64_t)1 << dstFixPointPos); - else if (dstFixPointPos < 0) - outputScale *= 1.0f / (vx_float32) ((int64_t)1 << -dstFixPointPos); - - outputZP = 0; - } - else if (dstQuantType == VSI_NN_KERNEL_QUANT_ASYMM) - { - outputScale = 1.0f / dstScale; - } - if ( cellFormat == F16 ) { status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractHalf4_4x4", 
&uniExtractHalf4_4x4); diff --git a/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c b/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c index 1b15caa..b643d9b 100644 --- a/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c @@ -288,67 +288,13 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer) status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &K); CHECK_STATUS_FAIL_GOTO(status, OnError ); - src0ZP = attr[0]->asymm.zero_point; - src0Scale = attr[0]->asymm.scale; - src1ZP = attr[1]->asymm.zero_point; - src1Scale = attr[1]->asymm.scale; - dstZP = (float)attr[2]->asymm.zero_point; - dstScale = attr[2]->asymm.scale; + src0ZP = attr[0]->zero_point; + src0Scale = attr[0]->scale; + src1ZP = attr[1]->zero_point; + src1Scale = attr[1]->scale; + dstZP = (float)attr[2]->zero_point; + dstScale = attr[2]->scale; - if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[0]->dfp.fl > 0) - { - src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); - } - else - { - src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); - } - src0ZP = 0; - } - else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE) - { - src0Scale = 1; - src0ZP = 0; - } - - if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[1]->dfp.fl > 0) - { - src1Scale = (1.0f / ((float) ((int64_t)1 << attr[1]->dfp.fl))); - } - else - { - src1Scale = ((float) ((int64_t)1 << -attr[1]->dfp.fl)); - } - src1ZP = 0; - } - else if (attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE) - { - src1Scale = 1; - src1ZP = 0; - } - - if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[2]->dfp.fl > 0) - { - dstScale = (float)((int64_t)1 << attr[2]->dfp.fl); - } - else - { - dstScale = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl)); - } - dstScale = 1.0f / dstScale; - dstZP = 0.0f; - } - else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_NONE ) - { - dstScale = 1; - dstZP = 0.0f; - } gpu_quantize_multiplier_16bit(src0Scale / 1.0f, &M0, &postShift0); gpu_quantize_multiplier_16bit(src1Scale / 1.0f, &M1, &postShift1); @@ -1266,67 +1212,12 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_cross_initializer) status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &axis_size); CHECK_STATUS_FAIL_GOTO(status, OnError ); - src0ZP = attr[0]->asymm.zero_point; - src0Scale = attr[0]->asymm.scale; - src1ZP = attr[1]->asymm.zero_point; - src1Scale = attr[1]->asymm.scale; - dstZP = (float)attr[2]->asymm.zero_point; - dstScale = attr[2]->asymm.scale; - - if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[0]->dfp.fl > 0) - { - src0Scale = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); - } - else - { - src0Scale = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); - } - src0ZP = 0; - } - else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE) - { - src0Scale = 1; - src0ZP = 0; - } - - if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[1]->dfp.fl > 0) - { - src1Scale = (1.0f / ((float) ((int64_t)1 << attr[1]->dfp.fl))); - } - else - { - src1Scale = ((float) ((int64_t)1 << -attr[1]->dfp.fl)); - } - src1ZP = 0; - } - else if (attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE) - { - src1Scale = 1; - src1ZP = 0; - } - - if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[2]->dfp.fl > 0) - { - dstScale = (float)((int64_t)1 << attr[2]->dfp.fl); - } - else - { - dstScale = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl)); - } - dstScale = 1.0f / dstScale; - dstZP = 0.0f; - } - else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_NONE ) - { - 
dstScale = 1; - dstZP = 0.0f; - } + src0ZP = attr[0]->zero_point; + src0Scale = attr[0]->scale; + src1ZP = attr[1]->zero_point; + src1Scale = attr[1]->scale; + dstZP = (float)attr[2]->zero_point; + dstScale = attr[2]->scale; mulKIn0In1Zp = (float)((int)(K + 3) / 4 * 4 * src1ZP * src0ZP); inOutScale = src0Scale * src1Scale / dstScale; diff --git a/src/tim/vx/internal/src/kernel/evis/maximum_evis.c b/src/tim/vx/internal/src/kernel/evis/maximum_evis.c index d862eb7..4e319da 100644 --- a/src/tim/vx/internal/src/kernel/evis/maximum_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/maximum_evis.c @@ -163,63 +163,12 @@ DEF_KERNEL_INITIALIZER(_maximum_initializer) CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); out_shape = attr[2]->shape; - - if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - int32_t fl = attr[0]->dfp.fl; - if (fl > 0) - { - input0_scale = 1.0f / (float) ((int64_t)1 << fl); - } - else - { - input0_scale = (float)((int64_t)1 << -fl); - } - } - else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM - || attr[0]->quant == VSI_NN_KERNEL_QUANT_SYMM) - { - input0_zp = attr[0]->asymm.zero_point; - input0_scale = attr[0]->asymm.scale; - } - - if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - int32_t fl = attr[1]->dfp.fl; - if (fl > 0) - { - input1_scale = 1.0f / (float) ((int64_t)1 << fl); - } - else - { - input1_scale = (float)((int64_t)1 << -fl); - } - } - else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM - || attr[1]->quant == VSI_NN_KERNEL_QUANT_SYMM) - { - input1_zp = attr[1]->asymm.zero_point; - input1_scale = attr[1]->asymm.scale; - } - - if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - int32_t fl = attr[2]->dfp.fl; - if (fl > 0) - { - output_scale = (float) ((int64_t)1 << fl); - } - else - { - output_scale = 1.0f / (float)((int64_t)1 << -fl); - } - } - else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM - || attr[2]->quant == VSI_NN_KERNEL_QUANT_SYMM) - { - output_zp = attr[2]->asymm.zero_point; - output_scale = 1.0f / attr[2]->asymm.scale; - } + input0_zp = attr[0]->zero_point; + input0_scale = attr[0]->scale; + input1_zp = attr[1]->zero_point; + input1_scale = attr[1]->scale; + output_zp = attr[2]->zero_point; + output_scale = 1.0f / attr[2]->scale; #define _PACK_SELECT_KEY( IN0_TYPE, IN1_TYPE, OUT_TYPE ) \ (IN0_TYPE | (IN1_TYPE << 8) | ( OUT_TYPE << 16)) @@ -454,30 +403,52 @@ static vsi_nn_kernel_node_t _setup vsi_nn_tensor_t* tmp_inputs[2] = { NULL }; vsi_nn_type_e dtype1 = inputs[0]->attr.dtype.vx_type; vsi_nn_type_e dtype2 = inputs[1]->attr.dtype.vx_type; + vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; + vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = { { 0 } }; + vsi_size_t new_rank = 0; + vsi_bool ret = TRUE; VSI_UNREFERENCED(input_num); VSI_UNREFERENCED(output_num); VSI_UNREFERENCED(params); - if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, - outputs[0]->attr.dim_num ) ) + ret = vsi_nn_kernel_optimize_eltwise_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, + inputs[1]->attr.size, inputs[1]->attr.dim_num, + outputs[0]->attr.size, outputs[0]->attr.dim_num, + shapes[0], shapes[1], shapes[2], &new_rank ); + + if (ret == FALSE) { - return NULL; + goto final; + } + + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], shapes[0], new_rank ); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + inputs[1], shapes[1], new_rank ); + reshape_tensors[2] = vsi_nn_reshape_tensor( graph, + outputs[0], shapes[2], new_rank ); + + if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[2]->attr.size, + 
reshape_tensors[2]->attr.dim_num ) ) + { + goto final; } // Reorder tensor if ( dtype1 != dtype2 && dtype1 == VSI_NN_TYPE_FLOAT16 ) { int32_t order[2] = {1, 0}; - vsi_nn_reorder_tensor( inputs, order, 2, tmp_inputs ); + vsi_nn_reorder_tensor( reshape_tensors, order, 2, tmp_inputs ); } else { - memmove( tmp_inputs, inputs, sizeof(vsi_nn_tensor_t*) * 2 ); + memmove( tmp_inputs, reshape_tensors, sizeof(vsi_nn_tensor_t*) * 2 ); } - image_2d = (outputs[0]->attr.dim_num == 2); - status = _query_kernel( tmp_inputs, outputs, image_2d, kernel ); + image_2d = (reshape_tensors[2]->attr.dim_num == 2); + status = _query_kernel( tmp_inputs, &reshape_tensors[2], image_2d, kernel ); if ( VSI_SUCCESS == status ) { node = vsi_nn_kernel_create_node( graph, kernel ); @@ -485,10 +456,16 @@ static vsi_nn_kernel_node_t _setup { /* Pass parameters to node. */ vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PARAM_NUM, - tmp_inputs, 2, outputs, 1 ); + tmp_inputs, 2, &reshape_tensors[2], 1 ); status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PARAM_NUM ); } } + +final: + vsi_safe_release_tensor(reshape_tensors[0]); + vsi_safe_release_tensor(reshape_tensors[1]); + vsi_safe_release_tensor(reshape_tensors[2]); + return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/evis/minimum_evis.c b/src/tim/vx/internal/src/kernel/evis/minimum_evis.c index cb9fc35..a86d57a 100644 --- a/src/tim/vx/internal/src/kernel/evis/minimum_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/minimum_evis.c @@ -163,63 +163,12 @@ DEF_KERNEL_INITIALIZER(_minimum_initializer) CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); out_shape = attr[2]->shape; - - if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - int32_t fl = attr[0]->dfp.fl; - if (fl > 0) - { - input0_scale = 1.0f / (float) ((int64_t)1 << fl); - } - else - { - input0_scale = (float)((int64_t)1 << -fl); - } - } - else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM - || attr[0]->quant == VSI_NN_KERNEL_QUANT_SYMM) - { - input0_zp = attr[0]->asymm.zero_point; - input0_scale = attr[0]->asymm.scale; - } - - if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - int32_t fl = attr[1]->dfp.fl; - if (fl > 0) - { - input1_scale = 1.0f / (float) ((int64_t)1 << fl); - } - else - { - input1_scale = (float)((int64_t)1 << -fl); - } - } - else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM - || attr[1]->quant == VSI_NN_KERNEL_QUANT_SYMM) - { - input1_zp = attr[1]->asymm.zero_point; - input1_scale = attr[1]->asymm.scale; - } - - if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - int32_t fl = attr[2]->dfp.fl; - if (fl > 0) - { - output_scale = (float) ((int64_t)1 << fl); - } - else - { - output_scale = 1.0f / (float)((int64_t)1 << -fl); - } - } - else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM - || attr[2]->quant == VSI_NN_KERNEL_QUANT_SYMM) - { - output_zp = attr[2]->asymm.zero_point; - output_scale = 1.0f / attr[2]->asymm.scale; - } + input0_zp = attr[0]->zero_point; + input0_scale = attr[0]->scale; + input1_zp = attr[1]->zero_point; + input1_scale = attr[1]->scale; + output_zp = attr[2]->zero_point; + output_scale = 1.0f / attr[2]->scale; #define _PACK_SELECT_KEY( IN0_TYPE, IN1_TYPE, OUT_TYPE ) \ (IN0_TYPE | (IN1_TYPE << 8) | ( OUT_TYPE << 16)) @@ -454,30 +403,52 @@ static vsi_nn_kernel_node_t _setup vsi_nn_tensor_t* tmp_inputs[2] = { NULL }; vsi_nn_type_e dtype1 = inputs[0]->attr.dtype.vx_type; vsi_nn_type_e dtype2 = inputs[1]->attr.dtype.vx_type; + vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; + vsi_size_t 
shapes[3][VSI_NN_MAX_DIM_NUM] = { { 0 } }; + vsi_size_t new_rank = 0; + vsi_bool ret = TRUE; VSI_UNREFERENCED(input_num); VSI_UNREFERENCED(output_num); VSI_UNREFERENCED(params); - if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, - outputs[0]->attr.dim_num ) ) + ret = vsi_nn_kernel_optimize_eltwise_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, + inputs[1]->attr.size, inputs[1]->attr.dim_num, + outputs[0]->attr.size, outputs[0]->attr.dim_num, + shapes[0], shapes[1], shapes[2], &new_rank ); + + if (ret == FALSE) { - return NULL; + goto final; + } + + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], shapes[0], new_rank ); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + inputs[1], shapes[1], new_rank ); + reshape_tensors[2] = vsi_nn_reshape_tensor( graph, + outputs[0], shapes[2], new_rank ); + + if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[2]->attr.size, + reshape_tensors[2]->attr.dim_num ) ) + { + goto final; } // Reorder tensor if ( dtype1 != dtype2 && dtype1 == VSI_NN_TYPE_FLOAT16 ) { int32_t order[2] = {1, 0}; - vsi_nn_reorder_tensor( inputs, order, 2, tmp_inputs ); + vsi_nn_reorder_tensor( reshape_tensors, order, 2, tmp_inputs ); } else { - memmove( tmp_inputs, inputs, sizeof(vsi_nn_tensor_t*) * 2 ); + memmove( tmp_inputs, reshape_tensors, sizeof(vsi_nn_tensor_t*) * 2 ); } - image_2d = (outputs[0]->attr.dim_num == 2); - status = _query_kernel( tmp_inputs, outputs, image_2d, kernel ); + image_2d = (reshape_tensors[2]->attr.dim_num == 2); + status = _query_kernel( tmp_inputs, &reshape_tensors[2], image_2d, kernel ); if ( VSI_SUCCESS == status ) { node = vsi_nn_kernel_create_node( graph, kernel ); @@ -485,10 +456,16 @@ static vsi_nn_kernel_node_t _setup { /* Pass parameters to node. */ vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PARAM_NUM, - tmp_inputs, 2, outputs, 1 ); + tmp_inputs, 2, &reshape_tensors[2], 1 ); status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PARAM_NUM ); } } + +final: + vsi_safe_release_tensor(reshape_tensors[0]); + vsi_safe_release_tensor(reshape_tensors[1]); + vsi_safe_release_tensor(reshape_tensors[2]); + return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/evis/mod_evis.c b/src/tim/vx/internal/src/kernel/evis/mod_evis.c index 70188f6..a2e28bf 100644 --- a/src/tim/vx/internal/src/kernel/evis/mod_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/mod_evis.c @@ -128,9 +128,6 @@ DEF_KERNEL_INITIALIZER(_mod_initializer) vsi_nn_kernel_tensor_attr_t *output_attr = NULL; vsi_size_array_t *output_shape = NULL; vsi_nn_kernel_dtype_e input0_dtype = F16; - int32_t input0_fl = 0; - int32_t input1_fl = 0; - int32_t output_fl = 0; float inScale0 = 1.0f; float inScale1 = 1.0f; float outScale = 1.0f; @@ -168,59 +165,12 @@ DEF_KERNEL_INITIALIZER(_mod_initializer) (output_shape->data[2] + gpu_param.global_scale[2] - 1) / gpu_param.global_scale[2] : 1; - if (input0_attr->quant == VSI_NN_KERNEL_QUANT_DFP) - { - input0_fl = input0_attr->dfp.fl; - if (input0_fl > 0) - { - inScale0 = 1.0f / (float) ((int64_t)1 << input0_fl); - } - else - { - inScale0 = (float)((int64_t)1 << -input0_fl); - } - } - else if (input0_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - inScale0 = input0_attr->asymm.scale; - in0Tail = -inScale0 * ((float)input0_attr->asymm.zero_point); - } - - if (input1_attr->quant == VSI_NN_KERNEL_QUANT_DFP) - { - input1_fl = input1_attr->dfp.fl; - if (input1_fl > 0) - { - inScale1 = 1.0f / (float) ((int64_t)1 << input1_fl); - } - else - { - inScale1 = (float)((int64_t)1 << -input1_fl); - } - } - 
else if (input1_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - inScale1 = input1_attr->asymm.scale; - in1Tail = -inScale1 * ((float)input1_attr->asymm.zero_point); - } - - if (output_attr->quant == VSI_NN_KERNEL_QUANT_DFP) - { - output_fl = output_attr->dfp.fl; - if (output_fl > 0) - { - outScale = (float) ((int64_t)1 << output_fl); - } - else - { - outScale = 1.0f / (float)((int64_t)1 << -output_fl); - } - } - else if (output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - outScale = 1.0f / output_attr->asymm.scale; - outZp = (float)(output_attr->asymm.zero_point); - } + inScale0 = input0_attr->scale; + in0Tail = 0 - inScale0 * ((float)input0_attr->zero_point); + inScale1 = input1_attr->scale; + in1Tail = 0 - inScale1 * ((float)input1_attr->zero_point); + outScale = 1.0f / output_attr->scale; + outZp = (float)(output_attr->zero_point); if (BF16 == input0_dtype) { diff --git a/src/tim/vx/internal/src/kernel/evis/moments_evis.c b/src/tim/vx/internal/src/kernel/evis/moments_evis.c index 9dc6eae..18bb050 100644 --- a/src/tim/vx/internal/src/kernel/evis/moments_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/moments_evis.c @@ -239,76 +239,12 @@ DEF_KERNEL_INITIALIZER(_moments_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); input_shape = attr[0]->shape; - - if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - input_zp = attr[0]->asymm.zero_point; - scaleIn = attr[0]->asymm.scale; - } - else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[0]->dfp.fl > 0) - { - scaleIn = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); - } - else - { - scaleIn = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); - } - - input_zp = 0; - } - else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE) - { - input_zp = 0; - scaleIn = 1; - } - - if (attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - output_ZP0 = (float)attr[1]->asymm.zero_point; - outputScale0 = 1.0f / attr[1]->asymm.scale; - } - else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[1]->dfp.fl > 0) - { - outputScale0 = (float)((int64_t)1 << attr[1]->dfp.fl); - } - else - { - outputScale0 = (1.0f / (float)((int64_t)1 << -attr[1]->dfp.fl)); - } - output_ZP0 = 0.0f; - } - else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE ) - { - outputScale0 = 1.0f; - output_ZP0 = 0.0f; - } - - if (attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - output_ZP1 = (float)attr[2]->asymm.zero_point; - outputScale1 = 1.0f / attr[2]->asymm.scale; - } - else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[2]->dfp.fl > 0) - { - outputScale1 = (float)((int64_t)1 << attr[2]->dfp.fl); - } - else - { - outputScale1 = (1.0f / (float)((int64_t)1 << -attr[2]->dfp.fl)); - } - output_ZP1 = 0.0f; - } - else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_NONE ) - { - outputScale1 = 1.0f; - output_ZP1 = 0.0f; - } + input_zp = attr[0]->zero_point; + scaleIn = attr[0]->scale; + output_ZP0 = (float)attr[1]->zero_point; + outputScale0 = 1.0f / attr[1]->scale; + output_ZP1 = (float)attr[2]->zero_point; + outputScale1 = 1.0f / attr[2]->scale; output_ZP[0] = output_ZP0; output_ZP[1] = output_ZP1; diff --git a/src/tim/vx/internal/src/kernel/evis/one_hot_evis.c b/src/tim/vx/internal/src/kernel/evis/one_hot_evis.c index de2d35a..46eaa81 100644 --- a/src/tim/vx/internal/src/kernel/evis/one_hot_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/one_hot_evis.c @@ -160,16 +160,13 @@ DEF_KERNEL_INITIALIZER(_one_hot_initializer) in_shape = attr[0]->shape; depth = (int32_t)(attr[1]->shape->data[1]); input_dtype = attr[0]->dtype; + input_zp = attr[0]->zero_point; + 
scaleIn = attr[0]->scale; if (VSI_NN_KERNEL_QUANT_DFP == attr[0]->quant) { srcFixPointPos = attr[0]->dfp.fl; } - else if (VSI_NN_KERNEL_QUANT_ASYMM == attr[0]->quant) - { - input_zp = attr[0]->asymm.zero_point; - scaleIn = attr[0]->asymm.scale; - } if (suffix_size == 1) { diff --git a/src/tim/vx/internal/src/kernel/evis/poolwithargmax_evis.c b/src/tim/vx/internal/src/kernel/evis/poolwithargmax_evis.c index e45704f..55b7e59 100644 --- a/src/tim/vx/internal/src/kernel/evis/poolwithargmax_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/poolwithargmax_evis.c @@ -155,41 +155,19 @@ DEF_KERNEL_INITIALIZER(_poolwithargmax_initializer) input_shape = input_attr->shape; src_dtype = input_attr->dtype; dst_dtype = output_attr->dtype; + inputScale = input_attr->scale; + input_ZP = input_attr->zero_point; + outputScale = output_attr->scale; + output_ZP = output_attr->zero_point; if( input_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) { input_fl = input_attr->dfp.fl; - if (input_fl > 0) - { - inputScale = 1.0f / (float) ((int64_t)1 << input_fl); - } - else - { - inputScale = (float)((int64_t)1 << -input_fl); - } - } - else if( input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - inputScale = input_attr->asymm.scale; - input_ZP = input_attr->asymm.zero_point; - } if( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) { output_fl = output_attr->dfp.fl; - if (output_fl > 0) - { - outputScale = 1.0f / (float) ((int64_t)1 << output_fl); - } - else - { - outputScale = (float)((int64_t)1 << -output_fl); - } - } - else if( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - outputScale = output_attr->asymm.scale; - output_ZP = output_attr->asymm.zero_point; - } if ( ( input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) diff --git a/src/tim/vx/internal/src/kernel/evis/pow_evis.c b/src/tim/vx/internal/src/kernel/evis/pow_evis.c index 679526e..8492528 100644 --- a/src/tim/vx/internal/src/kernel/evis/pow_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pow_evis.c @@ -22,6 +22,7 @@ * *****************************************************************************/ +#if !(VX_TENSOR_POW_API_SUPPORT) #include <stdint.h> #include <stdio.h> #include <string.h> @@ -158,64 +159,13 @@ DEF_KERNEL_INITIALIZER(_pow_initializer) attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError ); - out_shape = attr[2]->shape; - - if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - int32_t fl = attr[0]->dfp.fl; - if (fl > 0) - { - input0_scale = 1.0f / (float) ((int64_t)1 << fl); - } - else - { - input0_scale = (float)((int64_t)1 << -fl); - } - } - else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM - || attr[0]->quant == VSI_NN_KERNEL_QUANT_SYMM ) - { - input0_scale = attr[0]->asymm.scale; - input0_tail = 0 - (float)attr[0]->asymm.zero_point * input0_scale; - } - - if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - int32_t fl = attr[1]->dfp.fl; - if (fl > 0) - { - input1_scale = 1.0f / (float) ((int64_t)1 << fl); - } - else - { - input1_scale = (float)((int64_t)1 << -fl); - } - } - else if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM - || attr[1]->quant == VSI_NN_KERNEL_QUANT_SYMM) - { - input1_scale = attr[1]->asymm.scale; - input1_tail = 0 - (float)attr[1]->asymm.zero_point * input1_scale; - } - - if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - int32_t fl = attr[2]->dfp.fl; - if (fl > 0) - { - output_scale = (float) ((int64_t)1 << fl); - } - else - { - output_scale = 1.0f / (float)((int64_t)1 << -fl); - } - } - else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM -
|| attr[2]->quant == VSI_NN_KERNEL_QUANT_SYMM ) - { - output_zp = (float)attr[2]->asymm.zero_point; - output_scale = 1.0f / attr[2]->asymm.scale; - } + out_shape = attr[2]->shape; + input0_scale = attr[0]->scale; + input0_tail = 0 - (float)attr[0]->zero_point * input0_scale; + input1_scale = attr[1]->scale; + input1_tail = 0 - (float)attr[1]->zero_point * input1_scale; + output_zp = (float)attr[2]->zero_point; + output_scale = 1.0f / attr[2]->scale; #define _PACK_SELECT_KEY( IN0_TYPE, IN1_TYPE, OUT_TYPE ) \ (IN0_TYPE | (IN1_TYPE << 8) | ( OUT_TYPE << 16)) @@ -454,3 +404,4 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_EVIS( pow, _setup ) +#endif diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_bgra_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_bgra_evis.c index 52588a4..89b4785 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_bgra_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_bgra_evis.c @@ -140,28 +140,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_bgra_initializer) } enable_copy = (int32_t)(xRatio == (1 << 15) && yRatio == (1 << 15)); - if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) - { - if (attr[0]->dfp.fl > 0) - { - outputScale = (float)((int64_t)1 << attr[0]->dfp.fl); - } - else - { - outputScale = (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl)); - } - dstZP = 0; - } - else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - outputScale = 1.0f / attr[0]->asymm.scale; - dstZP = attr[0]->asymm.zero_point; - } - else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) - { - outputScale = 1; - dstZP = 0; - } + outputScale = 1.0f / attr[0]->scale; + dstZP = attr[0]->zero_point; shaderParam.global_scale[0] = 4; shaderParam.global_scale[1] = 1; diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_gray_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_gray_evis.c index 1973eb2..1ea8250 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_gray_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_gray_evis.c @@ -133,28 +133,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_gray_copy_initializer) width = (uint32_t)(out_shape->data[0]); height = (uint32_t)(out_shape->data[1]); - if( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[0]->dfp.fl > 0) - { - outputScale = (float)((int64_t)1 << attr[0]->dfp.fl); - } - else - { - outputScale = (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl)); - } - dstZP = 0.0f; - } - else if(attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - outputScale = 1.0f / attr[0]->asymm.scale; - dstZP = (float)attr[0]->asymm.zero_point; - } - else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) - { - outputScale = 1; - dstZP = 0.0f; - } + outputScale = 1.0f / attr[0]->scale; + dstZP = (float)attr[0]->zero_point; shaderParam.global_scale[0] = 16; shaderParam.global_scale[1] = 1; @@ -232,33 +212,11 @@ DEF_KERNEL_INITIALIZER(_pre_process_gray_initializer) CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); out_shape = attr[0]->shape; - dstZP = (float)attr[0]->asymm.zero_point; - outputScale = attr[0]->asymm.scale; + dstZP = (float)attr[0]->zero_point; + outputScale = 1.0f / attr[0]->scale; width = (uint32_t)(out_shape->data[0]); height = (uint32_t)(out_shape->data[1]); - if( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if (attr[0]->dfp.fl > 0) - { - outputScale = (float)((int64_t)1 << attr[0]->dfp.fl); - } - else - { - outputScale = (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl)); - } - dstZP = 0.0f; - } - else if(attr[0]->quant == 
VSI_NN_KERNEL_QUANT_ASYMM) - { - outputScale = 1.0f/outputScale; - } - else if( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) - { - outputScale = 1; - dstZP = 0.0f; - } - shaderParam.global_scale[0] = 4; shaderParam.global_scale[1] = 1; shaderParam.global_scale[2] = 1; @@ -499,8 +457,8 @@ OnError: } if (attr[1]) { - vsi_nn_kernel_tensor_attr_release( &attr[0] ); - attr[0] = NULL; + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; } return status; diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_rggb_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_rggb_evis.c new file mode 100644 index 0000000..a58c823 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_rggb_evis.c @@ -0,0 +1,884 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include <stdint.h> +#include <stdio.h> +#include <string.h> +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" + +__BEGIN_DECLS + +#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_SCALE_U8TOF16 \ + CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_scale_U8toF16") +#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_SCALE_U8TOI16 \ + CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_scale_U8toI16") +#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_SCALE_U8TOU8 \ + CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_scale_U8toU8") +#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_SCALE_U8TOI8 \ + CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_scale_U8toI8") +#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_COPY_U8TOU8 \ + CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_copy_U8toU8") +#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_COPY_U8TOI8 \ + CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_copy_U8toI8") +#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_COPY_U8TOI16 \ + CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_copy_U8toI16") +#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_COPY_U8TOF16 \ + CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_copy_U8toF16") + +// greater than a quarter +#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_SCALE_U8TOU8_GQ \ + CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_scale_U8toU8_gq") +#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_SCALE_U8TOI8_GQ \ + CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_scale_U8toU8_gq") +#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_SCALE_U8TOF16_GQ \ + CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_scale_U8toF16_gq") +#define VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_SCALE_U8TOI16_GQ \ + CVIVANTE_NAMESPACE("evis.pre_process_nv12_rggb_scale_U8toI16_gq") + +#define KERNEL_SOURCE_1 "pre_process_nv12_rggb_copy", +#define KERNEL_SOURCE_2 "pre_process_nv12_rggb_scale", + +typedef enum +{ + COPY = 0, + SCALE, + TRANS +} vsi_nn_kernel_convert_type_e; + +#define HASH_PRE_PROCESS_NV12_RGGB_KEY(_input0_type, _output_type, _convert_type, _greater_quarter) \ + ((_input0_type << 24) | (_output_type << 16) | (_convert_type << 8) | (_greater_quarter)) + +#define TENSOR_PRE_PROCESS_NV12_RGGB_KERNELS(IN0_TYPE, OUT_TYPE, CONVERT_TYPE, SOURCE) \ + { HASH_PRE_PROCESS_NV12_RGGB_KEY(IN0_TYPE, OUT_TYPE, CONVERT_TYPE, 0), \ + VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_##CONVERT_TYPE##_##IN0_TYPE##TO##OUT_TYPE, \ + SOURCE }, + +#define TENSOR_PRE_PROCESS_NV12_RGGB_GQ_KERNELS(IN0_TYPE, OUT_TYPE, CONVERT_TYPE, SOURCE) \ + { HASH_PRE_PROCESS_NV12_RGGB_KEY(IN0_TYPE, OUT_TYPE, CONVERT_TYPE, 1), \ + VX_KERNEL_NAME_PRE_PROCESS_NV12_RGGB_##CONVERT_TYPE##_##IN0_TYPE##TO##OUT_TYPE##_GQ, \ + SOURCE }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } pre_process_nv12_rggb_map[] = +{ + TENSOR_PRE_PROCESS_NV12_RGGB_KERNELS(U8, U8, COPY, KERNEL_SOURCE_1) + TENSOR_PRE_PROCESS_NV12_RGGB_KERNELS(U8, I8, COPY, KERNEL_SOURCE_1) + TENSOR_PRE_PROCESS_NV12_RGGB_KERNELS(U8, I16, COPY, KERNEL_SOURCE_1) + TENSOR_PRE_PROCESS_NV12_RGGB_KERNELS(U8, F16, COPY, KERNEL_SOURCE_1) + TENSOR_PRE_PROCESS_NV12_RGGB_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_NV12_RGGB_KERNELS(U8, I8, SCALE, KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_NV12_RGGB_KERNELS(U8, F16, SCALE, KERNEL_SOURCE_2) +
TENSOR_PRE_PROCESS_NV12_RGGB_KERNELS(U8, I16, SCALE, KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_NV12_RGGB_GQ_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_NV12_RGGB_GQ_KERNELS(U8, F16, SCALE, KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_NV12_RGGB_GQ_KERNELS(U8, I8, SCALE, KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_NV12_RGGB_GQ_KERNELS(U8, I16, SCALE, KERNEL_SOURCE_2) +}; + +static vx_param_description_t vxPreProcessNv12_RGGBKernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _EVIS_PRE_PROCESS_NV12_RGGB_PARAM_NUM _cnt_of_array(vxPreProcessNv12_RGGBKernel_param_def) + +static vsi_bool _check_nv12_type_from_env() +{ + vsi_bool ret = FALSE; + char* env_s = vsi_nn_getenv("VSI_NN_ENABLE_OCV_NV12"); + if (env_s) + { + ret = TRUE; + } + return ret; +} + +DEF_KERNEL_INITIALIZER(_pre_process_nv12_rggb_copy_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + float output_zp = 0; + float output_scale = 1; + int32_t reorder = 0; + int32_t order1 = 3; + uint32_t width = 0; + uint32_t height = 0; + int32_t nv_type = 0; + float bMean = 0.0f, gMean= 0.0f, rMean = 0.0f; + float b_scale = 0.0f, g_scale = 0.0f, r_scale = 0.0f; + float outputScaleVar_b = 0.0f, outputScaleVar_g = 0.0f, outputScaleVar_r = 0.0f; + float bMeanScaleVarZp = 0.0f, gMeanScaleVarZp = 0.0f, rMeanScaleVarZp = 0.0f; + + vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; + vsi_size_array_t * out_shape = NULL; + vsi_bool ocv_nv12 = _check_nv12_type_from_env(); + + VSI_UNREFERENCED(param_size); + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[7], &rMean); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[8], &gMean); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[9], &bMean); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[10], &r_scale); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = 
vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[11], &reorder); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[13], &nv_type); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[14], &g_scale); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[15], &b_scale); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + out_shape = attr[0]->shape; + output_scale = 1.0f / attr[0]->scale; + output_zp = (float)attr[0]->zero_point; + width = (uint32_t)(out_shape->data[0]); + height = (uint32_t)(out_shape->data[1]); + + if (reorder != 0) + { + reorder = 3; + order1 = 0; + } + + if (nv_type == VSI_NN_YUV_TYPE_NV21_BGGR) + { + int32_t tmporder = reorder; + reorder = order1; + order1 = tmporder; + } + + outputScaleVar_b = output_scale * b_scale; + outputScaleVar_g = output_scale * g_scale; + outputScaleVar_r = output_scale * r_scale; + bMeanScaleVarZp = output_zp - bMean * outputScaleVar_b; + gMeanScaleVarZp = output_zp - gMean * outputScaleVar_g; + rMeanScaleVarZp = output_zp - rMean * outputScaleVar_r; + + shaderParam.global_scale[0] = 4; + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0], 4); + shaderParam.global_size[1] = gpu_align_p2((height + shaderParam.global_scale[1] - 1) + / shaderParam.global_scale[1], 2); + shaderParam.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + { + gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertNV12toB_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00210000, 0x00630042, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3f1d3c00, 0x00000000, 0x3f1d3c00, 0x00000000, + 0x3f1d3c00, 0x00000000, 0x3f1d3c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertNV12toG_4x4 = {{ + 0x29292929, // TCfg + 0x14141414, // ASelt + 0x03210100, 0x07630542, // ABin + 0x2a2a2a2a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x35873c00, 0x000039bc, 0x35873c00, 0x000039bc, + 0x35873c00, 0x000039bc, 0x35873c00, 0x000039bc // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertNV12toR_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00310010, 0x00730052, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000, + 0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractYtoShortSub16_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, 
GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniExtractUVtoCharSub128_2x8 = {{ + 0x99999999, // TCfg + 0x44444444, // ASelt + 0x01000100, 0x03020302, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, + 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertHalftoFp16_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertUchartoFp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, + 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, + 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + if (ocv_nv12) + { + uniConvertNV12toB_4x4.data[2] = 0x00010000; + uniConvertNV12toB_4x4.data[3] = 0x00230022; + uniConvertNV12toB_4x4.data[8] = 0x40093ca7; + uniConvertNV12toB_4x4.data[10] = 0x40093ca7; + uniConvertNV12toB_4x4.data[12] = 0x40093ca7; + uniConvertNV12toB_4x4.data[14] = 0x40093ca7; + + uniConvertNV12toG_4x4.data[2] = 0x01010100; + uniConvertNV12toG_4x4.data[3] = 0x03230322; + uniConvertNV12toG_4x4.data[8] = 0x36413ca7; + uniConvertNV12toG_4x4.data[9] = 0x00003a81; + uniConvertNV12toG_4x4.data[10] = 0x36413ca7; + uniConvertNV12toG_4x4.data[11] = 0x00003a81; + uniConvertNV12toG_4x4.data[12] = 0x36413ca7; + uniConvertNV12toG_4x4.data[13] = 0x00003a81; + uniConvertNV12toG_4x4.data[14] = 0x36413ca7; + uniConvertNV12toG_4x4.data[15] = 0x00003a81; + + uniConvertNV12toR_4x4.data[2] = 0x00110010; + uniConvertNV12toR_4x4.data[3] = 0x00330032; + uniConvertNV12toR_4x4.data[8] = 0x3e623ca7; + uniConvertNV12toR_4x4.data[10] = 0x3e623ca7; + uniConvertNV12toR_4x4.data[12] = 0x3e623ca7; + uniConvertNV12toR_4x4.data[14] = 0x3e623ca7; + + uniExtractUVtoCharSub128_2x8.data[2] = 0x03020100; + uniExtractUVtoCharSub128_2x8.data[3] = 0x07060504; + + uniExtractYtoShortSub16_2x8.data[0] = 0x99999999; + uniExtractYtoShortSub16_2x8.data[1] = 0x44444444; + uniExtractYtoShortSub16_2x8.data[4] = 0xaaaaaaaa; + uniExtractYtoShortSub16_2x8.data[8] = 0x00010001; + uniExtractYtoShortSub16_2x8.data[9] = 0x00010001; + uniExtractYtoShortSub16_2x8.data[10] = 0x00010001; + uniExtractYtoShortSub16_2x8.data[11] = 0x00010001; + uniExtractYtoShortSub16_2x8.data[12] = 0x00010001; + uniExtractYtoShortSub16_2x8.data[13] = 0x00010001; + uniExtractYtoShortSub16_2x8.data[14] = 0x00010001; + uniExtractYtoShortSub16_2x8.data[15] = 0x00010001; + } + + status = vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toB_4x4", &uniConvertNV12toB_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toG_4x4", &uniConvertNV12toG_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toR_4x4", &uniConvertNV12toR_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder); + status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractUVtoCharSub128_2x8", &uniExtractUVtoCharSub128_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractYtoShortSub16_2x8", &uniExtractYtoShortSub16_2x8); + status |= 
vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_b", &outputScaleVar_b); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_g", &outputScaleVar_g); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_r", &outputScaleVar_r); + status |= vsi_nn_kernel_gpu_add_param(node, "bMeanScaleVarZp", &bMeanScaleVarZp); + status |= vsi_nn_kernel_gpu_add_param(node, "gMeanScaleVarZp", &gMeanScaleVarZp); + status |= vsi_nn_kernel_gpu_add_param(node, "rMeanScaleVarZp", &rMeanScaleVarZp); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertUchartoFp32_4x4", &uniConvertUchartoFp32_4x4); + CHECK_STATUS_FAIL_GOTO(status, OnError); + switch( attr[0]->dtype ) + { + case U8: + case I8: + case I16: + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniConvertInt32toUint8_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case F16: + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniConvertHalftoFp16_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + default: + break; + } + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + return status; +} /* _pre_process_nv12_rggb_copy_initializer() */ + +DEF_KERNEL_INITIALIZER(_pre_process_nv12_rggb_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + float output_zp = 0; + float output_scale = 1; + int32_t reorder = 0; + int32_t order1 = 3; + uint32_t width = 0; + uint32_t height = 0; + uint32_t roi_width = 0; + uint32_t roi_height = 0; + uint32_t xrIntFloat_16 = 0; + uint32_t yrIntFloat_16 = 0; + int32_t xRatio = 0; + int32_t yRatio = 0; + int32_t nv_type = 0; + float bMean = 0.0f, gMean= 0.0f, rMean = 0.0f; + float b_scale = 0.0f, g_scale = 0.0f, r_scale = 0.0f; + float outputScaleVar_b = 0.0f, outputScaleVar_g = 0.0f, outputScaleVar_r = 0.0f; + float bMeanScaleVarZp = 0.0f, gMeanScaleVarZp = 0.0f, rMeanScaleVarZp = 0.0f; + float resize = 0.0f; + vsi_bool ocv_nv12 = _check_nv12_type_from_env(); + + vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; + vsi_size_array_t * out_shape = NULL; + + VSI_UNREFERENCED(param_size); + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &xRatio); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &yRatio); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[7], &rMean); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[8], &gMean); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[9], &bMean); + 
CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[10], &r_scale); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[11], &reorder); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[13], &nv_type); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[14], &g_scale); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[15], &b_scale); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + out_shape = attr[1]->shape; + output_scale = 1.0f / attr[1]->scale; + output_zp = (float)attr[1]->zero_point; + width = (uint32_t)(out_shape->data[0]); + height = (uint32_t)(out_shape->data[1]); + + if (reorder != 0) + { + reorder = 3; + order1 = 0; + } + + if (nv_type == VSI_NN_YUV_TYPE_NV21_BGGR) + { + int32_t tmporder = reorder; + reorder = order1; + order1 = tmporder; + } + + roi_width = (xRatio * width) >> 15; + roi_height = (yRatio * height) >> 15; + resize = (float)width / roi_width; + xrIntFloat_16 = (uint32_t)((roi_width << 16) / width + 1); + yrIntFloat_16 = (uint32_t)((roi_height << 16) / height + 1); + + outputScaleVar_b = output_scale * b_scale; + outputScaleVar_g = output_scale * g_scale; + outputScaleVar_r = output_scale * r_scale; + bMeanScaleVarZp = output_zp - bMean * outputScaleVar_b; + gMeanScaleVarZp = output_zp - gMean * outputScaleVar_g; + rMeanScaleVarZp = output_zp - rMean * outputScaleVar_r; + + shaderParam.global_scale[0] = 4; + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0], 4); + shaderParam.global_size[1] = gpu_align_p2((height + shaderParam.global_scale[1] - 1) + / shaderParam.global_scale[1], 2); + shaderParam.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + { + gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniConvertNV12toB_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00210000, 0x00630042, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3f1d3c00, 0x00000000, 0x3f1d3c00, 0x00000000, + 0x3f1d3c00, 0x00000000, 0x3f1d3c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertNV12toG_4x4 = {{ + 0x29292929, // TCfg + 0x14141414, // ASelt + 0x03210100, 0x07630542, // ABin + 0x2a2a2a2a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x35873c00, 0x000039bc, 0x35873c00, 0x000039bc, + 0x35873c00, 0x000039bc, 0x35873c00, 0x000039bc // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertNV12toR_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00310010, 0x00730052, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000, + 
0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertYtoShortSub16_2x8 = {{ + 0x11111111, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x07060504, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniConvertHalftoFp16_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertUVtoCharSub128_2x8 = {{ + 0x99999999, // TCfg + 0x44444444, // ASelt + 0x03020100, 0x07060504, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, + 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + }, GPU_DP_TYPE_16 }; + + //trans + gpu_dp_inst_t uniCalculateYShift_2x8 = {{ + 0x00009999, // TCfg + 0x00000000, // ASelt + 0x06040200, 0x00000000, // ABin + 0x00005555, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniCalculateUVShift_2x8 = {{ + 0x51515151, // TCfg + 0x40404040, // ASelt + 0x02020000, 0x06060404, // ABin + 0x91919191, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00010000, 0x00000000, 0x00010000, + 0x00000000, 0x00010000, 0x00000000, 0x00010000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertUchartoFp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, + 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, + 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + if (ocv_nv12) + { + uniConvertNV12toB_4x4.data[2] = 0x00010000; + uniConvertNV12toB_4x4.data[3] = 0x00230022; + uniConvertNV12toB_4x4.data[8] = 0x40093ca7; + uniConvertNV12toB_4x4.data[10] = 0x40093ca7; + uniConvertNV12toB_4x4.data[12] = 0x40093ca7; + uniConvertNV12toB_4x4.data[14] = 0x40093ca7; + + uniConvertNV12toG_4x4.data[2] = 0x01010100; + uniConvertNV12toG_4x4.data[3] = 0x03230322; + uniConvertNV12toG_4x4.data[8] = 0x36413ca7; + uniConvertNV12toG_4x4.data[9] = 0x00003a81; + uniConvertNV12toG_4x4.data[10] = 0x36413ca7; + uniConvertNV12toG_4x4.data[11] = 0x00003a81; + uniConvertNV12toG_4x4.data[12] = 0x36413ca7; + uniConvertNV12toG_4x4.data[13] = 0x00003a81; + uniConvertNV12toG_4x4.data[14] = 0x36413ca7; + uniConvertNV12toG_4x4.data[15] = 0x00003a81; + + uniConvertNV12toR_4x4.data[2] = 0x00110010; + uniConvertNV12toR_4x4.data[3] = 0x00330032; + uniConvertNV12toR_4x4.data[8] = 0x3e623ca7; + uniConvertNV12toR_4x4.data[10] = 0x3e623ca7; + uniConvertNV12toR_4x4.data[12] = 0x3e623ca7; + uniConvertNV12toR_4x4.data[14] = 0x3e623ca7; + + uniConvertYtoShortSub16_2x8.data[0] = 0x99999999; + uniConvertYtoShortSub16_2x8.data[1] = 0x44444444; + uniConvertYtoShortSub16_2x8.data[4] = 0xaaaaaaaa; + 
uniConvertYtoShortSub16_2x8.data[8] = 0x00010001; + uniConvertYtoShortSub16_2x8.data[9] = 0x00010001; + uniConvertYtoShortSub16_2x8.data[10] = 0x00010001; + uniConvertYtoShortSub16_2x8.data[11] = 0x00010001; + uniConvertYtoShortSub16_2x8.data[12] = 0x00010001; + uniConvertYtoShortSub16_2x8.data[13] = 0x00010001; + uniConvertYtoShortSub16_2x8.data[14] = 0x00010001; + uniConvertYtoShortSub16_2x8.data[15] = 0x00010001; + } + + status = vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toB_4x4", &uniConvertNV12toB_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toG_4x4", &uniConvertNV12toG_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toR_4x4", &uniConvertNV12toR_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertUVtoCharSub128_2x8", &uniConvertUVtoCharSub128_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertYtoShortSub16_2x8", &uniConvertYtoShortSub16_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "xrIntFloat_16", &xrIntFloat_16); + status |= vsi_nn_kernel_gpu_add_param(node, "yrIntFloat_16", &yrIntFloat_16); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_b", &outputScaleVar_b); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_g", &outputScaleVar_g); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar_r", &outputScaleVar_r); + status |= vsi_nn_kernel_gpu_add_param(node, "bMeanScaleVarZp", &bMeanScaleVarZp); + status |= vsi_nn_kernel_gpu_add_param(node, "gMeanScaleVarZp", &gMeanScaleVarZp); + status |= vsi_nn_kernel_gpu_add_param(node, "rMeanScaleVarZp", &rMeanScaleVarZp); + + if (resize >= 0.25) + { + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateYShift_2x8", &uniCalculateYShift_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateUVShift_2x8", &uniCalculateUVShift_2x8); + } + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertUchartoFp32_4x4", &uniConvertUchartoFp32_4x4); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder); + status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + switch( attr[1]->dtype ) + { + case U8: + case I8: + case I16: + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniConvertInt32toUint8_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case F16: + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniConvertHalftoFp16_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + default: + break; + } + } + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + return status; +} /* _pre_process_nv12_rggb_initializer() */ + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel, + const vsi_nn_kernel_param_t * params, + int32_t scale_x + ) +{ + vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e output_dtype = U8; + vsi_nn_kernel_convert_type_e convert_type = SCALE; + vsi_status status = VSI_FAILURE; + uint32_t key = 0; + size_t i = 0; + vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" ); + vsi_size_t dstWidth = outputs[0]->attr.size[0]; + float scaleVal = (float)dstWidth / ((scale_x * dstWidth) >> 15); + uint32_t optFlg = 0; + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type 
); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (enable_copy) + { + convert_type = COPY; + } + else + { + convert_type = SCALE; + } + + if (scaleVal >= 0.25 && convert_type == SCALE) + { + optFlg = 1; + } + + key = HASH_PRE_PROCESS_NV12_RGGB_KEY( input0_dtype, output_dtype, convert_type, optFlg ); + + for ( i = 0; i < _cnt_of_array(pre_process_nv12_rggb_map); i ++ ) + { + if ( pre_process_nv12_rggb_map[i].key == key ) + { + break; + } + } + if ( i < _cnt_of_array(pre_process_nv12_rggb_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", pre_process_nv12_rggb_map[i].function_name ); + kernel->info.parameters = vxPreProcessNv12_RGGBKernel_param_def; + kernel->info.numParams = _cnt_of_array( vxPreProcessNv12_RGGBKernel_param_def ); + + if (convert_type == COPY) + { + kernel->info.initialize = _pre_process_nv12_rggb_copy_initializer; + } + else + { + kernel->info.initialize = _pre_process_nv12_rggb_initializer; + } + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + pre_process_nv12_rggb_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + pre_process_nv12_rggb_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t tmp_params[_EVIS_PRE_PROCESS_NV12_RGGB_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_t node = NULL; + int32_t trans = 0; + int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" ); + + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + status = _query_kernel( inputs, outputs, kernel, params, scale_x ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + uint32_t index = 3; + int32_t scale_y = vsi_nn_kernel_param_get_int32( params, "scale_y" ); + int32_t left = vsi_nn_kernel_param_get_int32( params, "left" ); + int32_t top = vsi_nn_kernel_param_get_int32( params, "top" ); + float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" ); + float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" ); + float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" ); + float r_scale = vsi_nn_kernel_param_get_float32( params, "r_scale" ); + float g_scale = vsi_nn_kernel_param_get_float32( params, "g_scale" ); + float b_scale = vsi_nn_kernel_param_get_float32( params, "b_scale" ); + int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); + int32_t nv_type = vsi_nn_kernel_param_get_int32( params, "nv_type" ); + + /* Pass parameters to node. 
*/ + vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_NV12_RGGB_PARAM_NUM, + inputs, 2, outputs, 1 ); + + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_y ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &top ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_scale ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &trans ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &nv_type ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_scale ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_scale ); + status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PRE_PROCESS_NV12_RGGB_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &tmp_params[3] ); + vsi_nn_kernel_scalar_release( &tmp_params[4] ); + vsi_nn_kernel_scalar_release( &tmp_params[5] ); + vsi_nn_kernel_scalar_release( &tmp_params[6] ); + vsi_nn_kernel_scalar_release( &tmp_params[7] ); + vsi_nn_kernel_scalar_release( &tmp_params[8] ); + vsi_nn_kernel_scalar_release( &tmp_params[9] ); + vsi_nn_kernel_scalar_release( &tmp_params[10] ); + vsi_nn_kernel_scalar_release( &tmp_params[11] ); + vsi_nn_kernel_scalar_release( &tmp_params[12] ); + vsi_nn_kernel_scalar_release( &tmp_params[13] ); + vsi_nn_kernel_scalar_release( &tmp_params[14] ); + vsi_nn_kernel_scalar_release( &tmp_params[15] ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( pre_process_nv12_rggb, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_evis.c index 256f7e5..d9f96b2 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_evis.c @@ -403,23 +403,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb888_planar_copy_initializer) out_shape = attr[0]->shape; width = (uint32_t)(out_shape->data[0]); - - if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if ( attr[0]->dfp.fl > 0 ) - { - output_scale *= (float)((int64_t)1 << attr[0]->dfp.fl); - } - else - { - output_scale *= (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl)); - } - } - else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - output_zp = (float)attr[0]->asymm.zero_point; - output_scale /= attr[0]->asymm.scale; - } + output_zp = (float)attr[0]->zero_point; + output_scale = 1.0f / attr[0]->scale; shaderParam.global_scale[0] = 16; shaderParam.global_scale[1] = 1; @@ -620,8 +605,8 @@ OnError: } if (attr[1]) { - vsi_nn_kernel_tensor_attr_release( &attr[0] ); - attr[0] = NULL; + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; } return status; diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_nhwc_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_nhwc_evis.c index ae559da..0504dff 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_nhwc_evis.c +++ 
b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_nhwc_evis.c @@ -463,22 +463,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb888_planar_copy_initializer) width = (uint32_t)(out_shape->data[0] / 3); height = (uint32_t)(out_shape->data[1]); - if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - if ( attr[0]->dfp.fl > 0 ) - { - output_scale *= (float)((int64_t)1 << attr[0]->dfp.fl); - } - else - { - output_scale *= (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl)); - } - } - else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - output_zp = (float)attr[0]->asymm.zero_point; - output_scale /= attr[0]->asymm.scale; - } + output_zp = (float)attr[0]->zero_point; + output_scale = 1.0f / attr[0]->scale; if (attr[0]->dtype == F16 || attr[0]->dtype == I16) { @@ -787,8 +773,8 @@ OnError: } if (attr[1]) { - vsi_nn_kernel_tensor_attr_release( &attr[0] ); - attr[0] = NULL; + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; } return status; diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb_evis.c index 984293b..dd27137 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb_evis.c @@ -179,28 +179,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer) } enable_copy = (int32_t)(xRatio == (1 << 15) && yRatio == (1 << 15)); - if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) - { - if (attr[0]->dfp.fl > 0) - { - outputScale = (float)((int64_t)1 << attr[0]->dfp.fl); - } - else - { - outputScale = (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl)); - } - outputZP = 0; - } - else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - outputScale = 1.0f / attr[0]->asymm.scale; - outputZP = (float)attr[0]->asymm.zero_point; - } - else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) - { - outputScale = 1; - outputZP = 0; - } + outputScale = 1.0f / attr[0]->scale; + outputZP = (float)attr[0]->zero_point; #define _PACK_SELECT_KEY( COPY_FLAG, REVERSE_FLAG, TRANS_FLAG) \ (COPY_FLAG | (REVERSE_FLAG << 24) | (TRANS_FLAG << 16) ) diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv444_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv444_evis.c index 4c322a8..1956b29 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv444_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv444_evis.c @@ -143,23 +143,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_copy_initializer) order1 = 0; } - if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - dstScale = 1.0f / attr[0]->asymm.scale; - dstZP = attr[0]->asymm.zero_point; - } - else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) - { - if (attr[0]->dfp.fl > 0) - { - dstScale = (float)((int64_t)1 << attr[0]->dfp.fl); - } - else - { - dstScale = (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl)); - } - dstZP = 0; - } + dstScale = 1.0f / attr[0]->scale; + dstZP = attr[0]->zero_point; shaderParam.global_scale[0] = 16; shaderParam.global_scale[1] = 1; @@ -501,8 +486,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); out_shape = attr[0]->shape; - dstZP = attr[0]->asymm.zero_point; - dstScale = attr[0]->asymm.scale; + dstZP = attr[0]->zero_point; + dstScale = 1.0f / attr[0]->scale; width = (uint32_t)(out_shape->data[0]); height = (uint32_t)(out_shape->data[1]); @@ -512,28 +497,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_initializer) order1 = 0; } - if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) - { - if 
(attr[0]->dfp.fl > 0) - { - dstScale = (vx_float32)((int64_t)1 << attr[0]->dfp.fl); - } - else - { - dstScale = (1.0f / (vx_float32)((int64_t)1 << -attr[0]->dfp.fl)); - } - dstZP = 0; - } - else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - dstScale = 1.0f / dstScale; - } - else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) - { - dstScale = 1; - dstZP = 0; - } - shaderParam.global_scale[0] = 4; shaderParam.global_scale[1] = 1; shaderParam.global_scale[2] = 1; diff --git a/src/tim/vx/internal/src/kernel/evis/prelu_evis.c b/src/tim/vx/internal/src/kernel/evis/prelu_evis.c index bed0b6c..4703424 100644 --- a/src/tim/vx/internal/src/kernel/evis/prelu_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/prelu_evis.c @@ -164,46 +164,24 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer) attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); - out_shape = attr[2]->shape; + out_shape = attr[2]->shape; + inputZP0 = attr[0]->zero_point; + input_scale0 = attr[0]->scale; + inputZP1 = attr[1]->zero_point; + input_scale1 = attr[1]->scale; + outputZP = (float)attr[2]->zero_point; + input_scale0 = input_scale0 / attr[2]->scale; + if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) { in0_fl = (int8_t)attr[0]->dfp.fl; - if (in0_fl >= 0) - { - input_scale0 = 1.0f / (vx_float32) ((int64_t)1 << in0_fl); - } - else if (in0_fl < 0) - { - input_scale0 = (vx_float32) ((int64_t)1 << -in0_fl); - } - } - else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - inputZP0 = attr[0]->asymm.zero_point; - input_scale0 = attr[0]->asymm.scale; - } - - if ( attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - inputZP1 = attr[1]->asymm.zero_point; - input_scale1 = attr[1]->asymm.scale; } if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_DFP ) { out_fl = (int8_t)attr[2]->dfp.fl; + } - if (out_fl >= 0) - input_scale0 *= (vx_float32)((int64_t)1 << out_fl); - else if (out_fl < 0) - input_scale0 *= 1.0f / (vx_float32) ((int64_t)1 << -out_fl); - } - else if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - out_fl = 1; - outputZP = (float)attr[2]->asymm.zero_point; - input_scale0 = input_scale0 / attr[2]->asymm.scale; - } shift0 = in0_fl - out_fl; is_2d_img = (out_shape->size < 3) || (out_shape->data[2] == 1); diff --git a/src/tim/vx/internal/src/kernel/evis/reducemax_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/reducemax_internal_evis.c index efb52f0..8e71126 100644 --- a/src/tim/vx/internal/src/kernel/evis/reducemax_internal_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/reducemax_internal_evis.c @@ -152,7 +152,6 @@ DEF_KERNEL_INITIALIZER(_reducemax_internal_initializer) vsi_nn_kernel_tensor_attr_t *output_attr = NULL; vsi_size_array_t * input_shape = NULL; vsi_size_array_t * output_shape = NULL; - int32_t input_fl = 0, output_fl = 0; int32_t axisSize = 0; float inputScale = 1.0f; float input_offset_asymmetric = 0.0f; @@ -257,68 +256,19 @@ DEF_KERNEL_INITIALIZER(_reducemax_internal_initializer) } } - if( input_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - input_fl = input_attr->dfp.fl; - if (input_fl > 0) - { - inputScale = 1.0f / (float) ((int64_t)1 << input_fl); - } - else - { - inputScale = (float)((int64_t)1 << -input_fl); - } - status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale ); - CHECK_STATUS_FAIL_GOTO(status, final ); - } - else if( input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - inputScale = input_attr->asymm.scale; - input_offset_asymmetric = (float)(input_attr->asymm.zero_point); - 
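/*
 * The branches being removed here (and in the other initializers touched by
 * this patch) all derive one affine (scale, zero_point) pair from the
 * tensor's quantization mode: DFP yields scale = 2^-fl with a zero point of
 * 0, ASYMM yields the stored scale/zero_point, and unquantized tensors fall
 * back to (1.0f, 0).  The replacement code simply reads that pair from
 * attr->scale / attr->zero_point.  A minimal sketch of the mapping the old
 * branches computed, using a free-standing helper that is illustrative only
 * and not part of ovxlib:
 */
#include <stdint.h>

typedef enum { QUANT_NONE_SKETCH, QUANT_DFP_SKETCH, QUANT_ASYMM_SKETCH } quant_mode_sketch_e;

static void quant_to_affine_sketch(quant_mode_sketch_e mode, int32_t fl,
                                   float asymm_scale, int32_t asymm_zero_point,
                                   float *scale, float *zero_point)
{
    if (mode == QUANT_DFP_SKETCH)
    {
        /* A positive fixed-point position fl means one LSB is worth 2^-fl. */
        *scale = (fl >= 0) ? 1.0f / (float)((int64_t)1 << fl)
                           : (float)((int64_t)1 << -fl);
        *zero_point = 0.0f;
    }
    else if (mode == QUANT_ASYMM_SKETCH)
    {
        *scale = asymm_scale;
        *zero_point = (float)asymm_zero_point;
    }
    else
    {
        *scale = 1.0f;
        *zero_point = 0.0f;
    }
}
/*
 * With the pair precomputed this way, the single remaining path reproduces
 * the removed logic: inputScale = input_attr->scale and
 * outputScale = 1.0f / output_attr->scale.
 */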
status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale ); - status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric ); - CHECK_STATUS_FAIL_GOTO(status, final ); - } - else - { - inputScale = 1.0f; - input_offset_asymmetric = 0; + inputScale = input_attr->scale; + input_offset_asymmetric = (float)(input_attr->zero_point); + outputScale = 1.0f / output_attr->scale; + output_offset_asymmetric = (float)(output_attr->zero_point); - status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale ); - status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric ); - CHECK_STATUS_FAIL_GOTO(status, final ); - } + status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale ); + status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric ); + CHECK_STATUS_FAIL_GOTO(status, final ); + + status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale ); + status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric ); + CHECK_STATUS_FAIL_GOTO(status, final ); - if( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - output_fl = output_attr->dfp.fl; - if (output_fl > 0) - { - outputScale = (float) ((int64_t)1 << output_fl); - } - else - { - outputScale = 1.0f / (float)((int64_t)1 << -output_fl); - } - status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale ); - CHECK_STATUS_FAIL_GOTO(status, final ); - } - else if( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - outputScale = 1.0f / output_attr->asymm.scale; - output_offset_asymmetric = (float)(output_attr->asymm.zero_point); - status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale ); - status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric ); - CHECK_STATUS_FAIL_GOTO(status, final ); - } - else - { - outputScale = 1.0f; - output_offset_asymmetric = 0; - status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale ); - status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric ); - CHECK_STATUS_FAIL_GOTO(status, final ); - } status = vsi_nn_kernel_gpu_add_param( node, "axisSize", &axisSize ); CHECK_STATUS_FAIL_GOTO(status, final ); status = vsi_nn_kernel_gpu_config( node, &gpu_param ); diff --git a/src/tim/vx/internal/src/kernel/evis/reducemin_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/reducemin_internal_evis.c index d9bd40d..aabac06 100644 --- a/src/tim/vx/internal/src/kernel/evis/reducemin_internal_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/reducemin_internal_evis.c @@ -154,7 +154,6 @@ DEF_KERNEL_INITIALIZER(_reducemin_internal_initializer) vsi_nn_kernel_tensor_attr_t *output_attr = NULL; vsi_size_array_t * input_shape = NULL; vsi_size_array_t * output_shape = NULL; - int32_t input_fl = 0, output_fl = 0; int32_t axisSize = 0; float inputScale = 1.0f; float input_offset_asymmetric = 0.0f; @@ -259,68 +258,18 @@ DEF_KERNEL_INITIALIZER(_reducemin_internal_initializer) } } - if( input_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - input_fl = input_attr->dfp.fl; - if (input_fl > 0) - { - inputScale = 1.0f / (float) ((int64_t)1 << input_fl); - } - else - { - inputScale = (float)((int64_t)1 << -input_fl); - } - status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale ); - CHECK_STATUS_FAIL_GOTO(status, final ); - } - else if( input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - inputScale = 
input_attr->asymm.scale; - input_offset_asymmetric = (float)(input_attr->asymm.zero_point); - status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale ); - status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric ); - CHECK_STATUS_FAIL_GOTO(status, final ); - } - else - { - inputScale = 1.0f; - input_offset_asymmetric = 0; + inputScale = input_attr->scale; + input_offset_asymmetric = (float)(input_attr->zero_point); + outputScale = 1.0f / output_attr->scale; + output_offset_asymmetric = (float)(output_attr->zero_point); - status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale ); - status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric ); - CHECK_STATUS_FAIL_GOTO(status, final ); - } + status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale ); + status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric ); + CHECK_STATUS_FAIL_GOTO(status, final ); - if( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - output_fl = output_attr->dfp.fl; - if (output_fl > 0) - { - outputScale = (float) ((int64_t)1 << output_fl); - } - else - { - outputScale = 1.0f / (float)((int64_t)1 << -output_fl); - } - status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale ); - CHECK_STATUS_FAIL_GOTO(status, final ); - } - else if( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - outputScale = 1.0f / output_attr->asymm.scale; - output_offset_asymmetric = (float)(output_attr->asymm.zero_point); - status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale ); - status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric ); - CHECK_STATUS_FAIL_GOTO(status, final ); - } - else - { - outputScale = 1.0f; - output_offset_asymmetric = 0; - status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale ); - status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric ); - CHECK_STATUS_FAIL_GOTO(status, final ); - } + status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale ); + status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric ); + CHECK_STATUS_FAIL_GOTO(status, final ); status = vsi_nn_kernel_gpu_add_param( node, "axisSize", &axisSize ); CHECK_STATUS_FAIL_GOTO(status, final ); diff --git a/src/tim/vx/internal/src/kernel/evis/reduceprod_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/reduceprod_internal_evis.c index 3c710f5..952063a 100644 --- a/src/tim/vx/internal/src/kernel/evis/reduceprod_internal_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/reduceprod_internal_evis.c @@ -160,7 +160,6 @@ DEF_KERNEL_INITIALIZER(_reduceprod_internal_initializer) vsi_size_array_t * output_shape = NULL; vsi_nn_kernel_dtype_e src_dtype = F16; vsi_nn_kernel_dtype_e dst_dtype = F16; - int32_t input_fl = 0, output_fl = 0; int32_t axisSize = 0; float inputScale = 1.0f; float input_offset_asymmetric = 0.0f; @@ -348,68 +347,17 @@ DEF_KERNEL_INITIALIZER(_reduceprod_internal_initializer) CHECK_STATUS_FAIL_GOTO(status, final ); } - if( input_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - input_fl = input_attr->dfp.fl; - if (input_fl > 0) - { - inputScale = 1.0f / (float) ((int64_t)1 << input_fl); - } - else - { - inputScale = (float)((int64_t)1 << -input_fl); - } - status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale ); - CHECK_STATUS_FAIL_GOTO(status, final ); - } - else if( 
input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - inputScale = input_attr->asymm.scale; - input_offset_asymmetric = (float)(input_attr->asymm.zero_point); - status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale ); - status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric ); - CHECK_STATUS_FAIL_GOTO(status, final ); - } - else - { - inputScale = 1.0f; - input_offset_asymmetric = 0; + inputScale = input_attr->scale; + input_offset_asymmetric = (float)(input_attr->zero_point); + status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale ); + status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric ); + CHECK_STATUS_FAIL_GOTO(status, final ); - status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale ); - status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric ); - CHECK_STATUS_FAIL_GOTO(status, final ); - } - - if( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - output_fl = output_attr->dfp.fl; - if (output_fl > 0) - { - outputScale = (float) ((int64_t)1 << output_fl); - } - else - { - outputScale = 1.0f / (float)((int64_t)1 << -output_fl); - } - status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale ); - CHECK_STATUS_FAIL_GOTO(status, final ); - } - else if( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - outputScale = 1.0f / output_attr->asymm.scale; - output_offset_asymmetric = (float)(output_attr->asymm.zero_point); - status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale ); - status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric ); - CHECK_STATUS_FAIL_GOTO(status, final ); - } - else - { - outputScale = 1.0f; - output_offset_asymmetric = 0; - status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale ); - status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric ); - CHECK_STATUS_FAIL_GOTO(status, final ); - } + outputScale = 1.0f / output_attr->scale; + output_offset_asymmetric = (float)(output_attr->zero_point); + status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale ); + status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric ); + CHECK_STATUS_FAIL_GOTO(status, final ); status = vsi_nn_kernel_gpu_config( node, &gpu_param ); diff --git a/src/tim/vx/internal/src/kernel/evis/relu_keras_evis.c b/src/tim/vx/internal/src/kernel/evis/relu_keras_evis.c index 1311117..4dcc321 100644 --- a/src/tim/vx/internal/src/kernel/evis/relu_keras_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/relu_keras_evis.c @@ -138,8 +138,6 @@ DEF_KERNEL_INITIALIZER(_relu_keras_initializer) float inputTail = 0.0f; float output_ZP = 0; float input_ZP = 0; - int32_t srcFixPointPos = 0; - int32_t dstFixPointPos = 0; VSI_UNREFERENCED(param_size); @@ -154,25 +152,10 @@ DEF_KERNEL_INITIALIZER(_relu_keras_initializer) output_dtype = output_attr->dtype; offset = alpha * threshold; - if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant) - { - srcFixPointPos = input_attr->dfp.fl; - } - else if (VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant) - { - input_ZP = (float)(input_attr->asymm.zero_point); - scaleIn = input_attr->asymm.scale; - } - - if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant) - { - dstFixPointPos = output_attr->dfp.fl; - } - else if (VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant) - { - output_ZP = (float)(output_attr->asymm.zero_point); - 
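/*
 * A convention worth noting for all of these initializers: the input side is
 * handed to the kernel as the scale itself while the output side is handed
 * the reciprocal, which is why the replacement code reads
 * scaleIn = input_attr->scale but scaleOut = 1.0f / output_attr->scale.
 * A value then makes the round trip as real = (q_in - input_ZP) * scaleIn
 * followed by q_out = real * scaleOut + output_ZP.  A small host-side sketch
 * of that round trip (helper names are illustrative, not ovxlib API):
 */
#include <math.h>
#include <stdint.h>

static float dequantize_sketch(int32_t q, float scale, float zero_point)
{
    return ((float)q - zero_point) * scale;
}

static int32_t requantize_sketch(float value, float inv_scale, float zero_point)
{
    /* Round to nearest; a reasonable choice for the illustration. */
    return (int32_t)lroundf(value * inv_scale + zero_point);
}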
scaleOut = 1.0f / output_attr->asymm.scale; - } + input_ZP = (float)(input_attr->zero_point); + scaleIn = input_attr->scale; + output_ZP = (float)(output_attr->zero_point); + scaleOut = 1.0f / output_attr->scale; gpu_param.global_scale[0] = 8; gpu_param.global_scale[1] = 1; @@ -195,11 +178,6 @@ DEF_KERNEL_INITIALIZER(_relu_keras_initializer) } else if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant) { - if (srcFixPointPos >=0 ) - scaleIn = 1.0f / (float) ((int64_t)1 << srcFixPointPos); - else - scaleIn = (float) ((int64_t)1 << -srcFixPointPos); - status = vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); CHECK_STATUS_FAIL_GOTO(status, final ); } @@ -212,11 +190,6 @@ DEF_KERNEL_INITIALIZER(_relu_keras_initializer) } else if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant) { - if (dstFixPointPos >=0 ) - scaleOut = (float) ((int64_t)1 << dstFixPointPos); - else - scaleOut = 1.0f / (float) ((int64_t)1 << -dstFixPointPos); - status = vsi_nn_kernel_gpu_add_param(node, "output_scale", &scaleOut); CHECK_STATUS_FAIL_GOTO(status, final ); } diff --git a/src/tim/vx/internal/src/kernel/evis/resize_1d_bilinear_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_1d_bilinear_evis.c index 95c33b8..783f5f9 100644 --- a/src/tim/vx/internal/src/kernel/evis/resize_1d_bilinear_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/resize_1d_bilinear_evis.c @@ -197,8 +197,6 @@ DEF_KERNEL_INITIALIZER(_resize_1d_bilinear_initializer) int32_t half_pixel_centers = 0; uint32_t depth = 0; - int32_t srcFixPointPos = 0; - int32_t dstFixPointPos = 0; float input_scale = 1.0; int32_t inputZP = 0; float output_scale = 1.0; @@ -259,53 +257,10 @@ DEF_KERNEL_INITIALIZER(_resize_1d_bilinear_initializer) half_pixel_value = 0.0f; } - if (VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant ) - { - input_scale = input_attr->asymm.scale; - inputZP = input_attr->asymm.zero_point; - } - else if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant) - { - srcFixPointPos = input_attr->dfp.fl; - if (srcFixPointPos >= 0) - { - input_scale = 1.0f / (vx_float32) ((int64_t)1 << srcFixPointPos); - } - else if (srcFixPointPos < 0) - { - input_scale = (vx_float32)((int64_t)1 << -srcFixPointPos); - } - inputZP = 0; - } - else - { - input_scale = 1.0f; - inputZP = 0; - } - - if (VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant ) - { - output_scale = output_attr->asymm.scale; - outputZP = output_attr->asymm.zero_point; - } - else if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant) - { - dstFixPointPos = output_attr->dfp.fl; - if (dstFixPointPos >= 0) - { - output_scale = (vx_float32) ((int64_t)1 << dstFixPointPos); - } - else if (dstFixPointPos < 0) - { - output_scale = 1.0f / (vx_float32) ((int64_t)1 << -dstFixPointPos); - } - outputZP = 0; - } - else - { - output_scale = 1.0; - outputZP = 0; - } + input_scale = input_attr->scale; + inputZP = input_attr->zero_point; + output_scale = output_attr->scale; + outputZP = output_attr->zero_point; if (is_run_nx_kernel) { @@ -473,7 +428,7 @@ DEF_KERNEL_INITIALIZER(_resize_1d_bilinear_initializer) } else if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant) { - float dfpScale = input_scale * output_scale; + float dfpScale = input_scale / output_scale; gpu_dp_inst_t uniConvertDFP2FP32_4x4 = {{ 0x01010101, // TCfg 0x00000000, // ASelt diff --git a/src/tim/vx/internal/src/kernel/evis/resize_1d_nearest_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_1d_nearest_evis.c index fddd1e3..97c83ff 100644 --- a/src/tim/vx/internal/src/kernel/evis/resize_1d_nearest_evis.c +++ 
b/src/tim/vx/internal/src/kernel/evis/resize_1d_nearest_evis.c @@ -198,52 +198,19 @@ DEF_KERNEL_INITIALIZER(_resize_1d_nearest_initializer) half_pixel_value = 0.0f; } - if (VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant ) - { - input_scale = input_attr->asymm.scale; - inputZP = input_attr->asymm.zero_point; - } - else if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant) + input_scale = input_attr->scale; + inputZP = input_attr->zero_point; + output_scale = 1.0f / output_attr->scale; + outputZP = output_attr->zero_point; + + if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant) { srcFixPointPos = input_attr->dfp.fl; - if (srcFixPointPos >= 0) - { - input_scale = 1.0f / (float) ((int64_t)1 << srcFixPointPos); - } - else if (srcFixPointPos < 0) - { - input_scale = (float)((int64_t)1 << -srcFixPointPos); - } - inputZP = 0; - } - else - { - input_scale = 1.0f; - inputZP = 0; } - if (VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant ) - { - output_scale = 1.0f / output_attr->asymm.scale; - outputZP = output_attr->asymm.zero_point; - } - else if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant) + if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant) { dstFixPointPos = output_attr->dfp.fl; - if (dstFixPointPos >= 0) - { - output_scale = (float) ((int64_t)1 << dstFixPointPos); - } - else if (dstFixPointPos < 0) - { - output_scale = 1.0f / (float) ((int64_t)1 << -dstFixPointPos); - } - outputZP = 0; - } - else - { - output_scale = 1.0; - outputZP = 0; } if (F16 == input_dtype && F16 == output_dtype) diff --git a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c index ebfe9ed..d3d3375 100644 --- a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c @@ -122,12 +122,16 @@ static const _kernel_map_type _resize_bilinear_kernel_map[] = PACK_KERNEL_MAP_DOWN(I16, I16), PACK_KERNEL_MAP_DOWN(U8, F16), PACK_KERNEL_MAP_DOWN(U8, U8), + PACK_KERNEL_MAP_DOWN(U16, F16), + PACK_KERNEL_MAP_DOWN(U16, U16), PACK_KERNEL_MAP_DOWN(F16, F16), PACK_KERNEL_MAP_DOWN(F16, U8), + PACK_KERNEL_MAP_DOWN(F16, U16), PACK_KERNEL_MAP_DOWN(BF16, BF16), PACK_KERNEL_MAP_UP(I8, I8), PACK_KERNEL_MAP_UP(I16, I16), PACK_KERNEL_MAP_UP(U8, U8), + PACK_KERNEL_MAP_UP(U16, U16), PACK_KERNEL_MAP_UP(F16, F16), PACK_KERNEL_MAP_UP(BF16, BF16), PACK_KERNEL_MAP_UP_OPT(U8, U8), @@ -223,8 +227,6 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) int32_t half_pixel_centers; uint32_t depth = 0; - int32_t srcFixPointPos = 0; - int32_t dstFixPointPos = 0; float input_scale = 1.0; int32_t inputZP = 0; float output_scale = 1.0; @@ -285,201 +287,16 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) half_pixel_value = 0.0f; } - if (U8 == input_dtype && VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant ) - { - input_scale = input_attr->asymm.scale; - inputZP = input_attr->asymm.zero_point; - } - else if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant) - { - srcFixPointPos = input_attr->dfp.fl; - if (srcFixPointPos >= 0) - { - input_scale = 1.0f / (float) ((int64_t)1 << srcFixPointPos); - } - else if (srcFixPointPos < 0) - { - input_scale = (float)((int64_t)1 << -srcFixPointPos); - } - inputZP = 0; - } - else - { - input_scale = 1.0f; - inputZP = 0; - } - - if (U8 == output_dtype && VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant ) - { - output_scale = output_attr->asymm.scale; - outputZP = output_attr->asymm.zero_point; - } - else if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant) - { - dstFixPointPos = output_attr->dfp.fl; - if 
(dstFixPointPos >= 0) - { - output_scale = (float) ((int64_t)1 << dstFixPointPos); - } - else if (dstFixPointPos < 0) - { - output_scale = 1.0f / (float) ((int64_t)1 << -dstFixPointPos); - } - outputZP = 0; - } - else - { - output_scale = 1.0; - outputZP = 0; - } + input_scale = input_attr->scale; + inputZP = input_attr->zero_point; + output_scale = output_attr->scale; + outputZP = output_attr->zero_point; gpu_param.global_scale[0] = 4; gpu_param.global_scale[1] = 1; gpu_param.global_scale[2] = 1; - if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant) - { - float dfpScale = input_scale * output_scale; - gpu_dp_inst_t uniConvertDFP2FP32_4x4 = {{ - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00010000, 0x00030002, // ABin - 0x02020202, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000300, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000000, 0x00000001, 0x00000000, - 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniExtact8Bit_2x8 = {{ - 0x33333333, // TCfg - 0x11110000, // ASelt - 0x03020100, 0x03020100, // ABin - 0x00000000, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002400, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniRightSubLeft_4x4 = {{ - 0x09090909, // TCfg - 0x00000000, // ASelt - 0x00230001, 0x00670045, // ABin - 0x0a0a0a0a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0x00010001, 0x00000000, 0x00010001, 0x00000000, - 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniDFPtoFp32_left_4x4 = {{ - 0x01010101, // TCfg - 0x00000000, // ASelt - 0x00020000, 0x00060004, // ABin - 0x02020202, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0x00000001, 0x00000000, 0x00000001, 0x00000000, - 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant - }, GPU_DP_TYPE_16}; - - if (I8 == input_dtype && I8 == output_dtype && out_width > in_width) - { - gpu_dp_inst_t uniConvertI32toI16_2x8 = {{ - 0x33333333, // TCfg - 0x11110000, // ASelt - 0x03020100, 0x03020100, // ABin - 0x00000000, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002400, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniGetMaskShift_2x8 = {{ - 0x99999999, // TCfg - 0x00000000, // ASelt - 0x03020100, 0x07060504, // ABin - 0x55555555, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniConvertDFP2FP32_part1_4x4 = {{ - 0x09090909, // TCfg - 0x00000000, // ASelt - 0x00150004, 0x00370026, // ABin - 0x0a0a0a0a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000300, // AccumType, ConstantType, and PostShift - 0x00010001, 0x00000000, 0x00010001, 0x00000000, - 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant - }, GPU_DP_TYPE_16}; - - status = vsi_nn_kernel_gpu_add_param( node, "uniConvertI32toI16_2x8", &uniConvertI32toI16_2x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniGetMaskShift_2x8", &uniGetMaskShift_2x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniDFPtoFp32_left_4x4", &uniConvertDFP2FP32_4x4); - 
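/*
 * The half_pixel_value chosen earlier in this initializer (0.5f when
 * half_pixel_centers is set, 0.0f otherwise) feeds the usual source
 * coordinate mapping src = (dst + h) * scale - h, the same expression the
 * new resize_cubic kernel uses in _create_scale_tensor().  A compact sketch
 * of that mapping and the bilinear tap weights it implies (the helper is
 * illustrative only, not part of ovxlib):
 */
#include <math.h>

static void bilinear_taps_sketch(int dst_x, float scale_factor, float half_pixel_value,
                                 int *left, float *w_left, float *w_right)
{
    float src = ((float)dst_x + half_pixel_value) * scale_factor - half_pixel_value;
    float floor_src = floorf(src);

    *left    = (int)floor_src;       /* left neighbour of the sample point      */
    *w_right = src - floor_src;      /* fractional part weights the right tap   */
    *w_left  = 1.0f - *w_right;      /* remainder weights the left tap          */
}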
status |= vsi_nn_kernel_gpu_add_param( node, "uniRightSubLeft_4x4", - &uniConvertDFP2FP32_part1_4x4); - status |= vsi_nn_kernel_gpu_add_param( node, "depth", &depth); - CHECK_STATUS_FAIL_GOTO(status, final ); - - gpu_param.global_scale[2] = depth; - } - else if (I16 == input_dtype && I16 == output_dtype && out_width > in_width) - { - gpu_dp_inst_t uniConvertI32toI16_2x8 = {{ - 0x33333333, // TCfg - 0x11110000, // ASelt - 0x03020100, 0x03020100, // ABin - 0x00000000, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00002400, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniGetMaskShift_2x8 = {{ - 0x99999999, // TCfg - 0x00000000, // ASelt - 0x03020100, 0x07060504, // ABin - 0x55555555, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000400, // AccumType, ConstantType, and PostShift - 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant - }, GPU_DP_TYPE_16}; - gpu_dp_inst_t uniConvertDFP2FP32_part1_4x4 = {{ - 0x09090909, // TCfg - 0x00000000, // ASelt - 0x00150004, 0x00370026, // ABin - 0x0a0a0a0a, // BSelt - 0x00000000, 0x00000000, // BBin - 0x00000300, // AccumType, ConstantType, and PostShift - 0x00010001, 0x00000000, 0x00010001, 0x00000000, - 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant - }, GPU_DP_TYPE_16}; - - status = vsi_nn_kernel_gpu_add_param( node, "uniConvertI32toI16_2x8", &uniConvertI32toI16_2x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniGetMaskShift_2x8", &uniGetMaskShift_2x8); - status |= vsi_nn_kernel_gpu_add_param( node, "uniDFPtoFp32_left_4x4", &uniConvertDFP2FP32_4x4); - status |= vsi_nn_kernel_gpu_add_param( node, "uniRightSubLeft_4x4", - &uniConvertDFP2FP32_part1_4x4); - status |= vsi_nn_kernel_gpu_add_param( node, "depth", &depth); - CHECK_STATUS_FAIL_GOTO(status, final ); - - gpu_param.global_scale[2] = depth; - } - else - { - status = vsi_nn_kernel_gpu_add_param( node, "uniRightSubLeft_4x4", &uniRightSubLeft_4x4); - status |= vsi_nn_kernel_gpu_add_param( node, "uniDFPtoFp32_left_4x4", &uniDFPtoFp32_left_4x4); - CHECK_STATUS_FAIL_GOTO(status, final ); - } - - status = vsi_nn_kernel_gpu_add_param( node, "uniExtact8Bit_2x8", &uniExtact8Bit_2x8); - status |= vsi_nn_kernel_gpu_add_param( node, "scale_xy", scale_factor); - status |= vsi_nn_kernel_gpu_add_param( node, "dfpScale", &dfpScale); - CHECK_STATUS_FAIL_GOTO(status, final ); - } - else if (U8 == input_dtype && (U8 == output_dtype || F16 == output_dtype)) + if ((U8 == input_dtype || U16 == input_dtype || I8 == input_dtype || I16 == input_dtype)) { float uint8Scale = input_scale / output_scale; float uint8ZP_out = (float)outputZP; @@ -615,7 +432,7 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) } CHECK_STATUS_FAIL_GOTO(status, final ); } - else if (F16 == input_dtype && (U8 == output_dtype || F16 == output_dtype)) + else if (F16 == input_dtype && (U8 == output_dtype || F16 == output_dtype || U16 == output_dtype)) { float uint8Scale = 1.0f / output_scale; float uint8ZP_out = (float)outputZP; diff --git a/src/tim/vx/internal/src/kernel/evis/resize_cubic_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_cubic_evis.c new file mode 100644 index 0000000..618b33f --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/resize_cubic_evis.c @@ -0,0 +1,453 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is 
hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +#define _RESIZE_CUBIC_KERNEL_SOURCE() "resize_cubic" + +#define STR(a) #a +// Add kernel hashtable here +#define RESIZE_CUBIC_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \ + (( IN_DTYPE << 20 ) | ( OUT_DTYPE << 8) ) + +#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \ + { RESIZE_CUBIC_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \ + CVIVANTE_NAMESPACE("evis.resize_cubic_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)), \ + _RESIZE_CUBIC_KERNEL_SOURCE() } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _resize_cubic_kernel_map[] = +{ + PACK_KERNEL_MAP(F16, F16), + PACK_KERNEL_MAP(I16, I16), + PACK_KERNEL_MAP(F16, I16), + PACK_KERNEL_MAP(I16, F16), + PACK_KERNEL_MAP(I8, I8), + PACK_KERNEL_MAP(F16, I8), + PACK_KERNEL_MAP(I8, F16), + PACK_KERNEL_MAP(U8, U8), + PACK_KERNEL_MAP(F16, U8), + PACK_KERNEL_MAP(U8, F16), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _resize_cubic_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, +}; + +#define RESIZE_CUBIC_NUM _cnt_of_array( _resize_cubic_kernel_param_def ) + + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_resize_cubic_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t *input_attr = NULL; + vsi_nn_kernel_tensor_attr_t *output_attr = NULL; + vsi_size_array_t * out_shape = NULL; + + float input_scale = 1.0; + float input_tail = 0; + float output_scale = 1.0; + float output_tail = 0; + + VSI_UNREFERENCED(param_size); + + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0]); + 
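/*
 * The scale tensor consumed by this kernel is built further down in this
 * file by _create_scale_tensor(): for each output position it stores four
 * weights that match the Keys cubic-convolution kernel with
 * cubic_coeff_a = -0.5, evaluated at distances d+1, d, 1-d and 2-d from the
 * source coordinate (d is the fractional offset; the companion index tensor
 * stores floor(src) - 1, the left-most of the four taps).  An equivalent,
 * unfactored form of those weights, handy for checking the factored
 * expressions used there (the helper itself is illustrative only):
 */
static void cubic_weights_sketch(float d, float a, float w[4])
{
    float e = 1.0f - d;

    w[0] = a * d * e * e;                                       /* tap at d + 1 */
    w[1] = (a + 2.0f) * d * d * d - (a + 3.0f) * d * d + 1.0f;  /* tap at d     */
    w[2] = (a + 2.0f) * e * e * e - (a + 3.0f) * e * e + 1.0f;  /* tap at 1 - d */
    w[3] = a * e * d * d;                                       /* tap at 2 - d */
    /* For any d in [0, 1) the four weights sum to 1. */
}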
CHECK_PTR_FAIL_GOTO( input_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1]); + CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); + + out_shape = output_attr->shape; + + if ( input_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + int32_t fl = input_attr->dfp.fl; + if (fl > 0) + { + input_scale = 1.0f / (float) ((int64_t)1 << fl); + } + else + { + input_scale = (float)((int64_t)1 << -fl); + } + } + else if ( input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + { + input_scale = input_attr->asymm.scale; + input_tail = 0 - input_scale * (float)input_attr->asymm.zero_point; + } + + if ( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) + { + int32_t fl = output_attr->dfp.fl; + if (fl > 0) + { + output_scale = (float) ((int64_t)1 << fl); + } + else + { + output_scale = 1.0f / (float)((int64_t)1 << -fl); + } + } + else if ( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) + { + output_scale = 1.0f / output_attr->asymm.scale; + output_tail = (float)output_attr->asymm.zero_point; + } + + gpu_param.global_scale[0] = 4; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + { + gpu_dp_inst_t uniFp16ToFp32_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + gpu_dp_inst_t uniExtract8Bit_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + gpu_dp_inst_t uniExtractHalf8_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16}; + status = vsi_nn_kernel_gpu_add_param( node, "uniFp16ToFp32_4x4", &uniFp16ToFp32_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniExtractHalf8_2x8", &uniExtractHalf8_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniExtract8Bit_2x8", &uniExtract8Bit_2x8); + } + status |= vsi_nn_kernel_gpu_add_param( node, "input_scale", &input_scale); + status |= vsi_nn_kernel_gpu_add_param( node, "input_tail", &input_tail); + status |= vsi_nn_kernel_gpu_add_param( node, "output_scale", &output_scale); + status |= vsi_nn_kernel_gpu_add_param( node, "output_tail", &output_tail); + + + gpu_param.global_size[0] = gpu_align_p2( + (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = out_shape->size > 2 ? 
out_shape->data[2] : 1; + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (input_attr) vsi_nn_kernel_tensor_attr_release( &input_attr ); + if (output_attr) vsi_nn_kernel_tensor_attr_release( &output_attr ); + return status; +} /* _resize_cubic_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _resize_cubic_kernel_map; + size_t kernel_map_size = _cnt_of_array( _resize_cubic_kernel_map ); + vx_param_description_t * param_def = _resize_cubic_kernel_param_def; + size_t param_def_size = RESIZE_CUBIC_NUM; + vx_kernel_initialize_f initializer = _resize_cubic_initializer; + + uint32_t key = 0; + uint32_t i = 0; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = RESIZE_CUBIC_HASH_KEY( in_dtype, out_dtype ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + +static vsi_nn_tensor_t* _create_scale_tensor + ( + vsi_nn_graph_t *graph, + vsi_size_t output_size, + float scale_factor, + float half_pixel_value, + vsi_nn_tensor_t** index + ) +{ + vsi_nn_tensor_attr_t attr; + vsi_nn_tensor_t* scale = NULL; + vsi_size_t i = 0; + float *scale_data_ptr = NULL; + int *index_data_ptr = NULL; + float scale_value = 0; + vsi_ssize_t data = 0; + int idx = 0; + float delta_v = 0; + float cubic_coeff_a = -0.5f; + vsi_size_t item_count = 4 * output_size; + scale_data_ptr = (float *)malloc(item_count * sizeof(float)); + if (scale_data_ptr == NULL) + { + VSILOGE("allocate memory fail at function %s line %d", __FUNCTION__, __LINE__); + goto OnError; + } + + index_data_ptr = (int *)malloc(output_size * sizeof(int)); + if (index_data_ptr == NULL) + { + VSILOGE("allocate memory fail at function %s line %d", __FUNCTION__, __LINE__); + goto OnError; + } + + for (i = 0; i < output_size; i ++) + { + scale_value = ((float)i + half_pixel_value) * scale_factor - half_pixel_value; + data = (vsi_ssize_t)scale_value; + delta_v = scale_value - (float)data; + idx = (int)data - 1; + + index_data_ptr[i] = idx; + scale_data_ptr[i * 4 + 0] = cubic_coeff_a * (((delta_v - 4) * (delta_v + 1) + 8) * (delta_v + 1) - 4); + scale_data_ptr[i * 4 + 1] = ((cubic_coeff_a + 2) * delta_v - (cubic_coeff_a + 3)) * delta_v *delta_v + 1; + scale_data_ptr[i * 4 + 2] = ((cubic_coeff_a + 2) * (1 - delta_v) - (cubic_coeff_a + 3)) + * (1 - delta_v) * (1 - delta_v) + 1; + scale_data_ptr[i * 4 + 3] = cubic_coeff_a * ((( 2 - delta_v - 5) * (2 - delta_v) + 8) * (2 - delta_v) - 4); + } + attr.size[0] = item_count; + attr.dim_num = 1; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; + attr.vtl = 
FALSE; + + scale = vsi_nn_CreateTensorFromData(graph, (uint8_t *)scale_data_ptr, &attr); + if (scale_data_ptr) + { + free (scale_data_ptr); + scale_data_ptr = NULL; + } + + attr.size[0] = output_size; + attr.dim_num = 1; + attr.dtype.vx_type = VSI_NN_TYPE_INT32; + attr.vtl = FALSE; + + *index = vsi_nn_CreateTensorFromData(graph, (uint8_t *)index_data_ptr, &attr); + if (index_data_ptr) + { + free (index_data_ptr); + index_data_ptr = NULL; + } + +OnError: + return scale; +} + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[RESIZE_CUBIC_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" ); + int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" ); + vsi_size_t in_width = inputs[0]->attr.size[0]; + vsi_size_t in_height = inputs[0]->attr.size[1]; + vsi_size_t out_width = outputs[0]->attr.size[0]; + vsi_size_t out_height = outputs[0]->attr.size[1]; + float half_pixel_value = 0.0f; + float width_scale = 0.0f; + float height_scale = 0.0f; + vsi_nn_tensor_t* scale_w = NULL; + vsi_nn_tensor_t* scale_h = NULL; + vsi_nn_tensor_t* index_w = NULL; + vsi_nn_tensor_t* index_h = NULL; + + if (align_corners && out_width > 1) + { + width_scale = ((vx_float32)(in_width - 1) * 1.0f) / (vx_float32)(out_width - 1); + } + else + { + width_scale = ((vx_float32)in_width * 1.0f) / (vx_float32)out_width; + } + + if (align_corners && out_height > 1) + { + height_scale = ((vx_float32)(in_height - 1) * 1.0f) / (vx_float32)(out_height - 1); + } + else + { + height_scale = ((vx_float32)in_height * 1.0f) / (vx_float32)out_height; + } + + if (half_pixel_centers) + { + half_pixel_value = 0.5f; + } + else + { + half_pixel_value = 0.0f; + } + + status = _query_kernel( kernel, inputs, outputs ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + size_t node_params_num = RESIZE_CUBIC_NUM; + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, RESIZE_CUBIC_NUM, + inputs, input_num, outputs, output_num ); + scale_w = _create_scale_tensor(graph, out_width,\ + width_scale, half_pixel_value, &index_w); + CHECK_PTR_FAIL_GOTO( scale_w, "Create buffer fail.", final ); + CHECK_PTR_FAIL_GOTO( index_w, "Create buffer fail.", final ); + scale_h = _create_scale_tensor(graph, out_height,\ + height_scale, half_pixel_value, &index_h); + CHECK_PTR_FAIL_GOTO( scale_h, "Create buffer fail.", final ); + CHECK_PTR_FAIL_GOTO( index_h, "Create buffer fail.", final ); + node_params[2] = (vsi_nn_kernel_node_param_t)(scale_w->t); + node_params[3] = (vsi_nn_kernel_node_param_t)(scale_h->t); + node_params[4] = (vsi_nn_kernel_node_param_t)(index_w->t); + node_params[5] = (vsi_nn_kernel_node_param_t)(index_h->t); + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[2] ); + vsi_nn_kernel_scalar_release( &node_params[3] ); + vsi_nn_kernel_scalar_release( &node_params[4] ); + vsi_nn_kernel_scalar_release( &node_params[5] ); + } + } + +final: + vsi_safe_release_tensor(scale_w); + vsi_safe_release_tensor(scale_h); + vsi_safe_release_tensor(index_w); + vsi_safe_release_tensor(index_h); + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( resize_cubic, _setup ) diff --git a/src/tim/vx/internal/src/kernel/evis/resize_nearest_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_nearest_evis.c index 6bf9ba8..99312c1 100644 --- a/src/tim/vx/internal/src/kernel/evis/resize_nearest_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/resize_nearest_evis.c @@ -208,52 +208,19 @@ DEF_KERNEL_INITIALIZER(_resize_nearest_initializer) half_pixel_value = 0.0f; } - if (U8 == input_dtype && VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant ) - { - input_scale = input_attr->asymm.scale; - inputZP = input_attr->asymm.zero_point; - } - else if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant) + input_scale = input_attr->scale; + inputZP = input_attr->zero_point; + output_scale = 1.0f / output_attr->scale; + outputZP = output_attr->zero_point; + + if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant) { srcFixPointPos = input_attr->dfp.fl; - if (srcFixPointPos >= 0) - { - input_scale = 1.0f / (float) ((int64_t)1 << srcFixPointPos); - } - else if (srcFixPointPos < 0) - { - input_scale = (float)((int64_t)1 << -srcFixPointPos); - } - inputZP = 0; - } - else - { - input_scale = 1.0f; - inputZP = 0; } - if (U8 == output_dtype && VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant ) - { - output_scale = 1.0f / output_attr->asymm.scale; - outputZP = output_attr->asymm.zero_point; - } - else if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant) + if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant) { dstFixPointPos = output_attr->dfp.fl; - if (dstFixPointPos >= 0) - { - output_scale = (float) ((int64_t)1 << dstFixPointPos); - } - else if (dstFixPointPos < 0) - { - output_scale = 1.0f / (float) ((int64_t)1 << -dstFixPointPos); - } - outputZP = 0; - } - else - { - output_scale = 1.0; - outputZP = 0; } if (F16 == input_dtype && F16 == output_dtype) diff --git a/src/tim/vx/internal/src/kernel/evis/scatter_nd_evis.c b/src/tim/vx/internal/src/kernel/evis/scatter_nd_evis.c index bba21ea..e52d396 100644 --- a/src/tim/vx/internal/src/kernel/evis/scatter_nd_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/scatter_nd_evis.c @@ -208,10 +208,7 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_initializer) height = (int32_t)(attr[2]->shape->data[1]); index_num = (int32_t)(attr[0]->shape->data[1]); - if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - output_zp = attr[2]->asymm.zero_point; - } + output_zp = attr[2]->zero_point; if (coord_dim == 3) { @@ -367,10 +364,7 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_big_initializer) height = (int32_t)(attr[2]->shape->data[1]); index_num = (int32_t)(attr[0]->shape->data[1]); - if ( attr[2]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - output_zp = attr[2]->asymm.zero_point; - } + output_zp = attr[2]->zero_point; if (coord_dim == 3) { diff --git a/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c b/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c index b59bccf..4786abb 100644 --- a/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c +++ 
b/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c @@ -382,6 +382,12 @@ static vsi_status check_scatter_nd_update_index_repeat int32_t* mask_buffer = NULL; int32_t mask_len = 0; + if (indices_num == 1) + { + isRepeat[0] = 0; + return VSI_SUCCESS; + } + if (inputs[1]->attr.is_const == FALSE) { isRepeat[0] = 1; @@ -451,7 +457,7 @@ static vsi_status check_scatter_nd_update_index_repeat else if (mask_buffer[mask_idx] > 0) { isRepeat[0] = 1; - status = VSI_FAILURE; + status = VSI_SUCCESS; CHECK_STATUS_FAIL_GOTO( status, final ); } } diff --git a/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_reduction_evis.c b/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_reduction_evis.c new file mode 100644 index 0000000..f2ffdc0 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_reduction_evis.c @@ -0,0 +1,861 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +typedef enum +{ + NONE = 0, + Add, + Mul, + Max, + Min +} vsi_scatter_nd_update_type_e; + +/* + * Define kernel meta. 
+ */ +#define KERNEL_SOURCE_1 "scatter_nd_update_reduction" +#define KERNEL_SOURCE_2 "scatter_nd_update_reduction_conv" + +#define HASH_SCATTER_ND_UPDATE_KEY(_in0_type, _in2_type, _out_type, _stage, _op_type) \ + ((_in0_type << 24) | (_in2_type << 16) | (_out_type << 8) | (_stage << 4) | (_op_type)) + +#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_PREPROCESS_NAME(SRC0_TYPE) \ + CVIVANTE_NAMESPACE("evis.scatter_nd_update_reduction_preprocess_"#SRC0_TYPE) + +#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_PROCESS_NAME(REDUCTION_TYPE, SRC2_TYPE) \ + CVIVANTE_NAMESPACE("evis.scatter_nd_update_reduction_"#REDUCTION_TYPE"_"#SRC2_TYPE) + +#define HASH_SCATTER_ND_UPDATE_SH_KERNEL_CONV_NAME(DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.scatter_nd_update_reduction_conv_"#DST_TYPE) + +#define TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(IN0_TYPE, SOURCE) \ + { HASH_SCATTER_ND_UPDATE_KEY(IN0_TYPE, 0, 0, 0, 0), \ + HASH_SCATTER_ND_UPDATE_SH_KERNEL_PREPROCESS_NAME(IN0_TYPE), \ + SOURCE }, + +#define TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(REDUCTION_TYPE, IN2_TYPE, SOURCE) \ + { HASH_SCATTER_ND_UPDATE_KEY(0, IN2_TYPE, 0, 1, REDUCTION_TYPE), \ + HASH_SCATTER_ND_UPDATE_SH_KERNEL_PROCESS_NAME(REDUCTION_TYPE, IN2_TYPE), \ + SOURCE }, + +#define TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(OUT_TYPE, SOURCE) \ + { HASH_SCATTER_ND_UPDATE_KEY(0, 0, OUT_TYPE, 2, 0), \ + HASH_SCATTER_ND_UPDATE_SH_KERNEL_CONV_NAME(OUT_TYPE), \ + SOURCE }, + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type scatter_nd_update_reduction_preprocess_map[] = +{ + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(U8, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(I8, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(I16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PREPROCESS_KERNELS(F16, KERNEL_SOURCE_1) +}; + +static const _kernel_map_type scatter_nd_update_reduction_process_map[] = +{ + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Add, U8, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Mul, U8, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Max, U8, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Min, U8, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Add, I8, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Mul, I8, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Max, I8, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Min, I8, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Add, I16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Mul, I16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Max, I16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Min, I16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Add, F16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Mul, F16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Max, F16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Min, F16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Add, BF16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Mul, BF16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Max, BF16, KERNEL_SOURCE_1) + 
TENSOR_SCATTER_ND_UPDATE_REDUCTION_PROCESS_KERNELS(Min, BF16, KERNEL_SOURCE_1) +}; + +static const _kernel_map_type scatter_nd_update_reduction_conv_map[] = +{ + TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(U8, KERNEL_SOURCE_2) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(I8, KERNEL_SOURCE_2) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(I16, KERNEL_SOURCE_2) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(F16, KERNEL_SOURCE_2) + TENSOR_SCATTER_ND_UPDATE_REDUCTION_CONV_KERNELS(BF16, KERNEL_SOURCE_2) +}; + +/* + * Kernel params + */ +static vx_param_description_t _scatter_nd_update_preprocess_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; + +static vx_param_description_t _scatter_nd_update_process_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; + +static vx_param_description_t _scatter_nd_update_conv_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; + +#define _SCATTER_ND_UPDATE_PREPROCESS_PARAM_NUM _cnt_of_array(_scatter_nd_update_preprocess_kernel_param_def) +#define _SCATTER_ND_UPDATE_PROCESS_PARAM_NUM _cnt_of_array(_scatter_nd_update_process_kernel_param_def) +#define _SCATTER_ND_UPDATE_CONV_PARAM_NUM _cnt_of_array(_scatter_nd_update_conv_kernel_param_def) + +static vsi_status get_scatter_nd_update_tensor_reshape_size + ( + vsi_nn_tensor_t ** inputs, + vsi_size_t sizes[VSI_NN_MAX_DIM_NUM], + uint32_t block_size, + uint32_t coordDim, + vsi_size_t strides[VSI_NN_MAX_DIM_NUM], + int32_t* newDim + ) +{ + vsi_status status = VSI_SUCCESS; + uint32_t dims_num = inputs[0]->attr.dim_num; + vsi_size_t *input_size = inputs[0]->attr.size; + uint32_t i = 0; + vsi_size_t elementCnt = 1; + +#define VSI_NN_MAX_IMAGE_WIDTH GPU_TENSOR_MAX_WIDTH + + newDim[0] = 0; + for (i = 0; i < dims_num; ++i) + { + elementCnt *= input_size[i]; + } + + for (i = 0; i < VSI_NN_MAX_DIM_NUM; ++i) + { + sizes[i] = 1; + } + + sizes[0] = block_size; + sizes[1] = elementCnt / block_size; + newDim[0] = 2; + + if (coordDim == 1 && strides) // index shape + { + for (i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + strides[i] = 0; + } + } + else if (coordDim >= 2 && coordDim <= VSI_NN_MAX_DIM_NUM && strides) + { + for (i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + strides[i] = 0; + } + + strides[0] = input_size[dims_num - coordDim]; + for (i = 1; i < coordDim - 1; i++) + { + 
strides[i] = strides[i - 1] * input_size[dims_num - coordDim + i]; + } + } + +#undef VSI_NN_MAX_IMAGE_WIDTH + + return status; +} /* get_scatter_nd_update_tensor_reshape_size() */ + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_scatter_nd_update_preprocess_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 1, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; + int32_t width = 0; + int32_t element_size = 1; + int32_t input_zp0 = 0; + float input_scale0 = 1; + int32_t i = 0; + + VSI_UNREFERENCED(param_size); + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + + for (i = 0; i < (int32_t)attr[0]->shape->size; i++) + { + element_size *= (int32_t)attr[0]->shape->data[i]; + } + width = element_size / 8; + + input_zp0 = attr[0]->zero_point; + input_scale0 = attr[0]->scale; + + if (attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE) + { + input_scale0 = 1.0f; + } + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + if (element_size < 8) + { + gpu_param.global_size[0] = element_size; + } + else + { + gpu_param.global_size[0] = width; + } + gpu_param.global_size[1] = 1; + gpu_param.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + { + gpu_dp_inst_t uniConvert1stUint8SubZpToFp32_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, + 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvert2ndU8SubZpToFp32_4x4 = {{ + 0x09090909, // TCfg + 0x04040404, // ASelt + 0x00050004, 0x00070006, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00000000, 0x00010001, 0x00000000, + 0x00010001, 0x00000000, 0x00010001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvert1stUint8SubZpToFp32_4x4", &uniConvert1stUint8SubZpToFp32_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniConvert2ndU8SubZpToFp32_4x4", &uniConvert2ndU8SubZpToFp32_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, "input_scale", &input_scale0 ); + status |= vsi_nn_kernel_gpu_add_param( node, "input_zp", &input_zp0 ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + } + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + return status; +} /* _scatter_nd_update_preprocess_initializer() */ + +DEF_KERNEL_INITIALIZER(_scatter_nd_update_process_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 2, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; + int32_t block_size = 1; + int32_t update_width = 1; + int32_t index_num = 1; + int32_t width = 0; + int32_t coord_dim = 0; + int32_t strides[VSI_NN_MAX_DIM_NUM] = {0}; + int32_t coord_strides[8] = {0}; + int32_t coord_strides1[4] = {0}; + int32_t input_zp2 = 0; + float 
input_scale2 = 1; + int32_t i = 0; + + VSI_UNREFERENCED(param_size); + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); + attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &strides[0]); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &strides[1]); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &strides[2]); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &strides[3]); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &strides[4]); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[9], &strides[5]); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &strides[6]); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[11], &coord_dim); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + block_size = (int32_t)(attr[2]->shape->data[0]); + update_width = (int32_t)(attr[1]->shape->data[0]); + index_num = (int32_t)(attr[0]->shape->data[1]); + width = block_size; + + input_zp2 = attr[1]->zero_point; + input_scale2 = attr[1]->scale; + + coord_strides[coord_dim - 1] = 1; + for (i = 0; i < coord_dim - 1; i++) + { + coord_strides[i] = strides[coord_dim - 2 - i]; + } + memcpy(coord_strides1, coord_strides + 4, 4 * sizeof(int32_t)); + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.global_size[0] = width; + gpu_param.global_size[1] = index_num; + gpu_param.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + { + gpu_dp_inst_t uniConvert1stUint8SubZpToFp32_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00010000, 0x00030002, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0xffff0001, 0x00000000, 0xffff0001, 0x00000000, + 0xffff0001, 0x00000000, 0xffff0001, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniConvBF16toF32_Part0_2x8 = {{ + 0x11111111, // TCfg + 0x01010101, // ASelt + 0x01050004, 0x03070206, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param( node, "update_width", &update_width ); + status |= vsi_nn_kernel_gpu_add_param( node, "output_width", &block_size ); + status |= vsi_nn_kernel_gpu_add_param( node, "coord_stride", &coord_strides ); + status |= vsi_nn_kernel_gpu_add_param( node, "coord_stride1", &coord_strides1 ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + if (attr[1]->dtype == U8 || 
attr[1]->dtype == I8 || attr[1]->dtype == I16) + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvert1stUint8SubZpToFp32_4x4", &uniConvert1stUint8SubZpToFp32_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( node, "update_scale", &input_scale2 ); + status |= vsi_nn_kernel_gpu_add_param( node, "update_zp", &input_zp2 ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + else if (attr[1]->dtype == BF16) + { + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvBF16toF32_Part0_2x8", &uniConvBF16toF32_Part0_2x8 ); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + } + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + if (attr[2]) + { + vsi_nn_kernel_tensor_attr_release( &attr[2] ); + attr[2] = NULL; + } + return status; +} /* _scatter_nd_update_process_initializer() */ + +DEF_KERNEL_INITIALIZER(_scatter_nd_update_conv_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 1, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + + vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; + int32_t width = 0; + int32_t element_size = 1; + int32_t i = 0; + float output_zp = 0; + float output_scale = 1.0f; + + VSI_UNREFERENCED(param_size); + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + + output_zp = (float)attr[0]->zero_point; + output_scale = (float)1.0 / attr[0]->scale; + + for (i = 0; i < (int32_t)attr[0]->shape->size; i++) + { + element_size *= (int32_t)attr[0]->shape->data[i]; + } + width = element_size / 8; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + if (element_size < 8) + { + gpu_param.global_size[0] = element_size; + } + else + { + gpu_param.global_size[0] = width; + } + gpu_param.global_size[1] = 1; + gpu_param.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + { + gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractHalf8_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractOddData_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x07050301, 0x07050301, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param( node, + "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniExtractHalf8_2x8", &uniExtractHalf8_2x8 ); + status |= 
vsi_nn_kernel_gpu_add_param( node, + "uniExtractOddData_2x8", &uniExtractOddData_2x8 ); + status |= vsi_nn_kernel_gpu_add_param( node, "output_zp", &output_zp ); + status |= vsi_nn_kernel_gpu_add_param( node, "output_scale", &output_scale ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + } + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + return status; +} /* _scatter_nd_update_conv_initializer() */ + +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel_preprocess, + vsi_nn_kernel_t* kernel_process, + vsi_nn_kernel_t* kernel_conv, + int32_t reduction_flg + ) +{ + vsi_status status = VSI_SUCCESS; + vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e input2_dtype = F16; + vsi_nn_kernel_dtype_e output_dtype = U8; + uint32_t key = 0; + size_t i = 0; + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + input2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = HASH_SCATTER_ND_UPDATE_KEY(input0_dtype, 0, 0, 0, 0); + + for ( i = 0; i < _cnt_of_array(scatter_nd_update_reduction_preprocess_map); i ++ ) + { + if ( scatter_nd_update_reduction_preprocess_map[i].key == key ) + { + break; + } + } + + if ( i < _cnt_of_array(scatter_nd_update_reduction_preprocess_map) ) + { + snprintf( kernel_preprocess->info.name, VX_MAX_KERNEL_NAME, "%s", + scatter_nd_update_reduction_preprocess_map[i].function_name ); + kernel_preprocess->info.parameters = _scatter_nd_update_preprocess_kernel_param_def; + kernel_preprocess->info.numParams = _SCATTER_ND_UPDATE_PREPROCESS_PARAM_NUM; + kernel_preprocess->info.initialize = _scatter_nd_update_preprocess_initializer; + + vsi_nn_kernel_add_source( kernel_preprocess, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + scatter_nd_update_reduction_preprocess_map[i].source_name ); + vsi_nn_kernel_add_source( kernel_preprocess, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + scatter_nd_update_reduction_preprocess_map[i].source_name ); + } + else + { + status = VSI_FAILURE; + } + + key = HASH_SCATTER_ND_UPDATE_KEY( 0, input2_dtype, 0, 1, reduction_flg); + + for ( i = 0; i < _cnt_of_array(scatter_nd_update_reduction_process_map); i ++ ) + { + if ( scatter_nd_update_reduction_process_map[i].key == key ) + { + break; + } + } + if ( i < _cnt_of_array(scatter_nd_update_reduction_process_map) ) + { + snprintf( kernel_process->info.name, VX_MAX_KERNEL_NAME, "%s", + scatter_nd_update_reduction_process_map[i].function_name ); + kernel_process->info.parameters = _scatter_nd_update_process_kernel_param_def; + kernel_process->info.numParams = _SCATTER_ND_UPDATE_PROCESS_PARAM_NUM; + kernel_process->info.initialize = _scatter_nd_update_process_initializer; + + vsi_nn_kernel_add_source( kernel_process, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + scatter_nd_update_reduction_process_map[i].source_name ); + vsi_nn_kernel_add_source( kernel_process, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + scatter_nd_update_reduction_process_map[i].source_name ); + } + else + { + status |= VSI_FAILURE; + } + + key = HASH_SCATTER_ND_UPDATE_KEY( 0, 0, output_dtype, 2, 0); + + for ( i = 0; i < _cnt_of_array(scatter_nd_update_reduction_conv_map); i ++ ) + { + if ( scatter_nd_update_reduction_conv_map[i].key == key ) + { + break; + } + } + if ( i < _cnt_of_array(scatter_nd_update_reduction_conv_map) ) + { + snprintf( 
kernel_conv->info.name, VX_MAX_KERNEL_NAME, "%s", + scatter_nd_update_reduction_conv_map[i].function_name ); + kernel_conv->info.parameters = _scatter_nd_update_conv_kernel_param_def; + kernel_conv->info.numParams = _SCATTER_ND_UPDATE_CONV_PARAM_NUM; + kernel_conv->info.initialize = _scatter_nd_update_conv_initializer; + + vsi_nn_kernel_add_source( kernel_conv, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + scatter_nd_update_reduction_conv_map[i].source_name ); + vsi_nn_kernel_add_source( kernel_conv, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + scatter_nd_update_reduction_conv_map[i].source_name ); + } + else + { + status |= VSI_FAILURE; + } + + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_t node = NULL; + vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}}; + vsi_size_t strides[VSI_NN_MAX_DIM_NUM] = {0}; + int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); + int32_t coord_dim = vsi_nn_kernel_param_get_int32( params, "coord_dim" ); + int32_t reduction = vsi_nn_kernel_param_get_int32( params, "reduction" ); + int32_t rs_in_dim = 0, rs_idx_dim = 0, rs_out_dim = 0; + int32_t i = 0; + vsi_nn_tensor_t * tensors[2] = { NULL }; + vsi_nn_kernel_t * ikernels[2] = { NULL }; + + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + + status = get_scatter_nd_update_tensor_reshape_size(&inputs[1], shapes[0], coord_dim, 0, + NULL, &rs_idx_dim); + status |= get_scatter_nd_update_tensor_reshape_size(&inputs[2], shapes[1], block_size, 0, + NULL, &rs_in_dim); + status |= get_scatter_nd_update_tensor_reshape_size(&outputs[0], shapes[2], block_size, coord_dim, + strides, &rs_out_dim); + CHECK_STATUS_FAIL_GOTO( status, final ); + + { + vsi_nn_tensor_attr_t attr; + vsi_nn_kernel_node_t preprocess_node = NULL; + vsi_nn_kernel_node_t process_node = NULL; + vsi_nn_kernel_node_param_t preprocess_params[_SCATTER_ND_UPDATE_PREPROCESS_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t process_params[_SCATTER_ND_UPDATE_PROCESS_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t conv_params[_SCATTER_ND_UPDATE_CONV_PARAM_NUM] = { NULL }; + int32_t width = 1; + int32_t res = 0; + + ikernels[0] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS ); + ikernels[0]->unique_id = kernel->unique_id; + ikernels[1] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_EVIS ); + ikernels[1]->unique_id = kernel->unique_id; + + memset( &attr, 0, sizeof(vsi_nn_tensor_attr_t) ); + attr.dtype = outputs[0]->attr.dtype; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + attr.is_const = FALSE; + attr.vtl = TRUE; + + for (i = 0; i < rs_out_dim; i++) + { + attr.size[i] = shapes[2][i]; + width *= (int32_t)shapes[2][i]; + } + attr.dim_num = rs_out_dim; + + res = width % 8; + width = (width >> 3) << 3; + + tensors[0] = vsi_nn_CreateTensor( graph, &attr ); // ref' + attr.size[0] = 1; + attr.size[1] = 1; + attr.dim_num = rs_out_dim; + tensors[1] = vsi_nn_CreateTensor( graph, &attr ); // link_buffer0 + + status = _query_kernel( inputs, outputs, ikernels[0], ikernels[1], kernel, reduction); + if ( VSI_SUCCESS == status) + { + // convert ref to float + preprocess_node = vsi_nn_kernel_create_node( graph, ikernels[0] ); + if (preprocess_node) + { + uint32_t index = 0; + /* Pass parameters to node. 
*/ + preprocess_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[2], rs_out_dim ); + preprocess_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t; + preprocess_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width ); + preprocess_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &res ); + status = vsi_nn_kernel_node_pass_param( preprocess_node, preprocess_params, + _SCATTER_ND_UPDATE_PREPROCESS_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_tensor_release( &preprocess_params[0] ); + vsi_nn_kernel_scalar_release( &preprocess_params[2] ); + vsi_nn_kernel_scalar_release( &preprocess_params[3] ); + } + + // update + process_node = vsi_nn_kernel_create_node( graph, ikernels[1] ); + if (process_node) + { + uint32_t index = 0; + /* Pass parameters to node. */ + process_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shapes[0], rs_idx_dim ); + process_params[index++] = vsi_nn_kernel_tensor_reshape( inputs[2]->t, shapes[1], rs_in_dim ); + process_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t; + process_params[index++] = (vsi_nn_kernel_node_param_t)tensors[1]->t; + process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[0] ); + process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[1] ); + process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[2] ); + process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[3] ); + process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[4] ); + process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[5] ); + process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &strides[6] ); + process_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &coord_dim ); + status = vsi_nn_kernel_node_pass_param( process_node, process_params, + _SCATTER_ND_UPDATE_PROCESS_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_tensor_release( &process_params[0] ); + vsi_nn_kernel_tensor_release( &process_params[1] ); + vsi_nn_kernel_scalar_release( &process_params[4] ); + vsi_nn_kernel_scalar_release( &process_params[5] ); + vsi_nn_kernel_scalar_release( &process_params[6] ); + vsi_nn_kernel_scalar_release( &process_params[7] ); + vsi_nn_kernel_scalar_release( &process_params[8] ); + vsi_nn_kernel_scalar_release( &process_params[9] ); + vsi_nn_kernel_scalar_release( &process_params[10] ); + vsi_nn_kernel_scalar_release( &process_params[11] ); + } + + // convert float to output + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + uint32_t index = 0; + /* Pass parameters to node. 
*/ + conv_params[index++] = (vsi_nn_kernel_node_param_t)tensors[0]->t; + conv_params[index++] = (vsi_nn_kernel_node_param_t)tensors[1]->t; + conv_params[index++] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[2], rs_out_dim ); + conv_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width ); + conv_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &res ); + status = vsi_nn_kernel_node_pass_param( node, conv_params, _SCATTER_ND_UPDATE_CONV_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_tensor_release( &conv_params[2] ); + vsi_nn_kernel_scalar_release( &conv_params[3] ); + vsi_nn_kernel_scalar_release( &conv_params[4] ); + } + } + + if (preprocess_node) {vsi_nn_kernel_node_release( &preprocess_node );} + if (process_node) {vsi_nn_kernel_node_release( &process_node );} + } + +final: + if (ikernels[0]) + { + vsi_nn_kernel_release(&ikernels[0]); + } + if (ikernels[1]) + { + vsi_nn_kernel_release(&ikernels[1]); + } + vsi_safe_release_tensor(tensors[0]); + vsi_safe_release_tensor(tensors[1]); + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( scatter_nd_update_reduction, _setup ) diff --git a/src/tim/vx/internal/src/kernel/evis/select_evis.c b/src/tim/vx/internal/src/kernel/evis/select_evis.c index b918e2c..4d8e90b 100644 --- a/src/tim/vx/internal/src/kernel/evis/select_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/select_evis.c @@ -22,6 +22,7 @@ * *****************************************************************************/ +#if !(VX_TENSOR_SELECT_VX_SUPPORT) #include #include #include @@ -159,7 +160,6 @@ DEF_KERNEL_INITIALIZER(_select_initializer) vsi_nn_kernel_tensor_attr_t *input1_attr = NULL; vsi_nn_kernel_tensor_attr_t *output_attr = NULL; vsi_size_array_t *output_shape = NULL; - int32_t input0_fl = 0, input1_fl = 0, output_fl = 0; float input0Scale = 1.0f; int32_t input0Zp = 0; float input1Scale = 1.0f; @@ -180,59 +180,12 @@ DEF_KERNEL_INITIALIZER(_select_initializer) output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output); CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); - if ( input0_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - input0_fl = input0_attr->dfp.fl; - if (input0_fl > 0) - { - input0Scale = 1.0f / (float) ((int64_t)1 << input0_fl); - } - else - { - input0Scale = (float)((int64_t)1 << -input0_fl); - } - } - else if ( input0_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - input0Scale = input0_attr->asymm.scale; - input0Zp = input0_attr->asymm.zero_point; - } - - if ( input1_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - input1_fl = input1_attr->dfp.fl; - if (input1_fl > 0) - { - input1Scale = 1.0f / (float) ((int64_t)1 << input1_fl); - } - else - { - input1Scale = (float)((int64_t)1 << -input1_fl); - } - } - else if ( input1_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - input1Scale = input1_attr->asymm.scale; - input1Zp = input1_attr->asymm.zero_point; - } - - if ( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - output_fl = output_attr->dfp.fl; - if (output_fl > 0) - { - outputScale = 1.0f / (float) ((int64_t)1 << output_fl); - } - else - { - outputScale = (float)((int64_t)1 << -output_fl); - } - } - else if ( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - outputScale = output_attr->asymm.scale; - outputZP = output_attr->asymm.zero_point; - } + input0Scale = input0_attr->scale; + input0Zp = input0_attr->zero_point; + input1Scale = input1_attr->scale; + input1Zp = input1_attr->zero_point; + outputScale = output_attr->scale; + outputZP = 
output_attr->zero_point; gpu_quantize_multiplier_16bit(input0Scale / outputScale, &in0_M0, &in0_postShift); gpu_quantize_multiplier_16bit(input1Scale / outputScale, &in1_M0, &in1_postShift); @@ -541,3 +494,4 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_EVIS( select, _setup ) +#endif diff --git a/src/tim/vx/internal/src/kernel/evis/sequence_mask_evis.c b/src/tim/vx/internal/src/kernel/evis/sequence_mask_evis.c index b2e22ed..dde408d 100644 --- a/src/tim/vx/internal/src/kernel/evis/sequence_mask_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/sequence_mask_evis.c @@ -131,42 +131,10 @@ DEF_KERNEL_INITIALIZER(_sequence_mask_initializer) CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); out_shape = attr[1]->shape; - - if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - input_zp = attr[0]->asymm.zero_point; - scaleIn = attr[0]->asymm.scale; - } - else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) - { - if (attr[0]->dfp.fl > 0) - { - scaleIn = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); - } - else - { - scaleIn = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); - } - input_zp = 0; - } - - if (attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - output_zp = attr[1]->asymm.zero_point; - scaleOut = 1.0f / attr[1]->asymm.scale; - } - else if (attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP) - { - if (attr[1]->dfp.fl > 0) - { - scaleOut = (float)((int64_t)1 << attr[1]->dfp.fl); - } - else - { - scaleOut = (1.0f / (float)((int64_t)1 << -attr[1]->dfp.fl)); - } - output_zp = 0; - } + input_zp = attr[0]->zero_point; + scaleIn = attr[0]->scale; + output_zp = attr[1]->zero_point; + scaleOut = 1.0f / attr[1]->scale; outputVal1 = scaleOut + (float)output_zp; diff --git a/src/tim/vx/internal/src/kernel/evis/slice_evis.c b/src/tim/vx/internal/src/kernel/evis/slice_evis.c index 773d38b..2362e2c 100644 --- a/src/tim/vx/internal/src/kernel/evis/slice_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/slice_evis.c @@ -157,8 +157,6 @@ DEF_KERNEL_INITIALIZER(_slice_initializer) float scaleOut = 1.0f; int32_t output_ZP = 0; int32_t input_ZP = 0; - int32_t srcFixPointPos = 0; - int32_t dstFixPointPos = 0; int32_t is_samefl = 0; uint32_t pack_key = 0; @@ -178,41 +176,10 @@ DEF_KERNEL_INITIALIZER(_slice_initializer) pack_key = _PACK_SLICE_KEY( input_dtype, output_dtype, is_samefl); - if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant) - { - srcFixPointPos = input_attr->dfp.fl; - if (srcFixPointPos > 0) - { - scaleIn = (1.0f / ((float) ((int64_t)1 << srcFixPointPos))); - } - else - { - scaleIn = ((float) ((int64_t)1 << -srcFixPointPos)); - } - } - else if (VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant) - { - input_ZP = input_attr->asymm.zero_point; - scaleIn = input_attr->asymm.scale; - } - - if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant) - { - dstFixPointPos = output_attr->dfp.fl; - if (dstFixPointPos > 0) - { - scaleOut = (1.0f / ((float) ((int64_t)1 << dstFixPointPos))); - } - else - { - scaleOut = ((float) ((int64_t)1 << -dstFixPointPos)); - } - } - else if (VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant) - { - output_ZP = output_attr->asymm.zero_point; - scaleOut = output_attr->asymm.scale; - } + input_ZP = input_attr->zero_point; + scaleIn = input_attr->scale; + output_ZP = output_attr->zero_point; + scaleOut = output_attr->scale; if ((I8 == input_dtype && input_dtype == output_dtype ) || (U8 == input_dtype && input_dtype == output_dtype ) ) diff --git a/src/tim/vx/internal/src/kernel/evis/spatial_transformer_evis.c 
b/src/tim/vx/internal/src/kernel/evis/spatial_transformer_evis.c index f31de54..eb38746 100644 --- a/src/tim/vx/internal/src/kernel/evis/spatial_transformer_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/spatial_transformer_evis.c @@ -170,23 +170,8 @@ DEF_KERNEL_INITIALIZER(_get_matrix_initializer) attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr, "Create tensor attr buffer fail.", final ); - if ( attr->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - int32_t fl = attr->dfp.fl; - if (fl > 0) - { - input_scale = 1.0f / (float) ((int64_t)1 << fl); - } - else - { - input_scale = (float)((int64_t)1 << -fl); - } - } - else if ( attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - input_scale = attr->asymm.scale; - input_tail = 0 - attr->asymm.zero_point * input_scale; - } + input_scale = attr->scale; + input_tail = 0 - attr->zero_point * input_scale; in_shape = attr->shape; @@ -265,42 +250,10 @@ DEF_KERNEL_INITIALIZER(_warp_affine_initializer) attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); - if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) - { - int32_t fl = attr[0]->dfp.fl; - if (fl > 0) - { - input_scale = 1.0f / (float) ((int64_t)1 << fl); - } - else - { - input_scale = (float)((int64_t)1 << -fl); - } - } - else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - input_scale = attr[0]->asymm.scale; - input_tail = 0 - attr[0]->asymm.zero_point * input_scale; - } - - if (attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP) - { - int32_t fl = attr[1]->dfp.fl; - - if (fl >= 0) - { - output_scale = (vx_float32) ((vx_int64)1 << fl); - } - else if (fl < 0) - { - output_scale = 1.0f / (vx_float32) ((vx_int64)1 << -fl); - } - } - else if (attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - output_scale = 1.0f / attr[1]->asymm.scale; - output_zp = (float)attr[1]->asymm.zero_point; - } + input_scale = attr[0]->scale; + input_tail = 0 - attr[0]->zero_point * input_scale; + output_scale = 1.0f / attr[1]->scale; + output_zp = (float)attr[1]->zero_point; out_shape = attr[1]->shape; diff --git a/src/tim/vx/internal/src/kernel/evis/swish_evis.c b/src/tim/vx/internal/src/kernel/evis/swish_evis.c index befe6ac..f1ad40b 100644 --- a/src/tim/vx/internal/src/kernel/evis/swish_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/swish_evis.c @@ -166,8 +166,6 @@ DEF_KERNEL_INITIALIZER(_swish_initializer) vx_tensor input = (vx_tensor)param[0]; vx_tensor output = (vx_tensor)param[1]; - int8_t srcFixPointPos = 0; - int8_t dstFixPointPos = 0; vx_float32 inputTail = 0; vx_float32 inputScale = 1.0f; vx_float32 outputZP = 0; @@ -186,42 +184,11 @@ DEF_KERNEL_INITIALIZER(_swish_initializer) CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); out_shape = output_attr->shape; + inputScale = input_attr->scale; + inputTail = 0 - (vx_float32)input_attr->zero_point * inputScale; + outputScale = 1.0f / output_attr->scale; + outputZP = (vx_float32)(output_attr->zero_point); - if (input_attr->quant == VSI_NN_KERNEL_QUANT_DFP) - { - srcFixPointPos = (int8_t)input_attr->dfp.fl; - if (srcFixPointPos > 0) - { - inputScale = 1.0f / (vx_float32) ((int64_t)1 << srcFixPointPos); - } - else - { - inputScale = (vx_float32)((int64_t)1 << -srcFixPointPos); - } - } - else if (input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM || input_attr->quant == VSI_NN_KERNEL_QUANT_SYMM) - { - inputScale = input_attr->asymm.scale; - inputTail = 0 - input_attr->asymm.zero_point * inputScale; - 
} - - if (output_attr->quant == VSI_NN_KERNEL_QUANT_DFP) - { - dstFixPointPos = (int8_t)output_attr->dfp.fl; - if (dstFixPointPos > 0) - { - outputScale = (vx_float32) ((int64_t)1 << dstFixPointPos); - } - else - { - outputScale = 1.0f / (vx_float32) ((int64_t)1 << -dstFixPointPos); - } - } - else if (output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM || output_attr->quant == VSI_NN_KERNEL_QUANT_SYMM) - { - outputScale = 1.0f / output_attr->asymm.scale; - outputZP = (vx_float32)(output_attr->asymm.zero_point); - } #define _PACK_SELECT_KEY( IN_TYPE, OUT_TYPE ) \ (IN_TYPE | ( OUT_TYPE << 16)) @@ -379,8 +346,6 @@ DEF_KERNEL_INITIALIZER(_hswish_initializer) vx_tensor input = (vx_tensor)param[0]; vx_tensor output = (vx_tensor)param[1]; - int8_t srcFixPointPos = 0; - int8_t dstFixPointPos = 0; vx_float32 inputTail = 0; vx_float32 inputScale = 1.0f; vx_float32 outputZP = 0; @@ -398,42 +363,11 @@ DEF_KERNEL_INITIALIZER(_hswish_initializer) CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); out_shape = output_attr->shape; + inputScale = input_attr->scale; + inputTail = 0 - (vx_float32)input_attr->zero_point * inputScale; + outputScale = 1.0f / output_attr->scale; + outputZP = (vx_float32)(output_attr->zero_point); - if (input_attr->quant == VSI_NN_KERNEL_QUANT_DFP) - { - srcFixPointPos = (int8_t)input_attr->dfp.fl; - if (srcFixPointPos > 0) - { - inputScale = 1.0f / (vx_float32) ((int64_t)1 << srcFixPointPos); - } - else - { - inputScale = (vx_float32)((int64_t)1 << -srcFixPointPos); - } - } - else if (input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM || input_attr->quant == VSI_NN_KERNEL_QUANT_SYMM) - { - inputScale = input_attr->asymm.scale; - inputTail = 0 - input_attr->asymm.zero_point * inputScale; - } - - if (output_attr->quant == VSI_NN_KERNEL_QUANT_DFP) - { - dstFixPointPos = (int8_t)output_attr->dfp.fl; - if (dstFixPointPos > 0) - { - outputScale = (vx_float32) ((int64_t)1 << dstFixPointPos); - } - else - { - outputScale = 1.0f / (vx_float32) ((int64_t)1 << -dstFixPointPos); - } - } - else if (output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM || output_attr->quant == VSI_NN_KERNEL_QUANT_SYMM) - { - outputScale = 1.0f / output_attr->asymm.scale; - outputZP = (vx_float32)(output_attr->asymm.zero_point); - } #define _PACK_SELECT_KEY( IN_TYPE, OUT_TYPE ) \ (IN_TYPE | ( OUT_TYPE << 16)) diff --git a/src/tim/vx/internal/src/kernel/evis/tile_evis.c b/src/tim/vx/internal/src/kernel/evis/tile_evis.c index f46941a..4fc76f9 100644 --- a/src/tim/vx/internal/src/kernel/evis/tile_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/tile_evis.c @@ -22,6 +22,7 @@ * *****************************************************************************/ +#if !(VX_TENSOR_TILE_API_SUPPORT) #include #include #include @@ -280,42 +281,10 @@ DEF_KERNEL_INITIALIZER(_tile_initializer) CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); in_shape = attr[0]->shape; - - if (VSI_NN_KERNEL_QUANT_DFP == attr[0]->quant) - { - if (attr[0]->dfp.fl > 0) - { - scaleIn = (1.0f / ((float) ((int64_t)1 << attr[0]->dfp.fl))); - } - else - { - scaleIn = ((float) ((int64_t)1 << -attr[0]->dfp.fl)); - } - input_ZP = 0; - } - else if (VSI_NN_KERNEL_QUANT_ASYMM == attr[0]->quant) - { - input_ZP = attr[0]->asymm.zero_point; - scaleIn = attr[0]->asymm.scale; - } - - if (VSI_NN_KERNEL_QUANT_DFP == attr[1]->quant) - { - if (attr[1]->dfp.fl > 0) - { - scaleOut = (float)((int64_t)1 << attr[1]->dfp.fl); - } - else - { - scaleOut = (1.0f / (float)((int64_t)1 << -attr[1]->dfp.fl)); - } - output_ZP = 0; - } - 
else if (VSI_NN_KERNEL_QUANT_ASYMM == attr[1]->quant) - { - output_ZP = attr[1]->asymm.zero_point; - scaleOut = attr[1]->asymm.scale; - } + input_ZP = attr[0]->zero_point; + scaleIn = attr[0]->scale; + output_ZP = attr[1]->zero_point; + scaleOut = attr[1]->scale; #define _PACK_SELECT_KEY( IN_TYPE, OUT_TYPE ) \ (( IN_TYPE << 16) | ( OUT_TYPE)) @@ -626,3 +595,4 @@ final: REGISTER_BACKEND_EVIS( tile, _setup ) __END_DECLS +#endif diff --git a/src/tim/vx/internal/src/kernel/evis/upsample_evis.c b/src/tim/vx/internal/src/kernel/evis/upsample_evis.c index fb78c49..e2327e3 100644 --- a/src/tim/vx/internal/src/kernel/evis/upsample_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/upsample_evis.c @@ -174,41 +174,19 @@ DEF_KERNEL_INITIALIZER(_upsample_initializer) src_dtype = input_attr->dtype; dst_dtype = output_attr->dtype; axis_dtype = axis_attr->dtype; + inputScale = input_attr->scale; + input_ZP = input_attr->zero_point; + outputScale = output_attr->scale; + output_ZP = output_attr->zero_point; if( input_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) { input_fl = input_attr->dfp.fl; - if (input_fl > 0) - { - inputScale = 1.0f / (float) ((int64_t)1 << input_fl); - } - else - { - inputScale = (float)((int64_t)1 << -input_fl); - } - } - else if( input_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - inputScale = input_attr->asymm.scale; - input_ZP = input_attr->asymm.zero_point; } if( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) { output_fl = output_attr->dfp.fl; - if (output_fl > 0) - { - outputScale = 1.0f / (float) ((int64_t)1 << output_fl); - } - else - { - outputScale = (float)((int64_t)1 << -output_fl); - } - } - else if( output_attr->quant == VSI_NN_KERNEL_QUANT_ASYMM ) - { - outputScale = output_attr->asymm.scale; - output_ZP = output_attr->asymm.zero_point; } factorOut = 1.0f / outputScale; diff --git a/src/tim/vx/internal/src/kernel/evis/upsamplescale_evis.c b/src/tim/vx/internal/src/kernel/evis/upsamplescale_evis.c index 6bc113f..936590c 100644 --- a/src/tim/vx/internal/src/kernel/evis/upsamplescale_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/upsamplescale_evis.c @@ -147,8 +147,6 @@ DEF_KERNEL_INITIALIZER(_upsamplescale_initializer) float scaleOut = 1.0f; int32_t output_ZP = 0; int32_t input_ZP = 0; - int32_t srcFixPointPos = 0; - int32_t dstFixPointPos = 0; uint32_t pack_key = 0; _internal_upscale_e flag = UP_ORG; @@ -164,34 +162,10 @@ DEF_KERNEL_INITIALIZER(_upsamplescale_initializer) vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[SCALAR_SCALE_VALUE], &(scale)); input_dtype = input_attr->dtype; output_dtype = output_attr->dtype; - - if (VSI_NN_KERNEL_QUANT_DFP == input_attr->quant) - { - srcFixPointPos = input_attr->dfp.fl; - if (srcFixPointPos >=0 ) - scaleIn = 1.0f / (float) ((int64_t)1 << srcFixPointPos); - else - scaleIn = (float) ((int64_t)1 << -srcFixPointPos); - } - else if (VSI_NN_KERNEL_QUANT_ASYMM == input_attr->quant) - { - input_ZP = input_attr->asymm.zero_point; - scaleIn = input_attr->asymm.scale; - } - - if (VSI_NN_KERNEL_QUANT_DFP == output_attr->quant) - { - dstFixPointPos = output_attr->dfp.fl; - if (dstFixPointPos >=0 ) - scaleOut = 1.0f / (float) ((int64_t)1 << dstFixPointPos); - else - scaleOut = (float) ((int64_t)1 << -dstFixPointPos); - } - else if (VSI_NN_KERNEL_QUANT_ASYMM == output_attr->quant) - { - output_ZP = output_attr->asymm.zero_point; - scaleOut = output_attr->asymm.scale; - } + input_ZP = input_attr->zero_point; + scaleIn = input_attr->scale; + output_ZP = output_attr->zero_point; + scaleOut = output_attr->scale; if (stride == 2 && 
scale >= 0) { diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c index 547254f..8fefcb9 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c @@ -564,6 +564,11 @@ vsi_bool vsi_nn_kernel_optimize_tile_shape temp_shape_y[temp_rank] = temp_shape_y[i]; temp_shape_output[temp_rank++] = temp_shape_output[i]; } + //Delete 1to1 dim + if (temp_rank != 1 && temp_shape_output[temp_rank - 1] == 1) + { + temp_rank --; + } } else if (temp_shape_x[i] != 1) { @@ -578,8 +583,12 @@ vsi_bool vsi_nn_kernel_optimize_tile_shape sy *= temp_shape_y[j]; sz *= temp_shape_output[j]; } - temp_rank += tile_fill_dim( temp_shape_x, temp_shape_y, temp_shape_output, - temp_rank, VSI_NN_MAX_DIM_NUM, sx, sy, sz ); + //Delete 1to1 dim + if (sz != 1) + { + temp_rank += tile_fill_dim( temp_shape_x, temp_shape_y, temp_shape_output, + temp_rank, VSI_NN_MAX_DIM_NUM, sx, sy, sz ); + } idx_start = -1; } temp_shape_x[temp_rank] = temp_shape_x[i]; @@ -601,10 +610,6 @@ vsi_bool vsi_nn_kernel_optimize_tile_shape * Skip dim if the size is equal to 1 * Also skip if ( sx == 1 && sy == 1 ) */ - if ( temp_shape_output[i] == 1 ) - { - continue; - } // Update state state = TILE_STATE_EMPTY; diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_lut.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_lut.c index b9f3ff2..8dcae3c 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_lut.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_lut.c @@ -250,6 +250,11 @@ static float inverse_sigmoid_eval(float x, vsi_nn_kernel_lut_params *lut_param) return log_eval(x1 / x2); } +static float tan_eval(float x) +{ + return tanf(x); +} + static float vsi_nn_kernel_lut_activation(float data, vsi_nn_kernel_lut_params *lut_param) { float result = 0; @@ -325,6 +330,9 @@ static float vsi_nn_kernel_lut_activation(float data, vsi_nn_kernel_lut_params * case VSI_NN_KERNEL_LUT_INVERSE_SIGMOID: result = inverse_sigmoid_eval(data, lut_param); break; + case VSI_NN_KERNEL_LUT_TAN: + result = tan_eval(data); + break; default: VSILOGE( "unsupported activation function:%d", lut_param->act_type ); break; diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c index 974ad58..b837e66 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c @@ -132,7 +132,6 @@ static vsi_status _select } \ REGISTER_KERNEL_SELECTOR( kernel_name, _##kernel_name##_kernel_selector ) -REGISTER_VX_FIRST_KERNEL_SELECTOR(exp) REGISTER_VX_FIRST_KERNEL_SELECTOR(log) REGISTER_VX_FIRST_KERNEL_SELECTOR(selu) REGISTER_VX_FIRST_KERNEL_SELECTOR(neg) @@ -153,6 +152,7 @@ REGISTER_VX_FIRST_KERNEL_SELECTOR(atan) REGISTER_VX_FIRST_KERNEL_SELECTOR(atanh) REGISTER_VX_FIRST_KERNEL_SELECTOR(acosh) REGISTER_VX_FIRST_KERNEL_SELECTOR(inverse_sigmoid) +REGISTER_VX_FIRST_KERNEL_SELECTOR(tan) #if (VX_TENSOR_SELECT_VX_SUPPORT) REGISTER_VX_FIRST_KERNEL_SELECTOR(select) #endif @@ -168,5 +168,19 @@ REGISTER_VX_FIRST_KERNEL_SELECTOR(relational_ops) #if (VX_TENSOR_TILE_API_SUPPORT) REGISTER_VX_FIRST_KERNEL_SELECTOR(tile) #endif +#if (VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT) +REGISTER_VX_FIRST_KERNEL_SELECTOR(layer_norm) +#endif +#if (VX_ACTIVATION_EXP_VX_SUPPORT_EXT) +REGISTER_VX_FIRST_KERNEL_SELECTOR(exp) +#endif +#if (VX_ACTIVATION_SIN_COS_VX_SUPPORT_EXT) 
+REGISTER_VX_FIRST_KERNEL_SELECTOR(sin) +REGISTER_VX_FIRST_KERNEL_SELECTOR(cos) +#endif +#if (VX_LOGSOFTMAX_VX_SUPPORT) +REGISTER_VX_FIRST_KERNEL_SELECTOR(log_softmax) +#endif + __END_DECLS diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c index 55a6100..d74d6a1 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c @@ -811,7 +811,7 @@ vsi_nn_tensor_t* vsi_nn_merge_input_zeropoint_to_bias attr.dim_num = 2; } bias_data = (int32_t *)vsi_nn_ConvertTensorToData(graph, bias); - CHECK_PTR_FAIL_GOTO( new_bias_data_ptr, "Create buffer fail.", final ); + CHECK_PTR_FAIL_GOTO( bias_data, "ConvertTensorToData fail.", final ); } new_bias_data_ptr = (int32_t *)malloc(attr.size[0] * sizeof(int32_t)); @@ -869,3 +869,66 @@ vsi_status vsi_nn_set_sp_kernel_name return status; } +vsi_nn_tensor_t * vsi_nn_kernel_insert_reshape_node + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * in_tensor, + vsi_size_t * shape, + uint32_t dim_num, + vsi_nn_opt_direction_e direction + ) +{ + vsi_nn_tensor_t * tensor = NULL; +#if VX_REMOVE_RESHAPE_SUPPORT + vsi_nn_tensor_attr_t attr; + vsi_nn_tensor_t *dims_tensor = NULL; + vx_nn_reshape_params_t reshape_param; + int32_t dims_data[VSI_NN_MAX_DIM_NUM] = {1}; + uint32_t i = 0; + vsi_nn_tensor_t * input = NULL; + vsi_nn_tensor_t * output = NULL; + + memcpy( &attr, &(in_tensor->attr), sizeof(vsi_nn_tensor_attr_t) ); + memcpy( attr.size, shape, sizeof(vsi_size_t) * dim_num); + attr.dim_num = dim_num; + tensor = vsi_nn_CreateTensor( graph, &attr ); + CHECK_PTR_FAIL_GOTO(tensor, "Create tensor failed", final); + + for (i = 0; i < dim_num; i++) + { + dims_data[i] = (int32_t)shape[i]; + } + + memset(&attr, 0, sizeof(attr)); + attr.size[0] = dim_num; + attr.dim_num = 1; + attr.is_const = TRUE; + attr.dtype.vx_type = VSI_NN_TYPE_INT32; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + dims_tensor = vsi_nn_CreateTensorFromData( + graph, (uint8_t *)dims_data, &attr); + CHECK_PTR_FAIL_GOTO(dims_tensor, "Create tensor failed", final); + reshape_param.dims = REQUIRED_IO(dims_tensor); + + if (direction == VSI_NN_OPTIMIZE_BACKWARD) + { + input = in_tensor; + output = tensor; + } + else + { + input = tensor; + output = in_tensor; + } + + vxTensorReshapeNode(graph->g, input->t, &reshape_param, sizeof(reshape_param), output->t); + vsi_safe_release_tensor(dims_tensor); +#else + VSI_UNREFERENCED(direction); + tensor = vsi_nn_reshape_tensor( graph, in_tensor, shape, dim_num ); + CHECK_PTR_FAIL_GOTO(tensor, "Reshape tensor failed", final); +#endif + +final: + return tensor; +} diff --git a/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c b/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c index 09514d3..1b53660 100644 --- a/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c @@ -154,13 +154,14 @@ final: REGISTER_BACKEND_OPENVX( KERNEL_NAME, _##KERNEL_NAME##_setup ) REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( mish, VSI_NN_KERNEL_LUT_MISH ) -//REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( exp, VSI_NN_KERNEL_LUT_EXP ) REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( log, VSI_NN_KERNEL_LUT_LOG ) REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( selu, VSI_NN_KERNEL_LUT_SELU ) REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( neg, VSI_NN_KERNEL_LUT_NEG ) REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( hard_sigmoid, VSI_NN_KERNEL_LUT_HSIGMOID ) +#if !(VX_ACTIVATION_GELU_VX_SUPPORT_EXT) REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( gelu, 
VSI_NN_KERNEL_LUT_GELU ) REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( hard_gelu, VSI_NN_KERNEL_LUT_HGELU ) +#endif REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( erf, VSI_NN_KERNEL_LUT_ERF ) REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( relu_keras, VSI_NN_KERNEL_LUT_RELU_KERAS ) REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( clip, VSI_NN_KERNEL_LUT_CLIP ) @@ -168,6 +169,7 @@ REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( celu, VSI_NN_KERNEL_LUT_CELU ) REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( rcp, VSI_NN_KERNEL_LUT_RCP ) REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( softsign, VSI_NN_KERNEL_LUT_SOFTSIGN ) REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( atan, VSI_NN_KERNEL_LUT_ATAN ) +REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( tan, VSI_NN_KERNEL_LUT_TAN ) #undef REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL @@ -412,4 +414,115 @@ REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( softrelu ) return (vsi_nn_kernel_node_t)node; } /* softrelu() */ +#if (VX_ACTIVATION_EXP_VX_SUPPORT_EXT) +REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( exp ) +{ + vx_node node = NULL; + + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + + node = vxActivationLayer( + graph->g, + inputs[0]->t, + VX_CONVOLUTIONAL_NETWORK_ACTIVATION_EXP, + 0, + 0, + outputs[0]->t + ); + + return (vsi_nn_kernel_node_t)node; +} /* exp() */ +#endif + +#if (VX_ACTIVATION_SIN_COS_VX_SUPPORT_EXT) +REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( sin ) +{ + vx_node node = NULL; + + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + + node = vxActivationLayer( + graph->g, + inputs[0]->t, + VX_CONVOLUTIONAL_NETWORK_ACTIVATION_SIN, + 0, + 0, + outputs[0]->t + ); + + return (vsi_nn_kernel_node_t)node; +} /* sin() */ + +REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( cos ) +{ + vx_node node = NULL; + + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + + node = vxActivationLayer( + graph->g, + inputs[0]->t, + VX_CONVOLUTIONAL_NETWORK_ACTIVATION_COS, + 0, + 0, + outputs[0]->t + ); + + return (vsi_nn_kernel_node_t)node; +} /* cos() */ +#endif + +#if (VX_ACTIVATION_GELU_VX_SUPPORT_EXT) +REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( gelu ) +{ + vx_node node = NULL; + + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + + node = vxActivationLayer( + graph->g, + inputs[0]->t, + VX_CONVOLUTIONAL_NETWORK_ACTIVATION_GELU, + 1, + 0, + outputs[0]->t + ); + + return (vsi_nn_kernel_node_t)node; +} /* gelu() */ + +REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( hard_gelu ) +{ + vx_node node = NULL; + + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + + node = vxActivationLayer( + graph->g, + inputs[0]->t, + VX_CONVOLUTIONAL_NETWORK_ACTIVATION_HGELU, + 1, + 0, + outputs[0]->t + ); + + return (vsi_nn_kernel_node_t)node; +} /* hard_gelu() */ +#endif + #undef REGISTER_ELTWISE_UNARY_OPENVX_KERNEL diff --git a/src/tim/vx/internal/src/kernel/vx/layer_norm_vx.c b/src/tim/vx/internal/src/kernel/vx/layer_norm_vx.c new file mode 100644 index 0000000..00a2def --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vx/layer_norm_vx.c @@ -0,0 +1,87 @@ +/**************************************************************************** +* +* Copyright (c) 2021 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in 
the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "kernel/vsi_nn_kernel.h" + +#if (VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT) +#define REGISTER_LAYER_NORM_OPENVX_KERNEL( kernel_name ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ); \ + REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) + +REGISTER_LAYER_NORM_OPENVX_KERNEL( layer_norm ) +{ + vx_node node = NULL; + float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); + int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); + + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(output_num); + + vx_tensor inputs_tensor[3] = {NULL}; + vx_tensor output_tensor = NULL; + + inputs_tensor[0] = inputs[0]->t; + inputs_tensor[1] = inputs[1]->t; + inputs_tensor[2] = inputs[2]->t; + output_tensor = outputs[0]->t; + + node = vxLayerNormalizationLayer( + graph->g, + eps, + axis, + inputs_tensor, + (uint32_t)input_num, + output_tensor + ); + + return (vsi_nn_kernel_node_t)node; +} /* layer_norm() */ + +#undef REGISTER_LAYER_NORM_OPENVX_KERNEL +#endif \ No newline at end of file diff --git a/src/tim/vx/internal/src/kernel/vx/log_softmax_vx.c b/src/tim/vx/internal/src/kernel/vx/log_softmax_vx.c new file mode 100644 index 0000000..5b5f447 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vx/log_softmax_vx.c @@ -0,0 +1,85 @@ +/**************************************************************************** +* +* Copyright (c) 2021 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be 
included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "kernel/vsi_nn_kernel.h" + +#if (VX_LOGSOFTMAX_VX_SUPPORT) +#define REGISTER_LOGSOFTMAX_OPENVX_KERNEL( kernel_name ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ); \ + REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) + +REGISTER_LOGSOFTMAX_OPENVX_KERNEL( log_softmax ) +{ + vx_node node = NULL; + float beta = vsi_nn_kernel_param_get_float32( params, "beta" ); + int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); + + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + + vx_tensor input_tensor = NULL; + vx_tensor output_tensor = NULL; + + input_tensor = inputs[0]->t; + output_tensor = outputs[0]->t; + + node = vxLogSoftMaxLayer( + graph->g, + input_tensor, + beta, + axis, + output_tensor + ); + + return (vsi_nn_kernel_node_t)node; +} /* logsoftmax() */ + +#undef REGISTER_LOGSOFTMAX_OPENVX_KERNEL +#endif \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/crop_and_resize_bilinear.cl b/src/tim/vx/internal/src/libnnext/ops/cl/crop_and_resize_bilinear.cl new file mode 100644 index 0000000..7f8b1bf --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/crop_and_resize_bilinear.cl @@ -0,0 +1,107 @@ +#pragma OPENCL EXTENSION cl_viv_vx_extension : enable +#include "cl_viv_vx_ext.h" + + +_viv_uniform float width_scale; +_viv_uniform float height_scale; +_viv_uniform int image_width; +_viv_uniform int image_height; + +#define CROP_AND_RESIZE_BILINEAR(name, read_type, dst_type, conv_type, write_type) \ +__kernel void crop_and_resize_bilinear_##name \ +( \ + __read_only image2d_array_t input, \ + __read_only image2d_t boxes, \ + __read_only image2d_t box_ind, \ + __write_only image2d_array_t output, \ + uint ori_depth, \ + uint ori_batchout, \ + float inOutScale, \ + float inOutTile, \ + float extrapolation_value \ +) \ +{ \ + int bb = get_global_id(2); \ + int y = get_global_id(1); \ + int x = get_global_id(0); \ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + int2 coord_box_ind = (int2)(bb, 0); \ + int b = read_imagei(box_ind, coord_box_ind).x; \ + float4 xy; \ + float in_x, in_y; \ + int d = 0; \ + \ + Image img_boxes = create_image_from_image2d(boxes, 2); \ + __global half* 
boxes_ptr = (__global half*)img_boxes.ptr; \ + xy = vload_half4(bb, boxes_ptr); \ + float _width_scale = convert_float(xy.w - xy.y) * width_scale; \ + float _height_scale = convert_float(xy.z - xy.x) * height_scale; \ + if (_width_scale == 0) xy.y = 0.5 * (xy.y + xy.w); \ + if (_height_scale == 0) xy.x = 0.5 * (xy.x + xy.z); \ + in_y = xy.x * convert_float(image_height - 1) + convert_float(y) * _height_scale; \ + in_x = xy.y * convert_float(image_width - 1) + convert_float(x) * _width_scale; \ + float y_lerp = in_y - floor(in_y); \ + float x_lerp = in_x - floor(in_x); \ + float4 src0, src1, src2, src3; \ + for (d = 0; d < ori_depth; d++) \ + { \ + int4 coord = (int4)(floor(in_x), floor(in_y), d + b * ori_depth, 0); \ + if (coord.x < 0 || coord.x > image_width - 1 || coord.y < 0 || coord.y > image_height - 1) \ + { \ + src0 = (float4)(extrapolation_value,0,0,0); \ + } \ + else \ + { \ + src0 = convert_float4(read_type(input, coord)); \ + } \ + coord.x = coord.x + 1; \ + if (coord.x < 0 || coord.x > image_width - 1 || coord.y < 0 || coord.y > image_height - 1) \ + { \ + src1 = (float4)(extrapolation_value,0,0,0); \ + } \ + else \ + { \ + src1 = convert_float4(read_type(input, coord)); \ + } \ + coord.y = coord.y + 1; \ + if (coord.x < 0 || coord.x > image_width - 1 || coord.y < 0 || coord.y > image_height - 1) \ + { \ + src3 = (float4)(extrapolation_value,0,0,0); \ + } \ + else \ + { \ + src3 = convert_float4(read_type(input, coord)); \ + } \ + coord.x = coord.x - 1; \ + if (coord.x < 0 || coord.x > image_width - 1 || coord.y < 0 || coord.y > image_height - 1) \ + { \ + src2 = (float4)(extrapolation_value,0,0,0); \ + } \ + else \ + { \ + src2 = convert_float4(read_type(input, coord)); \ + } \ + float4 top = src0 + (src1 - src0) * x_lerp; \ + float4 bottom = src2 + (src3 - src2) * x_lerp; \ + float4 value = top + (bottom - top) * y_lerp; \ + value = value * inOutScale + inOutTile; \ + dst_type dst = conv_type(value); \ + coord_out.z = d + coord_out.z * ori_depth; \ + write_type(output, coord_out, dst); \ + } \ +} + +CROP_AND_RESIZE_BILINEAR(U32toU32,read_imageui, \ +uint4, convert_uint4, write_imageui) +CROP_AND_RESIZE_BILINEAR(U32toF32,read_imageui, \ +float4,convert_float4,write_imagef) +CROP_AND_RESIZE_BILINEAR(F32toF32,read_imagef, \ +float4, convert_float4,write_imagef) +CROP_AND_RESIZE_BILINEAR(F32toU32,read_imagef, \ +uint4, convert_uint4, write_imageui) +CROP_AND_RESIZE_BILINEAR(F32toI32,read_imagef, \ +int4, convert_int4, write_imagei) +CROP_AND_RESIZE_BILINEAR(I32toI32,read_imagei, \ +int4, convert_int4, write_imagei) +CROP_AND_RESIZE_BILINEAR(I32toF32,read_imagei, \ +float4,convert_float4,write_imagef) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/crop_and_resize_nearest_neighbor.cl b/src/tim/vx/internal/src/libnnext/ops/cl/crop_and_resize_nearest_neighbor.cl new file mode 100644 index 0000000..e1f93c5 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/crop_and_resize_nearest_neighbor.cl @@ -0,0 +1,77 @@ +#pragma OPENCL EXTENSION cl_viv_vx_extension : enable +#include "cl_viv_vx_ext.h" + + +_viv_uniform float width_scale; +_viv_uniform float height_scale; +_viv_uniform int image_width; +_viv_uniform int image_height; + +#define CROP_AND_RESIZE_NEAREST_NEIGHTBOR(name,src_type, read_type, dst_type, conv_type, write_type) \ +__kernel void crop_and_resize_nearest_neighbor_##name \ +( \ + __read_only image2d_array_t input, \ + __read_only image2d_t boxes, \ + __read_only image2d_t box_ind, \ + __write_only image2d_array_t output, \ + uint 
ori_depth, \ + uint ori_batchout, \ + float inOutScale, \ + float inOutTile, \ + float extrapolation_value \ +) \ +{ \ + int bb = get_global_id(2); \ + int y = get_global_id(1); \ + int x = get_global_id(0); \ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + int2 coord_box_ind = (int2)(bb, 0); \ + int b = read_imagei(box_ind, coord_box_ind).x; \ + float4 xy; \ + int in_x, in_y, d = 0; \ + \ + Image img_boxes = create_image_from_image2d(boxes, 2); \ + __global half* boxes_ptr = (__global half*)img_boxes.ptr; \ + xy = vload_half4(bb, boxes_ptr); \ + float _width_scale = convert_float(xy.w - xy.y) * width_scale; \ + float _height_scale = convert_float(xy.z - xy.x) * height_scale; \ + if (_width_scale == 0) xy.y = 0.5 * (xy.y + xy.w); \ + if (_height_scale == 0) xy.x = 0.5 * (xy.x + xy.z); \ + in_y = convert_int(round(xy.x * convert_float(image_height - 1) \ + + convert_float(y) * _height_scale)); \ + in_x = convert_int(round(xy.y * convert_float(image_width - 1) \ + + convert_float(x) * _width_scale)); \ + for (d = 0; d < ori_depth; d++) \ + { \ + int4 coord = (int4)(in_x, in_y, d + b * ori_depth, 0); \ + float4 src_f; \ + if (coord.x < 0 || coord.x > image_width - 1 || coord.y < 0 || coord.y > image_height - 1) \ + { \ + src_f = (float4)(extrapolation_value, 0, 0, 0); \ + } \ + else \ + { \ + src_type src = read_type(input, coord); \ + src_f = convert_float4(src); \ + } \ + src_f = src_f * inOutScale + inOutTile; \ + dst_type dst = conv_type(src_f); \ + coord_out.z = d + coord_out.z * ori_depth; \ + write_type(output, coord_out, dst); \ + } \ +} + +CROP_AND_RESIZE_NEAREST_NEIGHTBOR(U32toU32,uint4, \ +read_imageui, uint4, convert_uint4, write_imageui) +CROP_AND_RESIZE_NEAREST_NEIGHTBOR(U32toF32,uint4, \ +read_imageui, float4,convert_float4,write_imagef) +CROP_AND_RESIZE_NEAREST_NEIGHTBOR(F32toF32,float4, \ +read_imagef, float4,convert_float4,write_imagef) +CROP_AND_RESIZE_NEAREST_NEIGHTBOR(F32toU32,float4, \ +read_imagef, uint4, convert_uint4, write_imageui) +CROP_AND_RESIZE_NEAREST_NEIGHTBOR(F32toI32,float4, \ +read_imagef, int4, convert_int4, write_imagei) +CROP_AND_RESIZE_NEAREST_NEIGHTBOR(I32toI32,int4, \ +read_imagei, int4, convert_int4, write_imagei) +CROP_AND_RESIZE_NEAREST_NEIGHTBOR(I32toF32,int4, \ +read_imagei, float4,convert_float4,write_imagef) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/detect_post_box.cl b/src/tim/vx/internal/src/libnnext/ops/cl/detect_post_box.cl deleted file mode 100644 index 66a7fcb..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/cl/detect_post_box.cl +++ /dev/null @@ -1,101 +0,0 @@ -float exp_(float x, float logE) -{ - x *= logE; - x = exp2(x); - return x; -} - -__kernel void detect_post_box_F32_F32toF32( - __read_only image2d_array_t input0, - __read_only image2d_t input1, - __write_only image2d_array_t output, - float inv_scale_y, - float inv_scale_x, - float inv_scale_h, - float inv_scale_w, - float logE) -{ - int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0); - float4 src0; - float4 src1; - float4 dst; - float4 tmp0, tmp1; - src0.x = read_imagef(input0, coord).x; - src1.x = read_imagef(input1, coord.xy).x; - coord.x++; - src0.y = read_imagef(input0, coord).x; - src1.y = read_imagef(input1, coord.xy).x; - coord.x++; - src0.z = read_imagef(input0, coord).x; - src1.z = read_imagef(input1, coord.xy).x; - coord.x++; - src0.w = read_imagef(input0, coord).x; - src1.w = read_imagef(input1, coord.xy).x; - - tmp0.x = src1.x + src1.z * src0.x * inv_scale_y; - tmp0.y = src1.y 
+ src1.w * src0.y * inv_scale_x; - tmp1.x = src1.z * exp_(src0.z * inv_scale_h, logE) * 0.5f; - tmp1.y = src1.w * exp_(src0.w * inv_scale_w, logE) * 0.5f; - dst.xy = tmp0.xy - tmp1.xy; - dst.zw = tmp0.xy + tmp1.xy; - coord.x = 0; - write_imagef(output, coord, dst.xxxx); - coord.x++; - write_imagef(output, coord, dst.yyyy); - coord.x++; - write_imagef(output, coord, dst.zzzz); - coord.x++; - write_imagef(output, coord, dst.wwww); -} - - -__kernel void detect_post_box_U8_U8toF32( - __read_only image2d_array_t input0, - __read_only image2d_t input1, - __write_only image2d_array_t output, - float inv_scale_y, - float inv_scale_x, - float inv_scale_h, - float inv_scale_w, - float logE, - float input0Tail, - float input1Tail, - float input0Scale, - float input1Scale) -{ - int4 coord = (int4)(0, get_global_id(0), get_global_id(1), 0); - uint4 in0, in1; - float4 src0; - float4 src1; - float4 dst; - float4 tmp0, tmp1; - in0.x = read_imageui(input0, coord).x; - in1.x = read_imageui(input1, coord.xy).x; - coord.x++; - in0.y = read_imageui(input0, coord).x; - in1.y = read_imageui(input1, coord.xy).x; - coord.x++; - in0.z = read_imageui(input0, coord).x; - in1.z = read_imageui(input1, coord.xy).x; - coord.x++; - in0.w = read_imageui(input0, coord).x; - in1.w = read_imageui(input1, coord.xy).x; - - src0 = convert_float4(in0) * input0Scale + input0Tail; - src1 = convert_float4(in1) * input1Scale + input1Tail; - - tmp0.x = src1.x + src1.z * src0.x * inv_scale_y; - tmp0.y = src1.y + src1.w * src0.y * inv_scale_x; - tmp1.x = src1.z * exp_(src0.z * inv_scale_h, logE) * 0.5f; - tmp1.y = src1.w * exp_(src0.w * inv_scale_w, logE) * 0.5f; - dst.xy = tmp0.xy - tmp1.xy; - dst.zw = tmp0.xy + tmp1.xy; - coord.x = 0; - write_imagef(output, coord, dst.xxxx); - coord.x++; - write_imagef(output, coord, dst.yyyy); - coord.x++; - write_imagef(output, coord, dst.zzzz); - coord.x++; - write_imagef(output, coord, dst.wwww); -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary_0.cl b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary_0.cl index e836a48..d9cc57a 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary_0.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary_0.cl @@ -176,6 +176,11 @@ float eltwise_unary_inverse_sigmoid(float x, float alpha, float beta) return log(x1 / x2); } +float eltwise_unary_tan(float x, float alpha, float beta) +{ + return native_tan(x); +} + #define ELTWISE_UNARY_F32_2D(func_name) \ __kernel void func_name##_F32toF32_2D \ @@ -218,6 +223,7 @@ ELTWISE_UNARY_F32_2D(atan) ELTWISE_UNARY_F32_2D(atanh) ELTWISE_UNARY_F32_2D(acosh) ELTWISE_UNARY_F32_2D(inverse_sigmoid) +ELTWISE_UNARY_F32_2D(tan) #define ELTWISE_UNARY_U8_2D(func_name) \ __kernel void func_name##_U8toU8_2D \ @@ -261,6 +267,7 @@ ELTWISE_UNARY_U8_2D(atan) ELTWISE_UNARY_U8_2D(atanh) ELTWISE_UNARY_U8_2D(acosh) ELTWISE_UNARY_U8_2D(inverse_sigmoid) +ELTWISE_UNARY_U8_2D(tan) #define ELTWISE_UNARY_U8toF32_2D(func_name) \ __kernel void func_name##_U8toF32_2D \ @@ -303,6 +310,7 @@ ELTWISE_UNARY_U8toF32_2D(atan) ELTWISE_UNARY_U8toF32_2D(atanh) ELTWISE_UNARY_U8toF32_2D(acosh) ELTWISE_UNARY_U8toF32_2D(inverse_sigmoid) +ELTWISE_UNARY_U8toF32_2D(tan) __kernel void neg_I32toI32_2D ( diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary_1.cl b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary_1.cl index 2adf398..767e8c5 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary_1.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary_1.cl @@ -175,6 
+175,11 @@ float eltwise_unary_inverse_sigmoid(float x, float alpha, float beta) return log(x1 / x2); } +float eltwise_unary_tan(float x, float alpha, float beta) +{ + return native_tan(x); +} + #define ELTWISE_UNARY_F32(func_name) \ __kernel void func_name##_F32toF32 \ ( \ @@ -216,6 +221,7 @@ ELTWISE_UNARY_F32(atan) ELTWISE_UNARY_F32(atanh) ELTWISE_UNARY_F32(acosh) ELTWISE_UNARY_F32(inverse_sigmoid) +ELTWISE_UNARY_F32(tan) #define ELTWISE_UNARY_U8(func_name) \ __kernel void func_name##_U8toU8 \ @@ -259,6 +265,7 @@ ELTWISE_UNARY_U8(atan) ELTWISE_UNARY_U8(atanh) ELTWISE_UNARY_U8(acosh) ELTWISE_UNARY_U8(inverse_sigmoid) +ELTWISE_UNARY_U8(tan) #define ELTWISE_UNARY_U8toF32(func_name) \ __kernel void func_name##_U8toF32 \ @@ -301,6 +308,7 @@ ELTWISE_UNARY_U8toF32(atan) ELTWISE_UNARY_U8toF32(atanh) ELTWISE_UNARY_U8toF32(acosh) ELTWISE_UNARY_U8toF32(inverse_sigmoid) +ELTWISE_UNARY_U8toF32(tan) __kernel void neg_I32toI32 ( diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/grucell_activation_z_h.cl b/src/tim/vx/internal/src/libnnext/ops/cl/grucell_activation_z_h.cl index dd2e562..8215ee7 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/grucell_activation_z_h.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/grucell_activation_z_h.cl @@ -31,7 +31,8 @@ __kernel void grucell_activation_z_h_U8_F32toU8_##act_name( \ __read_only image2d_t hstate_h_conv, \ __write_only image2d_t output, \ __write_only image2d_t hstate_out, \ - float input_scale, float input_tail, float output_scale, float output_zp) \ + float input_scale, float input_tail, float output_scale, float output_zp, \ + float output_scale1, float output_zp1) \ { \ int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \ float4 src0, src1, src2, src3; \ @@ -48,10 +49,12 @@ __kernel void grucell_activation_z_h_U8_F32toU8_##act_name( \ z.x = act_func(z.x); \ h = tanh_func(h.x); \ float4 dst = (1 - z ) * h + z * h_tm; \ - dst = dst * output_scale + output_zp; \ - uint4 result = convert_uint4_sat_rte(dst); \ + float4 out0 = dst * output_scale + output_zp; \ + float4 out1 = dst * output_scale1 + output_zp1; \ + uint4 result = convert_uint4_sat_rte(out0); \ + uint4 result1 = convert_uint4_sat_rte(out1); \ write_imageui(output, coord_in.xy, result); \ - write_imageui(hstate_out, coord_in.xy, result); \ + write_imageui(hstate_out, coord_in.xy, result1); \ } GRUCELL_ACTIVATION_U8_F32_U8(SIGMOID, sigmoid) //GRUCELL_ACTIVATION_U8_F32_U8(HARD_SIGMOID, hard_sigmoid) @@ -65,7 +68,8 @@ __kernel void grucell_activation_z_h_F32_F32toF32_##act_name( \ __read_only image2d_t hstate_h_conv, \ __write_only image2d_t output, \ __write_only image2d_t hstate_out, \ - float input_scale, float input_tail, float output_scale, float output_zp) \ + float input_scale, float input_tail, float output_scale, float output_zp, \ + float output_scale1, float output_zp1) \ { \ int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \ float4 src0, src1, src2, src3; \ @@ -97,7 +101,8 @@ __kernel void grucell_activation_z_h_I32_F32toI32_##act_name( \ __read_only image2d_t hstate_h_conv, \ __write_only image2d_t output, \ __write_only image2d_t hstate_out, \ - float input_scale, float input_tail, float output_scale, float output_zp) \ + float input_scale, float input_tail, float output_scale, float output_zp, \ + float output_scale1, float output_zp1) \ { \ int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \ float4 src0, src1, src2, src3; \ @@ -114,10 +119,12 @@ __kernel void grucell_activation_z_h_I32_F32toI32_##act_name( \ z.x = act_func(z.x); \ h = 
tanh_func(h.x); \ float4 dst = (1 - z ) * h + z * h_tm; \ - dst = dst * output_scale + output_zp; \ - int4 result = convert_int4_sat_rte(dst); \ + float4 out0 = dst * output_scale + output_zp; \ + float4 out1 = dst * output_scale1 + output_zp1; \ + int4 result = convert_int4_sat_rte(out0); \ + int4 result1 = convert_int4_sat_rte(out1); \ write_imagei(output, coord_in.xy, result); \ - write_imagei(hstate_out, coord_in.xy, result); \ + write_imagei(hstate_out, coord_in.xy, result1); \ } GRUCELL_ACTIVATION_I32_F32_I32(SIGMOID, sigmoid) -//GRUCELL_ACTIVATION_U8_F32_U8(HARD_SIGMOID, hard_sigmoid) \ No newline at end of file +//GRUCELL_ACTIVATION_U8_F32_U8(HARD_SIGMOID, hard_sigmoid) diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/grucell_reset_after_activation.cl b/src/tim/vx/internal/src/libnnext/ops/cl/grucell_reset_after_activation.cl index a47b32d..45a6c23 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/grucell_reset_after_activation.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/grucell_reset_after_activation.cl @@ -21,6 +21,12 @@ float tanh_func(float x) return 2 * x - 1; } +float relu_func(float x) +{ + x = x > 0 ? x : 0; + return x; +} + #define GRUCELL_ACTIVATION_U8_F32_U8(act_name, act_func) \ __kernel void grucell_reset_after_activation_U8_F32toU8_##act_name( \ @@ -62,6 +68,7 @@ __kernel void grucell_reset_after_activation_U8_F32toU8_##act_name( \ } GRUCELL_ACTIVATION_U8_F32_U8(SIGMOID, sigmoid) //GRUCELL_ACTIVATION_U8_F32_U8(HARD_SIGMOID, hard_sigmoid) +GRUCELL_ACTIVATION_U8_F32_U8(RELU, relu_func) #define GRUCELL_ACTIVATION_F32_F32_F32(act_name, act_func) \ __kernel void grucell_reset_after_activation_F32_F32toF32_##act_name( \ @@ -101,6 +108,7 @@ __kernel void grucell_reset_after_activation_F32_F32toF32_##act_name( \ GRUCELL_ACTIVATION_F32_F32_F32(SIGMOID, sigmoid) //GRUCELL_ACTIVATION_U8_F32_U8(HARD_SIGMOID, hard_sigmoid) +GRUCELL_ACTIVATION_F32_F32_F32(RELU, relu_func) #define GRUCELL_ACTIVATION_I32_F32_I32(act_name, act_func) \ __kernel void grucell_reset_after_activation_I32_F32toI32_##act_name( \ @@ -141,4 +149,5 @@ __kernel void grucell_reset_after_activation_I32_F32toI32_##act_name( \ write_imagei(hstate_out, coord_in.xy, result); \ } GRUCELL_ACTIVATION_I32_F32_I32(SIGMOID, sigmoid) -//GRUCELL_ACTIVATION_U8_F32_U8(HARD_SIGMOID, hard_sigmoid) \ No newline at end of file +//GRUCELL_ACTIVATION_U8_F32_U8(HARD_SIGMOID, hard_sigmoid) +GRUCELL_ACTIVATION_I32_F32_I32(RELU, relu_func) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/log_softmax_exceed_axis0.cl b/src/tim/vx/internal/src/libnnext/ops/cl/log_softmax_exceed_axis0.cl new file mode 100644 index 0000000..e9b8d76 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/log_softmax_exceed_axis0.cl @@ -0,0 +1,167 @@ +#pragma OPENCL EXTENSION cl_viv_vx_extension : enable +#include "cl_viv_vx_ext.h" + +_viv_uniform int width; +_viv_uniform int height; + + +#define rlogE (0.693147182f) +float LOG(float x) +{ + x = log2(x); + return x * rlogE; +} + +__kernel void log_softmax_exceed_axis0_F32toF32( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, float beta, + float scale, float scaleOut, float zpOut) +{ + int x = get_global_id(0); + int z = get_global_id(1); + int4 coord_in = (int4)(0, 0, z, 0); + float4 maxValue; + float4 src, dst = {0.0}; + + maxValue = read_imagef(input, coord_in); + for (coord_in.x = 0; coord_in.x < width; coord_in.x++) + { + for (coord_in.y = 0; coord_in.y < height; coord_in.y++) + { + src = read_imagef(input, coord_in); + 
maxValue = maxValue > src ? maxValue : src; + } + } + + // Compute sum. + float sum = 0.f; + for (coord_in.x = 0; coord_in.x < width; coord_in.x++) + { + for (coord_in.y = 0; coord_in.y < height; coord_in.y++) + { + src = read_imagef(input, coord_in); + sum += exp2((src.x - maxValue.x) * scale); + } + } + + // Compute result. + float logSum = LOG(sum); + for (coord_in.x = 0; coord_in.x < width; coord_in.x++) + { + for (coord_in.y = 0; coord_in.y < height; coord_in.y++) + { + src = read_imagef(input, coord_in); + dst.x = (src.x - maxValue.x) * beta - logSum; + write_imagef(output, coord_in, dst); + } + } +} + +__kernel void log_softmax_exceed_axis0_U8toU8( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, float beta, + float scale, float scaleOut, float zpOut) +{ + int x = get_global_id(0); + int z = get_global_id(1); + int4 coord_in = (int4)(0, 0, z, 0); + float4 maxValue; + float4 src; + uint4 dst = {0}; + + maxValue = convert_float4(read_imageui(input, coord_in)); + for (coord_in.x = 0; coord_in.x < width; coord_in.x++) + { + for (coord_in.y = 0; coord_in.y < height; coord_in.y++) + { + src = convert_float4(read_imageui(input, coord_in)); + maxValue = maxValue > src ? maxValue : src; + } + } + + // Compute sum. + float sum = 0.f; + for (coord_in.x = 0; coord_in.x < width; coord_in.x++) + { + for (coord_in.y = 0; coord_in.y < height; coord_in.y++) + { + src = convert_float4(read_imageui(input, coord_in)); + sum += exp2((src.x - maxValue.x) * scale); + } + } + + // Compute result. + float logSum = LOG(sum); + for (coord_in.x = 0; coord_in.x < width; coord_in.x++) + { + for (coord_in.y = 0; coord_in.y < height; coord_in.y++) + { + src = convert_float4(read_imageui(input, coord_in)); + + dst.x = convert_uint(((src.x - maxValue.x) * beta - logSum) * scaleOut + zpOut); + + write_imageui(output, coord_in, dst); + } + } +} + + +__kernel void log_softmax_exceed_axis0_BF16toBF16( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, float beta, + float scale, float scaleOut, float zpOut) +{ + int x = get_global_id(0); + int z = get_global_id(1); + int4 coord_in = (int4)(0, 0, z, 0); + float4 maxValue, src, dst = {0.0}; + uint4 data, val, out; + + data = read_imageui(input, coord_in); + data = data << 16; + _viv_asm(COPY, maxValue, data, 16); + for (coord_in.x = 0; coord_in.x < width; coord_in.x++) + { + for (coord_in.y = 0; coord_in.y < height; coord_in.y++) + { + data = read_imageui(input, coord_in); + data = data << 16; + _viv_asm(COPY, src, data, 16); + + maxValue = maxValue > src ? 
maxValue : src; + } + } + + float sum = 0.f; + for (coord_in.x = 0; coord_in.x < width; coord_in.x++) + { + for (coord_in.y = 0; coord_in.y < height; coord_in.y++) + { + data = read_imageui(input, coord_in); + data = data << 16; + _viv_asm(COPY, src, data, 16); + + sum += exp2((src.x - maxValue.x) * scale); + } + } + + float logSum = LOG(sum); + for (coord_in.x = 0; coord_in.x < width; coord_in.x++) + { + for (coord_in.y = 0; coord_in.y < height; coord_in.y++) + { + data = read_imageui(input, coord_in); + data = data << 16; + _viv_asm(COPY, src, data, 16); + + dst.x = (src.x - maxValue.x) * beta - logSum; + _viv_asm(COPY, val, dst, 16); + out = val >> 16; + write_imageui(output, coord_in, out); + } + } +} +#undef rlogE diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/log_softmax_exceed_axis1.cl b/src/tim/vx/internal/src/libnnext/ops/cl/log_softmax_exceed_axis1.cl new file mode 100644 index 0000000..f6d0afc --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/log_softmax_exceed_axis1.cl @@ -0,0 +1,172 @@ +#pragma OPENCL EXTENSION cl_viv_vx_extension : enable +#include "cl_viv_vx_ext.h" + +_viv_uniform int height; +_viv_uniform int depth; + +#define rlogE (0.693147182f) + +float LOG(float x) +{ + x = log2(x); + return x * rlogE; +} + +__kernel void log_softmax_exceed_axis1_F32toF32( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, float beta, + float scale, float scaleOut, float zpOut) +{ + int x = get_global_id(0); + int y = get_global_id(1); + int4 coord_in = (int4)(x, 0, 0, 0); + float4 maxValue; + float4 src, dst = {0.0}; + + maxValue = read_imagef(input, coord_in); + for (coord_in.y = 0; coord_in.y < height; coord_in.y++) + { + for (coord_in.z = 0; coord_in.z < depth; coord_in.z++) + { + src = read_imagef(input, coord_in); + maxValue = maxValue > src ? maxValue : src; + } + } + + // Compute sum. + float sum = 0.f; + for (coord_in.y = 0; coord_in.y < height; coord_in.y++) + { + for (coord_in.z = 0; coord_in.z < depth; coord_in.z++) + { + src = read_imagef(input, coord_in); + sum += exp2((src.x - maxValue.x) * scale); + } + } + + // Compute result. + float logSum = LOG(sum); + for (coord_in.y = 0; coord_in.y < height; coord_in.y++) + { + for (coord_in.z = 0; coord_in.z < depth; coord_in.z++) + { + src = read_imagef(input, coord_in); + + dst.x = (src.x - maxValue.x) * beta - logSum; + write_imagef(output, coord_in, dst); + } + } +} + +__kernel void log_softmax_exceed_axis1_U8toU8( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, float beta, + float scale, float scaleOut, float zpOut) +{ + int x = get_global_id(0); + int y = get_global_id(1); + int4 coord_in = (int4)(x, 0, 0, 0); + float4 maxValue; + float4 src; + uint4 dst = {0}; + + maxValue = convert_float4(read_imageui(input, coord_in)); + for (coord_in.y = 0; coord_in.y < height; coord_in.y++) + { + for (coord_in.z = 0; coord_in.z < depth; coord_in.z++) + { + src = convert_float4(read_imageui(input, coord_in)); + + maxValue = maxValue > src ? maxValue : src; + } + } + + // Compute sum. + float sum = 0.f; + for (coord_in.y = 0; coord_in.y < height; coord_in.y++) + { + for (coord_in.z = 0; coord_in.z < depth; coord_in.z++) + { + src = convert_float4(read_imageui(input, coord_in)); + + sum += exp2((src.x - maxValue.x) * scale); + } + } + + // Compute result. 
+ float logSum = LOG(sum); + for (coord_in.y = 0; coord_in.y < height; coord_in.y++) + { + for (coord_in.z = 0; coord_in.z < depth; coord_in.z++) + { + src = convert_float4(read_imageui(input, coord_in)); + + dst.x = convert_uint(((src.x - maxValue.x) * beta - logSum) * scaleOut + zpOut); + + write_imageui(output, coord_in, dst); + } + } +} + +__kernel void log_softmax_exceed_axis1_BF16oBF16( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, float beta, + float scale, float scaleOut, float zpOut) +{ + int x = get_global_id(0); + int y = get_global_id(1); + int4 coord_in = (int4)(x, 0, 0, 0); + float4 maxValue, src, dst = {0.0}; + uint4 data, val, out; + + data = read_imageui(input, coord_in); + data = data << 16; + _viv_asm(COPY, maxValue, data, 16); + for (coord_in.y = 0; coord_in.y < height; coord_in.y++) + { + for (coord_in.z = 0; coord_in.z < depth; coord_in.z++) + { + data = read_imageui(input, coord_in); + data = data << 16; + _viv_asm(COPY, src, data, 16); + + maxValue = maxValue > src ? maxValue : src; + } + } + + float sum = 0.f; + for (coord_in.y = 0; coord_in.y < height; coord_in.y++) + { + for (coord_in.z = 0; coord_in.z < depth; coord_in.z++) + { + data = read_imageui(input, coord_in); + data = data << 16; + _viv_asm(COPY, src, data, 16); + + sum += exp2((src.x - maxValue.x) * scale); + } + } + + float logSum = LOG(sum); + for (coord_in.y = 0; coord_in.y < height; coord_in.y++) + { + for (coord_in.z = 0; coord_in.z < depth; coord_in.z++) + { + data = read_imageui(input, coord_in); + data = data << 16; + _viv_asm(COPY, src, data, 16); + + dst.x = (src.x - maxValue.x) * beta - logSum; + + _viv_asm(COPY, val, dst, 16); + out = val >> 16; + + write_imageui(output, coord_in, out); + } + } +} + +#undef rlogE diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul_4x.cl b/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul_4x.cl index e4cc547..4de7918 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul_4x.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/matrixmul_4x.cl @@ -123,5 +123,133 @@ __kernel void gemm_4x_transa_F32F32toF32_2D( } +__kernel __attribute__((reqd_work_group_size(1, 64, 1))) + void gemm_4x_transa_local_F32F32toF32_2D( + __read_only image2d_t inputA, + __read_only image2d_t inputB, + __write_only image2d_t output, + int M, + int K, + int N, + int ac2zero, + int bc2zero, + float scale_a, + float zp_a, + float scale_b, + float zp_b, + float scale_out, + float zp_out + ) +{ + int offset0 = get_global_id(0); + int lid = get_local_id(1); + int stride = 0; + + int z = 0; + int offset1 = M << 2; + int step = K >> 8; + int lid2 = lid * 4 * step; + + Image in0_tensor = create_image_from_image2d(inputA, 4); + __global float* in0_ptr0 = (__global float*)in0_tensor.ptr + offset0 + lid2 * M; + __global float* in0_ptr1 = in0_ptr0 + M; + __global float* in0_ptr2 = in0_ptr1 + M; + __global float* in0_ptr3 = in0_ptr2 + M; + + Image in1_tensor = create_image_from_image2d(inputB, 4); + __global float* in1_ptr = (__global float*)in1_tensor.ptr + lid2; + + Image o_tensor = create_image_from_image2d(output, 4); + __global float* output_ptr = (__global float*)o_tensor.ptr + offset0; + + __local float4 sum_vec4_0[64]; + __local float4 sum_vec4_1[64]; + __local float4 sum_vec4_2[64]; + __local float4 sum_vec4_3[64]; + + float4 sum0 = (float4)(0.0, 0.0, 0.0, 0.0); + float4 sum1 = (float4)(0.0, 0.0, 0.0, 0.0); + float4 sum2 = (float4)(0.0, 0.0, 0.0, 0.0); + float4 sum3 = (float4)(0.0, 0.0, 0.0, 0.0); + + float4 tempA0, tempA1, tempA2, 
tempA3; + float4 tempA4, tempA5, tempA6, tempA7; + float4 tempB0; + + for(z = 0; z < step; z++) + { + tempB0 = vload4(z, in1_ptr); + tempA0 = vload4(0, in0_ptr0); + tempA1 = vload4(0, in0_ptr1); + tempA2 = vload4(0, in0_ptr2); + tempA3 = vload4(0, in0_ptr3); + tempA4 = vload4(1, in0_ptr0); + tempA5 = vload4(1, in0_ptr1); + tempA6 = vload4(1, in0_ptr2); + tempA7 = vload4(1, in0_ptr3); + + sum0 = sum0 + tempA0 * tempB0.x; + sum0 = sum0 + tempA1 * tempB0.y; + sum0 = sum0 + tempA2 * tempB0.z; + sum0 = sum0 + tempA3 * tempB0.w; + sum1 = sum1 + tempA4 * tempB0.x; + sum1 = sum1 + tempA5 * tempB0.y; + sum1 = sum1 + tempA6 * tempB0.z; + sum1 = sum1 + tempA7 * tempB0.w; + + tempA0 = vload4(2, in0_ptr0); + tempA1 = vload4(2, in0_ptr1); + tempA2 = vload4(2, in0_ptr2); + tempA3 = vload4(2, in0_ptr3); + tempA4 = vload4(3, in0_ptr0); + tempA5 = vload4(3, in0_ptr1); + tempA6 = vload4(3, in0_ptr2); + tempA7 = vload4(3, in0_ptr3); + + in0_ptr0 = in0_ptr0 + offset1; + in0_ptr1 = in0_ptr1 + offset1; + in0_ptr2 = in0_ptr2 + offset1; + in0_ptr3 = in0_ptr3 + offset1; + + sum2 = sum2 + tempA0 * tempB0.x; + sum2 = sum2 + tempA1 * tempB0.y; + sum2 = sum2 + tempA2 * tempB0.z; + sum2 = sum2 + tempA3 * tempB0.w; + sum3 = sum3 + tempA4 * tempB0.x; + sum3 = sum3 + tempA5 * tempB0.y; + sum3 = sum3 + tempA6 * tempB0.z; + sum3 = sum3 + tempA7 * tempB0.w; + } + sum_vec4_0[lid] = sum0; + sum_vec4_1[lid] = sum1; + sum_vec4_2[lid] = sum2; + sum_vec4_3[lid] = sum3; + + barrier(CLK_LOCAL_MEM_FENCE); + + for (stride = 32; stride > 0; stride >>= 1) + { + if (lid < stride) + { + sum_vec4_0[lid] += sum_vec4_0[lid + stride]; + sum_vec4_1[lid] += sum_vec4_1[lid + stride]; + sum_vec4_2[lid] += sum_vec4_2[lid + stride]; + sum_vec4_3[lid] += sum_vec4_3[lid + stride]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + if (lid == 0) + { + sum0 = sum_vec4_0[0]; + sum1 = sum_vec4_1[0]; + sum2 = sum_vec4_2[0]; + sum3 = sum_vec4_3[0]; + vstore4(sum0, 0, output_ptr); + vstore4(sum1, 1, output_ptr); + vstore4(sum2, 2, output_ptr); + vstore4(sum3, 3, output_ptr); + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/resize_cubic.cl b/src/tim/vx/internal/src/libnnext/ops/cl/resize_cubic.cl new file mode 100644 index 0000000..7cdce1c --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/resize_cubic.cl @@ -0,0 +1,195 @@ +__kernel void resize_cubic_F32toF32( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float scale_x, + float scale_y, + float half_pixel_value + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + float cubic_coeffs_y[4] = {0,0,0,0}; + float cubic_coeffs_x[4] = {0,0,0,0}; + float in_x = (convert_float(coord_out.x) + half_pixel_value) * scale_x - half_pixel_value; + float left_x_f = floor(in_x); + float4 delta_x = (float4)(0, in_x - left_x_f,0,0); + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_y - half_pixel_value; + float top_y_f = floor(in_y); + float4 delta_y = (float4)(0, in_y - top_y_f,0,0); + int x_idx = convert_int(left_x_f - 1); + int y_idx = convert_int(top_y_f - 1); + int4 coord_in = (int4)(x_idx, y_idx, coord_out.z, 0); + float data00, data01, data02, data03, data10, data11, data12, data13, + data20, data21, data22, data23, data30, data31, data32, data33; + + delta_x.x = 1 + delta_x.y; + delta_x.z = 1 - delta_x.y; + delta_x.w = 2 - delta_x.y; + cubic_coeffs_x[0] = -0.5 * ((((delta_x.x - 5) * delta_x.x + 8) * delta_x.x) - 4); + cubic_coeffs_x[1] = (1.5 * delta_x.y - 2.5) * delta_x.y * delta_x.y + 1; + cubic_coeffs_x[2] = (1.5 * 
delta_x.z - 2.5) * delta_x.z * delta_x.z + 1; + cubic_coeffs_x[3] = -0.5 * ((((delta_x.w - 5) * delta_x.w + 8) * delta_x.w) - 4); + delta_y.x = 1 + delta_y.y; + delta_y.z = 1 - delta_y.y; + delta_y.w = 2 - delta_y.y; + cubic_coeffs_y[0] = -0.5 * ((((delta_y.x - 5) * delta_y.x + 8) * delta_y.x) - 4); + cubic_coeffs_y[1] = (1.5 * delta_y.y - 2.5) * delta_y.y * delta_y.y + 1; + cubic_coeffs_y[2] = (1.5 * delta_y.z - 2.5) * delta_y.z * delta_y.z + 1; + cubic_coeffs_y[3] = -0.5 * ((((delta_y.w - 5) * delta_y.w + 8) * delta_y.w) - 4); + float4 dst = (float4)(0,0,0,0); + + data00 = read_imagef(input, coord_in).x; + coord_in.x++; + data10 = read_imagef(input, coord_in).x; + coord_in.x++; + data20 = read_imagef(input, coord_in).x; + coord_in.x++; + data30 = read_imagef(input, coord_in).x; + + coord_in.y++; + data31 = read_imagef(input, coord_in).x; + coord_in.x--; + data21 = read_imagef(input, coord_in).x; + coord_in.x--; + data11 = read_imagef(input, coord_in).x; + coord_in.x--; + data01 = read_imagef(input, coord_in).x; + + coord_in.y++; + data02 = read_imagef(input, coord_in).x; + coord_in.x++; + data12 = read_imagef(input, coord_in).x; + coord_in.x++; + data22 = read_imagef(input, coord_in).x; + coord_in.x++; + data32 = read_imagef(input, coord_in).x; + + coord_in.y++; + data33 = read_imagef(input, coord_in).x; + coord_in.x--; + data23 = read_imagef(input, coord_in).x; + coord_in.x--; + data13 = read_imagef(input, coord_in).x; + coord_in.x--; + data03 = read_imagef(input, coord_in).x; + + dst.x = data00 * cubic_coeffs_x[0] * cubic_coeffs_y[0] + + data01 * cubic_coeffs_x[0] * cubic_coeffs_y[1] + + data02 * cubic_coeffs_x[0] * cubic_coeffs_y[2] + + data03 * cubic_coeffs_x[0] * cubic_coeffs_y[3] + + data10 * cubic_coeffs_x[1] * cubic_coeffs_y[0] + + data11 * cubic_coeffs_x[1] * cubic_coeffs_y[1] + + data12 * cubic_coeffs_x[1] * cubic_coeffs_y[2] + + data13 * cubic_coeffs_x[1] * cubic_coeffs_y[3] + + data20 * cubic_coeffs_x[2] * cubic_coeffs_y[0] + + data21 * cubic_coeffs_x[2] * cubic_coeffs_y[1] + + data22 * cubic_coeffs_x[2] * cubic_coeffs_y[2] + + data23 * cubic_coeffs_x[2] * cubic_coeffs_y[3] + + data30 * cubic_coeffs_x[3] * cubic_coeffs_y[0] + + data31 * cubic_coeffs_x[3] * cubic_coeffs_y[1] + + data32 * cubic_coeffs_x[3] * cubic_coeffs_y[2] + + data33 * cubic_coeffs_x[3] * cubic_coeffs_y[3]; + + write_imagef(output, coord_out, dst); + +} + + +__kernel void resize_cubic_U8toU8( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float scale_x, + float scale_y, + float half_pixel_value, + float in_scale, + float in_tail, + float out_scale, + float out_tail + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + float cubic_coeffs_y[4] = {0,0,0,0}; + float cubic_coeffs_x[4] = {0,0,0,0}; + float in_x = (convert_float(coord_out.x) + half_pixel_value) * scale_x - half_pixel_value; + float left_x_f = floor(in_x); + float4 delta_x = (float4)(0, in_x - left_x_f,0,0); + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_y - half_pixel_value; + float top_y_f = floor(in_y); + float4 delta_y = (float4)(0, in_y - top_y_f,0,0); + int x_idx = convert_int(left_x_f - 1); + int y_idx = convert_int(top_y_f - 1); + int4 coord_in = (int4)(x_idx, y_idx, coord_out.z, 0); + float data00, data01, data02, data03, data10, data11, data12, data13, + data20, data21, data22, data23, data30, data31, data32, data33; + + delta_x.x = 1 + delta_x.y; + delta_x.z = 1 - delta_x.y; + delta_x.w = 2 - delta_x.y; + cubic_coeffs_x[0] = -0.5 * ((((delta_x.x 
- 5) * delta_x.x + 8) * delta_x.x) - 4); + cubic_coeffs_x[1] = (1.5 * delta_x.y - 2.5) * delta_x.y * delta_x.y + 1; + cubic_coeffs_x[2] = (1.5 * delta_x.z - 2.5) * delta_x.z * delta_x.z + 1; + cubic_coeffs_x[3] = -0.5 * ((((delta_x.w - 5) * delta_x.w + 8) * delta_x.w) - 4); + delta_y.x = 1 + delta_y.y; + delta_y.z = 1 - delta_y.y; + delta_y.w = 2 - delta_y.y; + cubic_coeffs_y[0] = -0.5 * ((((delta_y.x - 5) * delta_y.x + 8) * delta_y.x) - 4); + cubic_coeffs_y[1] = (1.5 * delta_y.y - 2.5) * delta_y.y * delta_y.y + 1; + cubic_coeffs_y[2] = (1.5 * delta_y.z - 2.5) * delta_y.z * delta_y.z + 1; + cubic_coeffs_y[3] = -0.5 * ((((delta_y.w - 5) * delta_y.w + 8) * delta_y.w) - 4); + float dst = 0; + uint4 out = (uint4)(0,0,0,0); + + data00 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail; + coord_in.x++; + data10 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail; + coord_in.x++; + data20 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail; + coord_in.x++; + data30 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail; + + coord_in.y++; + data31 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail; + coord_in.x--; + data21 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail; + coord_in.x--; + data11 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail; + coord_in.x--; + data01 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail; + + coord_in.y++; + data02 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail; + coord_in.x++; + data12 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail; + coord_in.x++; + data22 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail; + coord_in.x++; + data32 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail; + + coord_in.y++; + data33 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail; + coord_in.x--; + data23 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail; + coord_in.x--; + data13 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail; + coord_in.x--; + data03 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail; + + dst = data00 * cubic_coeffs_x[0] * cubic_coeffs_y[0] + + data01 * cubic_coeffs_x[0] * cubic_coeffs_y[1] + + data02 * cubic_coeffs_x[0] * cubic_coeffs_y[2] + + data03 * cubic_coeffs_x[0] * cubic_coeffs_y[3] + + data10 * cubic_coeffs_x[1] * cubic_coeffs_y[0] + + data11 * cubic_coeffs_x[1] * cubic_coeffs_y[1] + + data12 * cubic_coeffs_x[1] * cubic_coeffs_y[2] + + data13 * cubic_coeffs_x[1] * cubic_coeffs_y[3] + + data20 * cubic_coeffs_x[2] * cubic_coeffs_y[0] + + data21 * cubic_coeffs_x[2] * cubic_coeffs_y[1] + + data22 * cubic_coeffs_x[2] * cubic_coeffs_y[2] + + data23 * cubic_coeffs_x[2] * cubic_coeffs_y[3] + + data30 * cubic_coeffs_x[3] * cubic_coeffs_y[0] + + data31 * cubic_coeffs_x[3] * cubic_coeffs_y[1] + + data32 * cubic_coeffs_x[3] * cubic_coeffs_y[2] + + data33 * cubic_coeffs_x[3] * cubic_coeffs_y[3]; + out.x = convert_uint(dst * out_scale + out_tail); + + write_imageui(output, coord_out, out); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/scatter_nd_update_reduction.cl b/src/tim/vx/internal/src/libnnext/ops/cl/scatter_nd_update_reduction.cl new file mode 100644 index 0000000..ff57204 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/scatter_nd_update_reduction.cl @@ -0,0 +1,203 @@ + +inline void AtomicAdd_float(volatile __global float *source, 
const float operand) +{ + union + { + unsigned int intVal; + float floatVal; + } newVal; + union + { + unsigned int intVal; + float floatVal; + } prevVal; + do + { + prevVal.floatVal = *source; + newVal.floatVal = prevVal.floatVal + operand; + } while(atomic_cmpxchg((volatile __global unsigned int *)source, + prevVal.intVal, newVal.intVal) != prevVal.intVal); +} + +inline void AtomicMul_float(volatile __global float *source, const float operand) +{ + union + { + unsigned int intVal; + float floatVal; + } newVal; + union + { + unsigned int intVal; + float floatVal; + } prevVal; + do + { + prevVal.floatVal = *source; + newVal.floatVal = prevVal.floatVal * operand; + } while(atomic_cmpxchg((volatile __global unsigned int *)source, + prevVal.intVal, newVal.intVal) != prevVal.intVal); +} + +inline void AtomicMax_float(volatile __global float *source, const float operand) +{ + union + { + unsigned int intVal; + float floatVal; + } newVal; + union + { + unsigned int intVal; + float floatVal; + } prevVal; + do + { + prevVal.floatVal = *source; + newVal.floatVal = fmax(prevVal.floatVal, operand); + } while(atomic_cmpxchg((volatile __global unsigned int *)source, + prevVal.intVal, newVal.intVal) != prevVal.intVal); +} + +inline void AtomicMin_float(volatile __global float *source, const float operand) +{ + union + { + unsigned int intVal; + float floatVal; + } newVal; + union + { + unsigned int intVal; + float floatVal; + } prevVal; + do + { + prevVal.floatVal = *source; + newVal.floatVal = fmin(prevVal.floatVal, operand); + } while(atomic_cmpxchg((volatile __global unsigned int *)source, + prevVal.intVal, newVal.intVal) != prevVal.intVal); +} + +#define SCATTER_REDUCTION_PREPROCESS(name0, ptr0, type0, size0, ptr2) \ +__kernel void scatter_nd_update_reduction_preprocess_##name0( \ + __read_only image2d_t input_ref, \ + image2d_t temp_buf_float, \ + int length, int res, float input_scale, float zp_scale) \ +{ \ + int gidx = get_global_id(0); \ + Image img1 = create_image_from_image2d(input_ref, size0); \ + Image img2 = create_image_from_image2d(temp_buf_float, 4); \ + __global float* tmp_ref_ptr = (__global float*)img2.ptr; \ + type0 src0, src1; \ + float4 tmpDst0, tmpDst1; \ + __global ptr2* input_ptr = (__global ptr2*)img1.ptr; \ + if(length > 0) \ + { \ + int loc2 = gidx * 8; \ + ptr0 tmpData0 = vload4(0, input_ptr + loc2); \ + ptr0 tmpData1 = vload4(1, input_ptr + loc2); \ + _viv_asm(COPY, src0, tmpData0, 16); \ + _viv_asm(COPY, src1, tmpData1, 16); \ + _viv_asm(CONV, tmpDst0, src0); \ + _viv_asm(CONV, tmpDst1, src1); \ + tmpDst0 = tmpDst0 * input_scale + zp_scale; \ + tmpDst1 = tmpDst1 * input_scale + zp_scale; \ + vstore4(tmpDst0, 0, tmp_ref_ptr + loc2); \ + vstore4(tmpDst1, 1, tmp_ref_ptr + loc2); \ + } \ + for(int i = gidx; i < res; i += get_global_size(0)) \ + { \ + ptr2 tmpData0 = input_ptr[length + i]; \ + _viv_asm(COPY, src0, tmpData0, 4); \ + _viv_asm(CONV, tmpDst0, src0); \ + tmpDst0.x = tmpDst0.x * input_scale + zp_scale; \ + tmp_ref_ptr[length + i] = tmpDst0.x; \ + } \ +} +SCATTER_REDUCTION_PREPROCESS(U8, uchar4, uchar4, 1, uchar) +SCATTER_REDUCTION_PREPROCESS(I8, char4, char4, 1, char) +SCATTER_REDUCTION_PREPROCESS(I16, short4, short4, 2, short) +SCATTER_REDUCTION_PREPROCESS(F16, short4, half4, 2, short) +SCATTER_REDUCTION_PREPROCESS(F32, float4, float4, 4, float) + +#define SCATTER_ND_REDUCTION_PROCESS_F16(name0, func) \ +__kernel void scatter_nd_update_reduction_##name0##_F16( \ + __read_only image2d_t index, \ + __read_only image2d_t update, \ + image2d_t temp_buf_float, \ + 
image2d_t link_buffer0, \ + int val0, int val1, int val2, int val3, int val4, int val5, int val6, \ + int coord_dim, int update_width, int output_width, float update_scale, float zp_scale) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + Image img1 = create_image_from_image2d(index, 4); \ + Image img2 = create_image_from_image2d(update, 2); \ + Image img3 = create_image_from_image2d(temp_buf_float, 4); \ + __global int* index_ptr = (__global int*)img1.ptr; \ + __global short* update_ptr = (__global short*)img2.ptr; \ + __global float* output_ptr = (__global float*)img3.ptr; \ + half src; \ + \ + int4 indice = vload4(0, index_ptr + gidy * coord_dim); \ + int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \ + short tmpData = update_ptr[gidy * update_width + gidx]; \ + int idx = indice.x * val0 + indice.y * val1 + indice.z * val2 + indice.w * val3; \ + idx = idx + indice1.x * val4 + indice1.y * val5 + indice1.z * val6; \ + int loc = idx * output_width + gidx; \ + _viv_asm(COPY, src, tmpData, 4); \ + float data; \ + _viv_asm(CONV, data, src); \ + func(output_ptr + loc, data); \ +} +SCATTER_ND_REDUCTION_PROCESS_F16(Add, AtomicAdd_float) +SCATTER_ND_REDUCTION_PROCESS_F16(Mul, AtomicMul_float) +SCATTER_ND_REDUCTION_PROCESS_F16(Max, AtomicMax_float) +SCATTER_ND_REDUCTION_PROCESS_F16(Min, AtomicMin_float) + +#define SCATTER_ND_UPDATE_PROCESS_QINT(name0, src0_type, ptr_type, element_size, func) \ +__kernel void scatter_nd_update_reduction_##name0##_##src0_type( \ + __read_only image2d_t index, \ + __read_only image2d_t update, \ + image2d_t temp_buf_float, \ + image2d_t link_buffer0, \ + int val0, int val1, int val2, int val3, int val4, int val5, int val6, \ + int coord_dim, int update_width, int output_width, float update_scale, float zp_scale) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + Image img1 = create_image_from_image2d(index, 4); \ + Image img2 = create_image_from_image2d(update, element_size); \ + Image img3 = create_image_from_image2d(temp_buf_float, 4); \ + __global int* index_ptr = (__global int*)img1.ptr; \ + __global ptr_type* update_ptr = (__global ptr_type*)img2.ptr; \ + __global float* output_ptr = (__global float*)img3.ptr; \ + \ + int4 indice = vload4(0, index_ptr + gidy * coord_dim); \ + int4 indice1 = coord_dim < 5 ? 
(int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \ + ptr_type tmpData = update_ptr[gidy * update_width + gidx]; \ + int idx = indice.x * val0 + indice.y * val1 + indice.z * val2 + indice.w * val3; \ + idx = idx + indice1.x * val4 + indice1.y * val5 + indice1.z * val6; \ + int loc = idx * output_width + gidx; \ + float data; \ + _viv_asm(CONV, data, tmpData); \ + data = data * update_scale + zp_scale; \ + func(output_ptr + loc, data); \ +} +SCATTER_ND_UPDATE_PROCESS_QINT(Add, U8, uchar, 1, AtomicAdd_float) +SCATTER_ND_UPDATE_PROCESS_QINT(Mul, U8, uchar, 1, AtomicMul_float) +SCATTER_ND_UPDATE_PROCESS_QINT(Max, U8, uchar, 1, AtomicMax_float) +SCATTER_ND_UPDATE_PROCESS_QINT(Min, U8, uchar, 1, AtomicMin_float) +SCATTER_ND_UPDATE_PROCESS_QINT(Add, I8, char, 1, AtomicAdd_float) +SCATTER_ND_UPDATE_PROCESS_QINT(Mul, I8, char, 1, AtomicMul_float) +SCATTER_ND_UPDATE_PROCESS_QINT(Max, I8, char, 1, AtomicMax_float) +SCATTER_ND_UPDATE_PROCESS_QINT(Min, I8, char, 1, AtomicMin_float) +SCATTER_ND_UPDATE_PROCESS_QINT(Add, I16, short, 2, AtomicAdd_float) +SCATTER_ND_UPDATE_PROCESS_QINT(Mul, I16, short, 2, AtomicMul_float) +SCATTER_ND_UPDATE_PROCESS_QINT(Max, I16, short, 2, AtomicMax_float) +SCATTER_ND_UPDATE_PROCESS_QINT(Min, I16, short, 2, AtomicMin_float) +SCATTER_ND_UPDATE_PROCESS_QINT(Add, F32, float, 4, AtomicAdd_float) +SCATTER_ND_UPDATE_PROCESS_QINT(Mul, F32, float, 4, AtomicMul_float) +SCATTER_ND_UPDATE_PROCESS_QINT(Max, F32, float, 4, AtomicMax_float) +SCATTER_ND_UPDATE_PROCESS_QINT(Min, F32, float, 4, AtomicMin_float) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/scatter_nd_update_reduction_conv.cl b/src/tim/vx/internal/src/libnnext/ops/cl/scatter_nd_update_reduction_conv.cl new file mode 100644 index 0000000..80e07f7 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/scatter_nd_update_reduction_conv.cl @@ -0,0 +1,72 @@ +__kernel void scatter_nd_update_reduction_conv_F16( + __read_only image2d_t temp_buf_float, + __read_only image2d_t link_buf, + image2d_t output, + int length, int res, float output_scale, float output_zp) +{ + int gidx = get_global_id(0); + Image img1 = create_image_from_image2d(temp_buf_float, 4); + Image img2 = create_image_from_image2d(output, 2); + __global float* input_ptr = (__global float*)img1.ptr; + __global short* output_ptr = (__global short*)img2.ptr; + if(length > 0) + { + int offset = gidx * 8; + float4 src0 = vload4(0, input_ptr + offset); + float4 src1 = vload4(1, input_ptr + offset); + half4 data0, data1; + _viv_asm(CONV, data0, src0); + _viv_asm(CONV, data1, src1); + short4 dst0, dst1; + _viv_asm(COPY, dst0, data0, 16); + _viv_asm(COPY, dst1, data1, 16); + vstore4(dst0, 0, output_ptr + offset); + vstore4(dst1, 1, output_ptr + offset); + } + for(int i = gidx; i < res; i += get_global_size(0)) + { + float src = input_ptr[length + i]; + half data; + _viv_asm(CONV, data, src); + short dst; + _viv_asm(COPY, dst, data, 4); + output_ptr[length + i] = dst; + } +} + +#define SCATTER_ND_UPDATE_CONV(src0_type, ptr_type, element_size, ptr_type1, conv_func) \ +__kernel void scatter_nd_update_reduction_conv_##src0_type( \ + __read_only image2d_t temp_buf_float, \ + __read_only image2d_t link_buf, \ + image2d_t output, \ + int length, int res, float output_scale, float output_zp) \ +{ \ + int gidx = get_global_id(0); \ + Image img1 = create_image_from_image2d(temp_buf_float, 4); \ + Image img2 = create_image_from_image2d(output, element_size); \ + __global float* input_ptr = (__global float*)img1.ptr; \ + __global ptr_type1* 
output_ptr = (__global ptr_type1*)img2.ptr; \ + if(length > 0) \ + { \ + int offset = gidx * 8; \ + float4 src0 = vload4(0, input_ptr + offset); \ + float4 src1 = vload4(1, input_ptr + offset); \ + int4 data0 = convert_int4_rte(src0 * output_scale + output_zp); \ + int4 data1 = convert_int4_rte(src1 * output_scale + output_zp); \ + ptr_type dst0, dst1; \ + _viv_asm(CONV, dst0, data0); \ + _viv_asm(CONV, dst1, data1); \ + vstore4(dst0, 0, output_ptr + offset); \ + vstore4(dst1, 1, output_ptr + offset); \ + } \ + for(int i = gidx; i < res; i += get_global_size(0)) \ + { \ + float src = input_ptr[length + i]; \ + int data = convert_int_rte(src * output_scale + output_zp); \ + output_ptr[length + i] = conv_func(data); \ + } \ +} +SCATTER_ND_UPDATE_CONV(U8, uchar4, 1, uchar, convert_uchar) +SCATTER_ND_UPDATE_CONV(I8, char4, 1, char, convert_char) +SCATTER_ND_UPDATE_CONV(I16, short4, 2, short, convert_short) +SCATTER_ND_UPDATE_CONV(F32, float4, 4, float, convert_float) diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/swish.cl b/src/tim/vx/internal/src/libnnext/ops/cl/swish.cl index 95254d2..d457a36 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/swish.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/swish.cl @@ -121,7 +121,7 @@ __kernel void swish_I32toI32_2D( src = read_imagef(input, coord); \ tmp.x = sigmoid_(src.x * beta, logE); \ data.x = src.x * tmp.x; \ - uint4 dst = convert_uint4(data * outputScale + outputZP); \ + uint4 dst = convert_uint4_rte(data * outputScale + outputZP); \ write_imageui(output, coord, dst); __kernel void swish_F32toU8( diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/crop_and_resize_bilinear.vx b/src/tim/vx/internal/src/libnnext/ops/vx/crop_and_resize_bilinear.vx new file mode 100644 index 0000000..5e126ec --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/crop_and_resize_bilinear.vx @@ -0,0 +1,255 @@ +#pragma OPENCL EXTENSION cl_viv_vx_extension : enable + +#include "cl_viv_vx_ext.h" + +_viv_uniform float inOutScale; +_viv_uniform float inOutTile; +_viv_uniform float width_scale; +_viv_uniform float height_scale; +_viv_uniform int image_width; +_viv_uniform int image_height; +_viv_uniform VXC_512Bits uniRightToFp32_4x4; +_viv_uniform VXC_512Bits uniLeftToFp32_4x4; +_viv_uniform VXC_512Bits uniExtract8Bit_2x8; +_viv_uniform VXC_512Bits uniExtractHalf8_2x8; + +#define CROP_AND_RESIZE_PART0 \ + int i = 0; \ + int bb = get_global_id(2); \ + int y = get_global_id(1); \ + int4 x = (int4)(get_global_id(0),get_global_id(0) + 1, get_global_id(0) + 2, get_global_id(0) + 3); \ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + int2 coord_box_ind = (int2)(bb, 0); \ + int b = read_imagei(box_ind, coord_box_ind).x; \ + float4 xy, in_x; \ + float in_y; \ + float4 x_lerp, y_lerp; \ + int d = 0; \ + Image img_boxes = create_image_from_image2d(boxes, 2); \ + __global half* boxes_ptr = (__global half*)img_boxes.ptr; \ + xy = vload_half4(bb, boxes_ptr); \ + float _width_scale = convert_float(xy.w - xy.y) * width_scale; \ + float _height_scale = convert_float(xy.z - xy.x) * height_scale; \ + \ + if (_width_scale == 0) xy.y = 0.5 * (xy.y + xy.w); \ + if (_height_scale == 0) xy.x = 0.5 * (xy.x + xy.z); \ + in_y = xy.x * convert_float(image_height - 1) + convert_float(y) * _height_scale; \ + y_lerp.x = in_y - floor(in_y); \ + y_lerp.yzw = y_lerp.xxx; + +#define CROP_AND_RESIZE_PART1 \ + int4 coord = (int4)(0, in_y, d + b * ori_depth, 0); \ + int8 input_desc, output_desc; \ + \ + coord_out.z = d + coord_out.z * ori_depth; \ + \ + 
_viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr = (int)coord.z * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord.w, baseAddr); \ + \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); \ + in_x.x = xy.y * convert_float(image_width - 1); \ + in_x.yzw = in_x.xxx; \ + in_x = in_x + convert_float4(x) * _width_scale; \ + x_lerp = in_x - floor(in_x); \ + coord.x = floor(in_x.x); \ + VXC_OP4(img_load_3d, src0, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, src1, input, coord.xywz, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord.x = floor(in_x.y); \ + VXC_OP4(img_load_3d, src0, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, src1, input, coord.xywz, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord.x = floor(in_x.z); \ + VXC_OP4(img_load_3d, src0, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, src1, input, coord.xywz, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord.x = floor(in_x.w); \ + VXC_OP4(img_load_3d, src0, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, src1, input, coord.xywz, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + +#define CROP_AND_RESIZE_BILINEAR_Quant8toQuant8(name,src_type,dst_type) \ +__kernel void crop_and_resize_bilinear_##name \ +( \ + __read_only image2d_array_t input, \ + __read_only image2d_t boxes, \ + __read_only image2d_t box_ind, \ + __write_only image2d_array_t output, \ + uint ori_depth, \ + uint ori_batchout \ +) \ +{ \ + CROP_AND_RESIZE_PART0; \ + for (d = 0; d < ori_depth; d++) \ + { \ + src_type src0, src1; \ + CROP_AND_RESIZE_PART1; \ + \ + float4 top, bottom, value; \ + float4 top_left4,top_right4,bottom_left4,bottom_right4; \ + dst_type data; \ + int4 tmpout; \ + \ + VXC_DP4x4(top_left4, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniLeftToFp32_4x4); \ + VXC_DP4x4(top_right4, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightToFp32_4x4); \ + VXC_DP4x4(bottom_left4, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniLeftToFp32_4x4); \ + VXC_DP4x4(bottom_right4, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightToFp32_4x4); \ + \ + top = top_left4 + (top_right4 - top_left4) * x_lerp; \ + bottom = bottom_left4 + (bottom_right4 - bottom_left4) * x_lerp; \ + value = top + (bottom - top) * y_lerp; \ + value = value * inOutScale + inOutTile; \ + _viv_asm(CONV, tmpout, value); \ + VXC_DP2x8(data, tmpout, tmpout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtract8Bit_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +CROP_AND_RESIZE_BILINEAR_Quant8toQuant8(U8toU8,vxc_uchar8, vxc_uchar4) +CROP_AND_RESIZE_BILINEAR_Quant8toQuant8(I8toI8,vxc_char8, vxc_char4) + +#define CROP_AND_RESIZE_BILINEAR_16Bitsto16Bits(name,src_type,dst_type,tmp_type) \ +__kernel void crop_and_resize_bilinear_##name \ +( \ + __read_only image2d_array_t input, \ + __read_only image2d_t boxes, \ + __read_only image2d_t box_ind, \ + __write_only image2d_array_t output, \ + uint ori_depth, \ + uint 
ori_batchout \ +) \ +{ \ + CROP_AND_RESIZE_PART0; \ + for (d = 0; d < ori_depth; d++) \ + { \ + vxc_short8 src0, src1; \ + src_type src0_temp, src1_temp; \ + CROP_AND_RESIZE_PART1; \ + \ + _viv_asm(COPY, src0_temp, src0, 16); \ + _viv_asm(COPY, src1_temp, src1, 16); \ + float4 top, bottom, value; \ + float4 top_left4,top_right4,bottom_left4,bottom_right4; \ + dst_type data; \ + vxc_short4 out; \ + tmp_type tmpout; \ + \ + VXC_DP4x4(top_left4, src0_temp, src0_temp, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniLeftToFp32_4x4); \ + VXC_DP4x4(top_right4, src0_temp, src0_temp, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightToFp32_4x4); \ + VXC_DP4x4(bottom_left4, src1_temp, src1_temp, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniLeftToFp32_4x4); \ + VXC_DP4x4(bottom_right4, src1_temp, src1_temp, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightToFp32_4x4); \ + \ + top = top_left4 + (top_right4 - top_left4) * x_lerp; \ + bottom = bottom_left4 + (bottom_right4 - bottom_left4) * x_lerp; \ + value = top + (bottom - top) * y_lerp; \ + value = value * inOutScale + inOutTile; \ + _viv_asm(CONV, tmpout, value); \ + VXC_DP2x8(data, tmpout, tmpout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8); \ + _viv_asm(COPY, out, data, 8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, out, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +CROP_AND_RESIZE_BILINEAR_16Bitsto16Bits(I16toI16, vxc_short8, vxc_short4, short4) +CROP_AND_RESIZE_BILINEAR_16Bitsto16Bits(I16toF16, vxc_short8, vxc_half4, half4) +CROP_AND_RESIZE_BILINEAR_16Bitsto16Bits(F16toF16, vxc_half8, vxc_half4, half4) +CROP_AND_RESIZE_BILINEAR_16Bitsto16Bits(F16toI16, vxc_half8, vxc_short4, short4) + +#define CROP_AND_RESIZE_BILINEAR_F16toQuant8(name,dst_type) \ +__kernel void crop_and_resize_bilinear_F16to##name \ +( \ + __read_only image2d_array_t input, \ + __read_only image2d_t boxes, \ + __read_only image2d_t box_ind, \ + __write_only image2d_array_t output, \ + uint ori_depth, \ + uint ori_batchout \ +) \ +{ \ + CROP_AND_RESIZE_PART0; \ + for (d = 0; d < ori_depth; d++) \ + { \ + vxc_short8 src0, src1; \ + vxc_half8 src0_temp, src1_temp; \ + CROP_AND_RESIZE_PART1; \ + \ + _viv_asm(COPY, src0_temp, src0, 16); \ + _viv_asm(COPY, src1_temp, src1, 16); \ + float4 top, bottom, value; \ + float4 top_left4,top_right4,bottom_left4,bottom_right4; \ + dst_type data; \ + int4 tmpout; \ + \ + VXC_DP4x4(top_left4, src0_temp, src0_temp, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniLeftToFp32_4x4); \ + VXC_DP4x4(top_right4, src0_temp, src0_temp, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightToFp32_4x4); \ + VXC_DP4x4(bottom_left4, src1_temp, src1_temp, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniLeftToFp32_4x4); \ + VXC_DP4x4(bottom_right4, src1_temp, src1_temp, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightToFp32_4x4); \ + \ + top = top_left4 + (top_right4 - top_left4) * x_lerp; \ + bottom = bottom_left4 + (bottom_right4 - bottom_left4) * x_lerp; \ + value = top + (bottom - top) * y_lerp; \ + value = value * inOutScale + inOutTile; \ + _viv_asm(CONV, tmpout, value); \ + VXC_DP2x8(data, tmpout, tmpout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtract8Bit_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, data, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +CROP_AND_RESIZE_BILINEAR_F16toQuant8(U8, vxc_uchar4) +CROP_AND_RESIZE_BILINEAR_F16toQuant8(I8, vxc_char4) + +#define CROP_AND_RESIZE_BILINEAR_Quant8toF16(name,src_type) \ 
+__kernel void crop_and_resize_bilinear_##name##toF16 \ +( \ + __read_only image2d_array_t input, \ + __read_only image2d_t boxes, \ + __read_only image2d_t box_ind, \ + __write_only image2d_array_t output, \ + uint ori_depth, \ + uint ori_batchout \ +) \ +{ \ + CROP_AND_RESIZE_PART0; \ + for (d = 0; d < ori_depth; d++) \ + { \ + src_type src0, src1; \ + CROP_AND_RESIZE_PART1; \ + \ + float4 top, bottom, value; \ + float4 top_left4,top_right4,bottom_left4,bottom_right4; \ + vxc_half4 data; \ + vxc_short4 out; \ + half4 tmpout; \ + \ + VXC_DP4x4(top_left4, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniLeftToFp32_4x4); \ + VXC_DP4x4(top_right4, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightToFp32_4x4); \ + VXC_DP4x4(bottom_left4, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniLeftToFp32_4x4); \ + VXC_DP4x4(bottom_right4, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightToFp32_4x4); \ + \ + top = top_left4 + (top_right4 - top_left4) * x_lerp; \ + bottom = bottom_left4 + (bottom_right4 - bottom_left4) * x_lerp; \ + value = top + (bottom - top) * y_lerp; \ + value = value * inOutScale + inOutTile; \ + _viv_asm(CONV, tmpout, value); \ + VXC_DP2x8(data, tmpout, tmpout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8); \ + _viv_asm(COPY, out, data, 8); \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, out, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +CROP_AND_RESIZE_BILINEAR_Quant8toF16(U8, vxc_uchar8) +CROP_AND_RESIZE_BILINEAR_Quant8toF16(I8, vxc_char8) + + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/crop_and_resize_nearest_neighbor.vx b/src/tim/vx/internal/src/libnnext/ops/vx/crop_and_resize_nearest_neighbor.vx new file mode 100644 index 0000000..b67890f --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/crop_and_resize_nearest_neighbor.vx @@ -0,0 +1,292 @@ +#pragma OPENCL EXTENSION cl_viv_vx_extension : enable + +#include "cl_viv_vx_ext.h" + +_viv_uniform float inOutScale; +_viv_uniform float inOutTile; +_viv_uniform float width_scale; +_viv_uniform float height_scale; +_viv_uniform int image_width; +_viv_uniform int image_height; +_viv_uniform VXC_512Bits uniConvertFstToFp32_4x4; +_viv_uniform VXC_512Bits uniConvertSecToFp32_4x4; +_viv_uniform VXC_512Bits uniExtract8Bit_2x8; +_viv_uniform VXC_512Bits uniExtractHalf8_2x8; + +#define IMG_LOAD(src_type) \ + src_type src; \ + int4 coord = (int4)(0, in_y, d + b * ori_depth, 0); \ + int8 input_desc, output_desc; \ + \ + coord_out.z = d + coord_out.z * ori_depth; \ + \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr = (int)coord.z * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord.w, baseAddr); \ + \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); \ + \ + in_x = convert_int(round(xy.y * convert_float(image_width - 1) + convert_float(x) * _width_scale)); \ + coord.x = in_x; \ + VXC_OP4(img_load_3d, src, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + in_x = convert_int(round(xy.y * convert_float(image_width - 1) + convert_float(x + 1) * _width_scale)); \ + coord.x = in_x; \ + VXC_OP4(img_load_3d, src, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ + in_x = convert_int(round(xy.y * convert_float(image_width - 1) + convert_float(x + 2) * _width_scale)); \ + coord.x = in_x; \ + 
VXC_OP4(img_load_3d, src, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \ + in_x = convert_int(round(xy.y * convert_float(image_width - 1) + convert_float(x + 3) * _width_scale)); \ + coord.x = in_x; \ + VXC_OP4(img_load_3d, src, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \ + in_x = convert_int(round(xy.y * convert_float(image_width - 1) + convert_float(x + 4) * _width_scale)); \ + coord.x = in_x; \ + VXC_OP4(img_load_3d, src, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); \ + in_x = convert_int(round(xy.y * convert_float(image_width - 1) + convert_float(x + 5) * _width_scale)); \ + coord.x = in_x; \ + VXC_OP4(img_load_3d, src, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); \ + in_x = convert_int(round(xy.y * convert_float(image_width - 1) + convert_float(x + 6) * _width_scale)); \ + coord.x = in_x; \ + VXC_OP4(img_load_3d, src, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); \ + in_x = convert_int(round(xy.y * convert_float(image_width - 1) + convert_float(x + 7) * _width_scale)); \ + coord.x = in_x; \ + VXC_OP4(img_load_3d, src, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); \ + +#define CROP_AND_RESIZE_Quant8toQuant8(name, data_type) \ +__kernel void crop_and_resize_nearest_neighbor_##name \ +( \ + __read_only image2d_array_t input, \ + __read_only image2d_t boxes, \ + __read_only image2d_t box_ind, \ + __write_only image2d_array_t output, \ + uint ori_depth, \ + uint ori_batchout \ +) \ +{ \ + int bb = get_global_id(2); \ + int y = get_global_id(1); \ + int x = get_global_id(0); \ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + int2 coord_box_ind = (int2)(bb, 0); \ + int b = read_imagei(box_ind, coord_box_ind).x; \ + float4 xy; \ + int in_x, in_y; \ + int d = 0; \ + Image img_boxes = create_image_from_image2d(boxes, 2); \ + __global half* boxes_ptr = (__global half*)img_boxes.ptr; \ + xy = vload_half4(bb, boxes_ptr); \ + float _width_scale = convert_float(xy.w - xy.y) * width_scale; \ + float _height_scale = convert_float(xy.z - xy.x) * height_scale; \ + \ + if (_width_scale == 0) xy.y = 0.5 * (xy.y + xy.w); \ + if (_height_scale == 0) xy.x = 0.5 * (xy.x + xy.z); \ + in_y = convert_int(round(xy.x * convert_float(image_height - 1) + convert_float(y) * _height_scale)); \ + \ + for (d = 0; d < ori_depth; d++) \ + { \ + data_type data; \ + int4 tmpout0, tmpout1; \ + float4 tmpdata0, tmpdata1; \ + IMG_LOAD(data_type); \ + \ + VXC_DP4x4(tmpdata0, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstToFp32_4x4); \ + VXC_DP4x4(tmpdata1, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecToFp32_4x4); \ + \ + tmpdata0 = tmpdata0 * inOutScale + inOutTile; \ + tmpdata1 = tmpdata1 * inOutScale + inOutTile; \ + _viv_asm(CONV, tmpout0, tmpdata0); \ + _viv_asm(CONV, tmpout1, tmpdata1); \ + \ + VXC_DP2x8(data, tmpout0, tmpout1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Bit_2x8); \ + \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \ + data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +CROP_AND_RESIZE_Quant8toQuant8(U8toU8, vxc_uchar8) +CROP_AND_RESIZE_Quant8toQuant8(I8toI8, vxc_char8) + +#define CROP_AND_RESIZE_Quant8toF16(name, src_type) \ +__kernel void crop_and_resize_nearest_neighbor_##name##toF16 \ +( \ + 
__read_only image2d_array_t input, \ + __read_only image2d_t boxes, \ + __read_only image2d_t box_ind, \ + __write_only image2d_array_t output, \ + uint ori_depth, \ + uint ori_batchout \ +) \ +{ \ + int bb = get_global_id(2); \ + int y = get_global_id(1); \ + int x = get_global_id(0); \ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + int2 coord_box_ind = (int2)(bb, 0); \ + int b = read_imagei(box_ind, coord_box_ind).x; \ + float4 xy; \ + int in_x, in_y; \ + int d = 0; \ + Image img_boxes = create_image_from_image2d(boxes, 2); \ + __global half* boxes_ptr = (__global half*)img_boxes.ptr; \ + xy = vload_half4(bb, boxes_ptr); \ + float _width_scale = convert_float(xy.w - xy.y) * width_scale; \ + float _height_scale = convert_float(xy.z - xy.x) * height_scale; \ + \ + if (_width_scale == 0) xy.y = 0.5 * (xy.y + xy.w); \ + if (_height_scale == 0) xy.x = 0.5 * (xy.x + xy.z); \ + in_y = convert_int(round(xy.x * convert_float(image_height - 1) + convert_float(y) * _height_scale)); \ + \ + for (d = 0; d < ori_depth; d++) \ + { \ + vxc_short8 out; \ + vxc_half8 data; \ + half4 tmpout0, tmpout1; \ + float4 tmpdata0, tmpdata1; \ + IMG_LOAD(src_type); \ + \ + VXC_DP4x4(tmpdata0, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstToFp32_4x4); \ + VXC_DP4x4(tmpdata1, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecToFp32_4x4); \ + \ + tmpdata0 = tmpdata0 * inOutScale + inOutTile; \ + tmpdata1 = tmpdata1 * inOutScale + inOutTile; \ + _viv_asm(CONV, tmpout0, tmpdata0); \ + _viv_asm(CONV, tmpout1, tmpdata1); \ + \ + VXC_DP2x8(data, tmpout0, tmpout1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8); \ + _viv_asm(COPY, out, data, 16); \ + \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, out, \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +CROP_AND_RESIZE_Quant8toF16(U8, vxc_uchar8) +CROP_AND_RESIZE_Quant8toF16(I8, vxc_char8) + +#define CROP_AND_RESIZE_NEAREST_F16toQuant8(name, dst_type) \ +__kernel void crop_and_resize_nearest_neighbor_F16to##name \ +( \ + __read_only image2d_array_t input, \ + __read_only image2d_t boxes, \ + __read_only image2d_t box_ind, \ + __write_only image2d_array_t output, \ + uint ori_depth, \ + uint ori_batchout \ +) \ +{ \ + int bb = get_global_id(2); \ + int y = get_global_id(1); \ + int x = get_global_id(0); \ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + int2 coord_box_ind = (int2)(bb, 0); \ + int b = read_imagei(box_ind, coord_box_ind).x; \ + float4 xy; \ + int in_x, in_y; \ + int d = 0; \ + Image img_boxes = create_image_from_image2d(boxes, 2); \ + __global half* boxes_ptr = (__global half*)img_boxes.ptr; \ + xy = vload_half4(bb, boxes_ptr); \ + float _width_scale = convert_float(xy.w - xy.y) * width_scale; \ + float _height_scale = convert_float(xy.z - xy.x) * height_scale; \ + \ + if (_width_scale == 0) xy.y = 0.5 * (xy.y + xy.w); \ + if (_height_scale == 0) xy.x = 0.5 * (xy.x + xy.z); \ + in_y = convert_int(round(xy.x * convert_float(image_height - 1) + convert_float(y) * _height_scale)); \ + \ + for (d = 0; d < ori_depth; d++) \ + { \ + dst_type data; \ + int4 tmpout0, tmpout1; \ + float4 tmpdata0, tmpdata1; \ + IMG_LOAD(vxc_short8); \ + vxc_half8 src_half; \ + _viv_asm(COPY, src_half, src, 16); \ + \ + VXC_DP4x4(tmpdata0, src_half, src_half, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstToFp32_4x4); \ + VXC_DP4x4(tmpdata1, src_half, src_half, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniConvertSecToFp32_4x4); \ + \ + tmpdata0 = tmpdata0 * inOutScale + inOutTile; \ + tmpdata1 = tmpdata1 * inOutScale + inOutTile; \ + _viv_asm(CONV, tmpout0, tmpdata0); \ + _viv_asm(CONV, tmpout1, tmpdata1); \ + \ + VXC_DP2x8(data, tmpout0, tmpout1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Bit_2x8); \ + \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +CROP_AND_RESIZE_NEAREST_F16toQuant8(U8, vxc_uchar8) +CROP_AND_RESIZE_NEAREST_F16toQuant8(I8, vxc_char8) + +#define CROP_AND_RESIZE_16Bitsto16Bits(name,src_type,dst_type,temp_type) \ +__kernel void crop_and_resize_nearest_neighbor_##name \ +( \ + __read_only image2d_array_t input, \ + __read_only image2d_t boxes, \ + __read_only image2d_t box_ind, \ + __write_only image2d_array_t output, \ + uint ori_depth, \ + uint ori_batchout \ +) \ +{ \ + int bb = get_global_id(2); \ + int y = get_global_id(1); \ + int x = get_global_id(0); \ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + int2 coord_box_ind = (int2)(bb, 0); \ + int b = read_imagei(box_ind, coord_box_ind).x; \ + float4 xy; \ + int in_x, in_y; \ + int d = 0; \ + Image img_boxes = create_image_from_image2d(boxes, 2); \ + __global half* boxes_ptr = (__global half*)img_boxes.ptr; \ + xy = vload_half4(bb, boxes_ptr); \ + float _width_scale = convert_float(xy.w - xy.y) * width_scale; \ + float _height_scale = convert_float(xy.z - xy.x) * height_scale; \ + \ + if (_width_scale == 0) xy.y = 0.5 * (xy.y + xy.w); \ + if (_height_scale == 0) xy.x = 0.5 * (xy.x + xy.z); \ + in_y = convert_int(round(xy.x * convert_float(image_height - 1) + convert_float(y) * _height_scale)); \ + \ + for (d = 0; d < ori_depth; d++) \ + { \ + vxc_short8 out; \ + dst_type data; \ + temp_type tmpout0, tmpout1; \ + float4 tmpdata0, tmpdata1; \ + IMG_LOAD(vxc_short8); \ + src_type src_temp; \ + _viv_asm(COPY, src_temp, src, 16); \ + \ + VXC_DP4x4(tmpdata0, src_temp, src_temp, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstToFp32_4x4); \ + VXC_DP4x4(tmpdata1, src_temp, src_temp, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecToFp32_4x4); \ + \ + _viv_asm(CONV, tmpout0, tmpdata0); \ + _viv_asm(CONV, tmpout1, tmpdata1); \ + \ + VXC_DP2x8(data, tmpout0, tmpout1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8); \ + _viv_asm(COPY, out, data, 16); \ + \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, out, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ +} +CROP_AND_RESIZE_16Bitsto16Bits \ +(F16toF16, vxc_half8, vxc_half8, half4) +CROP_AND_RESIZE_16Bitsto16Bits \ +(F16toI16, vxc_half8, vxc_short8, short4) +CROP_AND_RESIZE_16Bitsto16Bits \ +(I16toF16, vxc_short8, vxc_half8, half4) +CROP_AND_RESIZE_16Bitsto16Bits \ +(I16toI16, vxc_short8, vxc_short8,short4) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d_1.vx index 9a6a9fe..00d13ce 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d_1.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d_1.vx @@ -81,6 +81,11 @@ float4 eltwise_unary_acosh(float4 val) return acosh(val); } +float4 eltwise_unary_tan(float4 val) +{ + return native_tan(val); +} + _viv_uniform float inputScale; _viv_uniform float inputTail; _viv_uniform float outputScale; @@ -198,4 +203,5 @@ ADD_ELTSISE_UNARY_2D(atan) ADD_ELTSISE_UNARY_2D(atanh) //ACOSH ADD_ELTSISE_UNARY_2D(acosh) - +//TAN +ADD_ELTSISE_UNARY_2D(tan) diff --git 
a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d_1.vx index f53c3ff..f8de5fa 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d_1.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d_1.vx @@ -81,6 +81,11 @@ float4 eltwise_unary_acosh(float4 val) return acosh(val); } +float4 eltwise_unary_tan(float4 val) +{ + return native_tan(val); +} + _viv_uniform float inputScale; _viv_uniform float inputTail; _viv_uniform float outputScale; @@ -197,3 +202,5 @@ ADD_ELTSISE_UNARY_3D(atan) ADD_ELTSISE_UNARY_3D(atanh) //ACOSH ADD_ELTSISE_UNARY_3D(acosh) +//TAN +ADD_ELTSISE_UNARY_3D(tan) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather.vx index 73171a8..7ba4fc1 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather.vx @@ -9,7 +9,8 @@ __kernel void gather_I8toI8( __write_only image2d_t output, int block_size, int block_num, - int axis_num + int axis_num, + int is_array ) { int gidx = get_global_id(0); // block_size @@ -34,7 +35,8 @@ __kernel void gather_U8toU8( __write_only image2d_t output, int block_size, int block_num, - int axis_num + int axis_num, + int is_array ) { int gidx = get_global_id(0); // block_size @@ -59,7 +61,8 @@ __kernel void gather_I16toI16( __write_only image2d_t output, int block_size, int block_num, - int axis_num + int axis_num, + int is_array ) { int gidx = get_global_id(0); // block_size @@ -85,7 +88,8 @@ __kernel void gather_F16toF16( __write_only image2d_t output, int block_size, int block_num, - int axis_num + int axis_num, + int is_array ) { int gidx = get_global_id(0); // block_size @@ -110,7 +114,8 @@ __kernel void gather_I8toI8_axis0( __write_only image2d_t output, int block_size, int block_num, - int axis_num + int axis_num, + int is_array ) { int2 coord = (int2)(get_global_id(0), get_global_id(1)); @@ -137,7 +142,8 @@ __kernel void gather_U8toU8_axis0( __write_only image2d_t output, int block_size, int block_num, - int axis_num + int axis_num, + int is_array ) { int2 coord = (int2)(get_global_id(0), get_global_id(1)); @@ -164,7 +170,8 @@ __kernel void gather_I16toI16_axis0( __write_only image2d_t output, int block_size, int block_num, - int axis_num + int axis_num, + int is_array ) { int2 coord = (int2)(get_global_id(0), get_global_id(1)); @@ -191,7 +198,8 @@ __kernel void gather_F16toF16_axis0( __write_only image2d_t output, int block_size, int block_num, - int axis_num + int axis_num, + int is_array ) { int2 coord = (int2)(get_global_id(0), get_global_id(1)); diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_array.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_array.vx index b2009bf..b7729d0 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_array.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_array.vx @@ -11,7 +11,8 @@ __kernel void gather_I8toI8_array( __write_only image2d_t output, int block_size, int block_num, - int axis_num + int axis_num, + int is_array ) { int gidx = get_global_id(0); // block_size @@ -25,13 +26,29 @@ __kernel void gather_I8toI8_array( Image img1 = create_image_from_image2d(input0, 1); Image img2 = create_image_from_image2d(output, 1); - uchar* input_ptr = get_image_ptr_from_coord(img1, coord_in.zw); - __global vxc_char16* data_ptr = (__global vxc_char16*)input_ptr; - vxc_char16 src = data_ptr[0]; + int2 coord = (int2)(gidx, gidz * indices_num + gidy); + + uchar* input_ptr = 
get_image_ptr_from_coord(img1, coord_in.zw); uchar* output_ptr = get_image_ptr_from_coord(img2, coord); - __global vxc_char16* dst_ptr = (__global vxc_char16*)output_ptr; - dst_ptr[0] = src; + + if (gidx == ((block_size >> 4) * 16)) + { + __global char* data_ptr = (__global char*)input_ptr; + __global char* dst_ptr = (__global char*)output_ptr; + int i = 0; + for (i = 0; i < block_size - gidx; i ++) + { + dst_ptr[i] = data_ptr[i]; + } + } + else + { + __global vxc_char16* data_ptr = (__global vxc_char16*)input_ptr; + vxc_char16 src = data_ptr[0]; + __global vxc_char16* dst_ptr = (__global vxc_char16*)output_ptr; + dst_ptr[0] = src; + } } __kernel void gather_U8toU8_array( @@ -40,7 +57,8 @@ __kernel void gather_U8toU8_array( __write_only image2d_t output, int block_size, int block_num, - int axis_num + int axis_num, + int is_array ) { int gidx = get_global_id(0); // block_size @@ -54,13 +72,29 @@ __kernel void gather_U8toU8_array( Image img1 = create_image_from_image2d(input0, 1); Image img2 = create_image_from_image2d(output, 1); - uchar* input_ptr = get_image_ptr_from_coord(img1, coord_in.zw); - __global vxc_uchar16* data_ptr = (__global vxc_uchar16*)input_ptr; - vxc_uchar16 src = data_ptr[0]; + int2 coord = (int2)(gidx, gidz * indices_num + gidy); + + uchar* input_ptr = get_image_ptr_from_coord(img1, coord_in.zw); uchar* output_ptr = get_image_ptr_from_coord(img2, coord); - __global vxc_uchar16* dst_ptr = (__global vxc_uchar16*)output_ptr; - dst_ptr[0] = src; + + if (gidx == ((block_size >> 4) * 16)) + { + __global uchar* data_ptr = (__global uchar*)input_ptr; + __global uchar* dst_ptr = (__global uchar*)output_ptr; + int i = 0; + for (i = 0; i < block_size - gidx; i ++) + { + dst_ptr[i] = data_ptr[i]; + } + } + else + { + __global vxc_uchar16* data_ptr = (__global vxc_uchar16*)input_ptr; + vxc_uchar16 src = data_ptr[0]; + __global vxc_uchar16* dst_ptr = (__global vxc_uchar16*)output_ptr; + dst_ptr[0] = src; + } } __kernel void gather_I16toI16_array( @@ -69,7 +103,8 @@ __kernel void gather_I16toI16_array( __write_only image2d_t output, int block_size, int block_num, - int axis_num + int axis_num, + int is_array ) { int gidx = get_global_id(0); // block_size @@ -84,13 +119,29 @@ __kernel void gather_I16toI16_array( Image img1 = create_image_from_image2d(input0, 2); Image img2 = create_image_from_image2d(output, 2); - uchar* input_ptr = get_image_ptr_from_coord(img1, coord_in.zw); - __global vxc_short8* data_ptr = (__global vxc_short8*)input_ptr; - vxc_short8 src = data_ptr[0]; + int2 coord = (int2)(gidx, gidz * indices_num + gidy); + + uchar* input_ptr = get_image_ptr_from_coord(img1, coord_in.zw); uchar* output_ptr = get_image_ptr_from_coord(img2, coord); - __global vxc_short8* dst_ptr = (__global vxc_short8*)output_ptr; - dst_ptr[0] = src; + + if (gidx == ((block_size >> 3) * 8)) + { + __global short* data_ptr = (__global short*)input_ptr; + __global short* dst_ptr = (__global short*)output_ptr; + int i = 0; + for (i = 0; i < block_size - gidx; i ++) + { + dst_ptr[i] = data_ptr[i]; + } + } + else + { + __global vxc_short8* data_ptr = (__global vxc_short8*)input_ptr; + vxc_short8 src = data_ptr[0]; + __global vxc_short8* dst_ptr = (__global vxc_short8*)output_ptr; + dst_ptr[0] = src; + } } __kernel void gather_F16toF16_array( @@ -99,7 +150,8 @@ __kernel void gather_F16toF16_array( __write_only image2d_t output, int block_size, int block_num, - int axis_num + int axis_num, + int is_array ) { int gidx = get_global_id(0); // block_size @@ -114,13 +166,29 @@ __kernel void 
gather_F16toF16_array( Image img1 = create_image_from_image2d(input0, 2); Image img2 = create_image_from_image2d(output, 2); - uchar* input_ptr = get_image_ptr_from_coord(img1, coord_in.zw); - __global vxc_short8* data_ptr = (__global vxc_short8*)input_ptr; - vxc_short8 src = data_ptr[0]; + int2 coord = (int2)(gidx, gidz * indices_num + gidy); + + uchar* input_ptr = get_image_ptr_from_coord(img1, coord_in.zw); uchar* output_ptr = get_image_ptr_from_coord(img2, coord); - __global vxc_short8* dst_ptr = (__global vxc_short8*)output_ptr; - dst_ptr[0] = src; + + if (gidx == ((block_size >> 3) * 8)) + { + __global short* data_ptr = (__global short*)input_ptr; + __global short* dst_ptr = (__global short*)output_ptr; + int i = 0; + for (i = 0; i < block_size - gidx; i ++) + { + dst_ptr[i] = data_ptr[i]; + } + } + else + { + __global vxc_short8* data_ptr = (__global vxc_short8*)input_ptr; + vxc_short8 src = data_ptr[0]; + __global vxc_short8* dst_ptr = (__global vxc_short8*)output_ptr; + dst_ptr[0] = src; + } } #define GATHER_AXIS0_ARRAY(src0_type_name, read_type, data_type, write_type) \ @@ -130,7 +198,8 @@ __kernel void gather_##src0_type_name##to##src0_type_name##_axis0_array( \ __write_only image2d_t output, \ int block_size, \ int block_num, \ - int axis_num \ + int axis_num, \ + int is_array \ ) \ { \ int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_batch.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_batch.vx index 47f1db6..4bc39f0 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_batch.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_batch.vx @@ -10,7 +10,8 @@ __kernel void gather_batch_I8toI8( __write_only image2d_array_t output, int block_size, int block_num, - int axis_num + int axis_num, + int is_array ) { int gidx = get_global_id(0); // block_size @@ -41,7 +42,8 @@ __kernel void gather_batch_U8toU8( __write_only image2d_array_t output, int block_size, int block_num, - int axis_num + int axis_num, + int is_array ) { int gidx = get_global_id(0); // block_size @@ -72,7 +74,8 @@ __kernel void gather_batch_I16toI16( __write_only image2d_array_t output, int block_size, int block_num, - int axis_num + int axis_num, + int is_array ) { int gidx = get_global_id(0); // block_size @@ -103,7 +106,8 @@ __kernel void gather_batch_F16toF16( __write_only image2d_array_t output, int block_size, int block_num, - int axis_num + int axis_num, + int is_array ) { int gidx = get_global_id(0); // block_size @@ -134,7 +138,8 @@ __kernel void gather_batch_I8toI8_axis0( __write_only image2d_array_t output, int block_size, int block_num, - int axis_num + int axis_num, + int is_array ) { int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); @@ -163,7 +168,8 @@ __kernel void gather_batch_U8toU8_axis0( __write_only image2d_array_t output, int block_size, int block_num, - int axis_num + int axis_num, + int is_array ) { int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); @@ -192,7 +198,8 @@ __kernel void gather_batch_I16toI16_axis0( __write_only image2d_array_t output, int block_size, int block_num, - int axis_num + int axis_num, + int is_array ) { int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); @@ -221,7 +228,8 @@ __kernel void gather_batch_F16toF16_axis0( __write_only image2d_array_t output, int block_size, int block_num, - int axis_num + int axis_num, + int is_array ) { int4 coord = (int4)(get_global_id(0), get_global_id(1), 
get_global_id(2), 0); diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix.vx index 87825fd..bbe29e7 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix.vx @@ -15,7 +15,8 @@ __kernel void gather_##src0_type_name##toF16( \ __write_only image2d_t output, \ int block_size, \ int block_num, \ - int axis_num \ + int axis_num, \ + int is_array \ ) \ { \ int gidx = get_global_id(0); \ @@ -52,7 +53,8 @@ __kernel void gather_F16to##src1_type_name( \ __write_only image2d_t output, \ int block_size, \ int block_num, \ - int axis_num \ + int axis_num, \ + int is_array \ ) \ { \ int gidx = get_global_id(0); \ @@ -85,7 +87,8 @@ __kernel void gather_I16toF16( __write_only image2d_t output, int block_size, int block_num, - int axis_num + int axis_num, + int is_array ) { int gidx = get_global_id(0); @@ -120,7 +123,8 @@ __kernel void gather_##src0_type_name##toF16_axis0( \ __write_only image2d_t output, \ int block_size, \ int block_num, \ - int axis_num \ + int axis_num, \ + int is_array \ ) \ { \ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ @@ -152,7 +156,8 @@ __kernel void gather_F16to##src1_type_name##_axis0( \ __write_only image2d_t output, \ int block_size, \ int block_num, \ - int axis_num \ + int axis_num, \ + int is_array \ ) \ { \ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ @@ -184,7 +189,8 @@ __kernel void gather_I16toF16_axis0( __write_only image2d_t output, int block_size, int block_num, - int axis_num + int axis_num, + int is_array ) { int2 coord = (int2)(get_global_id(0), get_global_id(1)); diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix_batch.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix_batch.vx index 988c811..e68d0a1 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix_batch.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_mix_batch.vx @@ -16,7 +16,8 @@ __kernel void gather_batch_##src0_type_name##toF16( \ __write_only image2d_t output, \ int block_size, \ int block_num, \ - int axis_num \ + int axis_num, \ + int is_array \ ) \ { \ int gidx = get_global_id(0); \ @@ -63,7 +64,8 @@ __kernel void gather_batch_F16to##src1_type_name( \ __write_only image2d_t output, \ int block_size, \ int block_num, \ - int axis_num \ + int axis_num, \ + int is_array \ ) \ { \ int gidx = get_global_id(0); \ @@ -104,7 +106,8 @@ __kernel void gather_batch_I16toF16( __write_only image2d_t output, int block_size, int block_num, - int axis_num + int axis_num, + int is_array ) { int gidx = get_global_id(0); @@ -143,7 +146,8 @@ __kernel void gather_batch_##src0_type_name##toF16_axis0( \ __write_only image2d_t output, \ int block_size, \ int block_num, \ - int axis_num \ + int axis_num, \ + int is_array \ ) \ { \ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ @@ -178,7 +182,8 @@ __kernel void gather_batch_F16to##src1_type_name##_axis0( \ __write_only image2d_t output, \ int block_size, \ int block_num, \ - int axis_num \ + int axis_num, \ + int is_array \ ) \ { \ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ @@ -213,7 +218,8 @@ __kernel void gather_batch_I16toF16_axis0( __write_only image2d_t output, int block_size, int block_num, - int axis_num + int axis_num, + int is_array ) { int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/grucell_activation_z_h.vx 
b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_activation_z_h.vx index 6d117ed..00f6511 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/grucell_activation_z_h.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_activation_z_h.vx @@ -6,6 +6,9 @@ _viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; _viv_uniform VXC_512Bits uniExtractOddData_2x8; +_viv_uniform float output_scale1; +_viv_uniform float output_zp1; + float4 sigmoid_func(float4 x) { x *= -logE; @@ -117,13 +120,15 @@ __kernel void grucell_activation_z_h_##name0##_F16to##name1##_##act_name( \ VXC_DP4x4(h_tm, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \ h_tm = h_tm * hstate_in_scale + hstate_in_tail; \ float4 result = (1 - z) * h + z * h_tm; \ - result = result * output_scale + output_zp; \ - int4 dst0; \ - _viv_asm(CONV_RTE, dst0, result); \ + float4 out0 = result * output_scale + output_zp; \ + float4 out1 = result * output_scale1 + output_zp1; \ + int4 dst0, dst1; \ + _viv_asm(CONV_RTE, dst0, out0); \ + _viv_asm(CONV_RTE, dst1, out1); \ dst_type dst; \ - VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ - VXC_WriteImage(hstate_out, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(hstate_out, coord_in, dst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ } GRUCELL_QNT_F16TO_QNT(U8, U8, SIGMOID, sigmoid_func, vxc_uchar8, vxc_uchar8) GRUCELL_QNT_F16TO_QNT(I8, I8, SIGMOID, sigmoid_func, vxc_char8, vxc_char8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/grucell_reset_after_activation.vx b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_reset_after_activation.vx index dbd265a..854bc1e 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/grucell_reset_after_activation.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_reset_after_activation.vx @@ -25,6 +25,11 @@ float4 tanh_func(float4 x) x = 1.0f / x; return 2 * x - 1; } +float4 relu_func(float4 x) +{ + x = x > 0 ? 
x : 0; + return x; +} _viv_uniform VXC_512Bits uniF16PlusF16_0_4x4; _viv_uniform VXC_512Bits uniF16PlusF16_1_4x4; @@ -88,6 +93,8 @@ __kernel void grucell_reset_after_activation_F16_F16toF16_##act_name##_##rec_act } GRUCELL_F16_F16TOF16(TANH, tanh_func, SIGMOID, sigmoid_func) GRUCELL_F16_F16TOF16(SIGMOID, sigmoid_func, SIGMOID, sigmoid_func) +GRUCELL_F16_F16TOF16(RELU, relu_func, SIGMOID, sigmoid_func) + _viv_uniform float hstate_in_scale; _viv_uniform float hstate_in_tail; @@ -153,6 +160,10 @@ GRUCELL_QNT_F16TO_QNT(I16_F16toI16_TANH_SIGMOID, tanh_func, sigmoid_func, GRUCELL_QNT_F16TO_QNT(U8_F16toU8_SIGMOID_SIGMOID, sigmoid_func, sigmoid_func, vxc_uchar8, vxc_uchar8) GRUCELL_QNT_F16TO_QNT(I8_F16toI8_SIGMOID_SIGMOID, sigmoid_func, sigmoid_func, vxc_char8, vxc_char8) GRUCELL_QNT_F16TO_QNT(I16_F16toI16_SIGMOID_SIGMOID, sigmoid_func, sigmoid_func, vxc_short8, vxc_short8) +GRUCELL_QNT_F16TO_QNT(U8_F16toU8_RELU_SIGMOID, relu_func, sigmoid_func, vxc_uchar8, vxc_uchar8) +GRUCELL_QNT_F16TO_QNT(I8_F16toI8_RELU_SIGMOID, relu_func, sigmoid_func, vxc_char8, vxc_char8) +GRUCELL_QNT_F16TO_QNT(I16_F16toI16_RELU_SIGMOID, relu_func, sigmoid_func, vxc_short8, vxc_short8) + #define GRUCELL_BF16(act_name, act_func, rec_act_name, rec_act_func) \ __kernel void grucell_reset_after_activation_BF16_BF16toBF16_##act_name##_##rec_act_name( \ @@ -215,3 +226,4 @@ __kernel void grucell_reset_after_activation_BF16_BF16toBF16_##act_name##_##rec_ } GRUCELL_BF16(TANH, tanh_func, SIGMOID, sigmoid_func) GRUCELL_BF16(SIGMOID, sigmoid_func, SIGMOID, sigmoid_func) +GRUCELL_BF16(RELU, relu_func, SIGMOID, sigmoid_func) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_axis01_0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_axis01_0.vx new file mode 100644 index 0000000..9046891 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_axis01_0.vx @@ -0,0 +1,315 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniDataToFP32_0_4x4; +_viv_uniform VXC_512Bits uniDataToFP32_1_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform int height; +_viv_uniform uint group_num; +_viv_uniform float output_zp; +_viv_uniform float output_scale; +_viv_uniform float inv_multiplier; + + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_U8_F16toF16( + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, + image2d_array_t output, float eps) +{ + int gidz = get_global_id(1); + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); + int2 coord_sum = (int2)(0, gidz); + int4 coord_para = coord; + vxc_uchar16 src0; + vxc_short8 src1, outval; + vxc_half8 scale_h, dst; + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + vxc_float4 mean_vari = (vxc_float4)(0); + + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_sum); + coord_sum.x += 4; + } + mean_vari *= inv_multiplier; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + coord_para.z = 0; + coord_para.w = 0; + int4 coord_bias = coord_para; + + int8 input_desc, scale_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.z, baseAddr_a); + + _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc)); + int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0; + _viv_asm(MOV, coord_para.w, baseAddr_c); 
+ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord.z, baseAddr); + + vxc_float4 tmpData0, tmpData1, norm; + half4 tmpVal0, tmpVal1; + + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_OP4(img_load_3d, src0, input, coord_in, 0, + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_in.y ++; + coord_para.y = coord.y; coord_bias.y = coord.y; + VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, 0, + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + bias_f0 = read_imagef(bias, coord_bias); + coord_bias.x += 4; + bias_f1 = read_imagef(bias, coord_bias); + coord_bias.x = coord.x; + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_0_4x4); + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_1_4x4); + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_0_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_1_4x4); + tmpData0 = tmpData0 - mean_vari.s0; + tmpData1 = tmpData1 - mean_vari.s0; + + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0; + _viv_asm(CONV, tmpVal0, norm); + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1; + _viv_asm(CONV, tmpVal1, norm); + + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniExtract8Data_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_U8_F32toF16( + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, + image2d_array_t output, float eps) +{ + int gidz = get_global_id(1); + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); + int2 coord_sum = (int2)(0, gidz); + int4 coord_para = coord; + vxc_uchar16 src0; + vxc_short8 outval; + vxc_half8 scale_h, dst; + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + vxc_float4 mean_vari = (vxc_float4)(0); + + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_sum); + coord_sum.x += 4; + } + mean_vari *= inv_multiplier; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + coord_para.z = 0; + coord_para.w = 0; + int4 coord_bias = coord_para; + + int8 input_desc, scale_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.z, baseAddr_a); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord.z, baseAddr); + + vxc_float4 tmpData0, tmpData1, norm; + half4 tmpVal0, tmpVal1; + + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_OP4(img_load_3d, src0, input, coord_in, 0, + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_in.y ++; + coord_bias.y = coord.y; + + bias_f0 = read_imagef(bias, coord_bias); + scale_f0 = read_imagef(scale, coord_bias); + coord_bias.x += 4; + bias_f1 = read_imagef(bias, coord_bias); + scale_f1 = read_imagef(scale, coord_bias); + coord_bias.x = coord.x; + + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_0_4x4); + 
VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_1_4x4); + tmpData0 = tmpData0 - mean_vari.s0; + tmpData1 = tmpData1 - mean_vari.s0; + + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0; + _viv_asm(CONV, tmpVal0, norm); + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1; + _viv_asm(CONV, tmpVal1, norm); + + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniExtract8Data_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_U8_F16toU8( + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, + image2d_array_t output, float eps) +{ + int gidz = get_global_id(1); + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); + int2 coord_sum = (int2)(0, gidz); + int4 coord_para = coord; + vxc_uchar16 src0, outval; + vxc_short8 src1; + vxc_half8 scale_h; + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + vxc_float4 mean_vari = (vxc_float4)(0); + + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_sum); + coord_sum.x += 4; + } + mean_vari *= inv_multiplier; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + coord_para.z = 0; + coord_para.w = 0; + int4 coord_bias = coord_para; + + int8 input_desc, scale_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.z, baseAddr_a); + + _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc)); + int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0; + _viv_asm(MOV, coord_para.w, baseAddr_c); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord.z, baseAddr); + + vxc_float4 tmpData0, tmpData1, norm; + vxc_int4 tmpVal0, tmpVal1; + + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_OP4(img_load_3d, src0, input, coord_in, 0, + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_in.y ++; + coord_para.y = coord.y; coord_bias.y = coord.y; + VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, 0, + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + bias_f0 = read_imagef(bias, coord_bias); + coord_bias.x += 4; + bias_f1 = read_imagef(bias, coord_bias); + coord_bias.x = coord.x; + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_0_4x4); + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_1_4x4); + + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_0_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_1_4x4); + tmpData0 = tmpData0 - mean_vari.s0; + tmpData1 = tmpData1 - mean_vari.s0; + + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0; + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp); + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1; + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp); + + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniExtract8Data_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \ + 
VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + } +} +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_U8_F32toU8( + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, + image2d_array_t output, float eps) +{ + int gidz = get_global_id(1); + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); + int2 coord_sum = (int2)(0, gidz); + int4 coord_para = coord; + vxc_uchar16 src0 , outval; + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + vxc_float4 mean_vari = (vxc_float4)(0); + + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_sum); + coord_sum.x += 4; + } + mean_vari *= inv_multiplier; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + coord_para.z = 0; + coord_para.w = 0; + int4 coord_bias = coord_para; + + int8 input_desc, scale_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.z, baseAddr_a); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord.z, baseAddr); + + vxc_float4 tmpData0, tmpData1, norm; + vxc_int4 tmpVal0, tmpVal1; + + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_OP4(img_load_3d, src0, input, coord_in, 0, + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_in.y ++; + coord_bias.y = coord.y; + + bias_f0 = read_imagef(bias, coord_bias); + scale_f0 = read_imagef(scale, coord_bias); + coord_bias.x += 4; + bias_f1 = read_imagef(bias, coord_bias); + scale_f1 = read_imagef(scale, coord_bias); + coord_bias.x = coord.x; + + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_0_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_1_4x4); + tmpData0 = tmpData0 - mean_vari.s0; + tmpData1 = tmpData1 - mean_vari.s0; + + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0; + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp); + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1; + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp); + + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniExtract8Data_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + } +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_axis01_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_axis01_1.vx new file mode 100644 index 0000000..b247b2f --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_axis01_1.vx @@ -0,0 +1,317 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniDataToFP32_0_4x4; +_viv_uniform VXC_512Bits uniDataToFP32_1_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform int height; +_viv_uniform uint group_num; +_viv_uniform float output_zp; +_viv_uniform float output_scale; +_viv_uniform float inv_multiplier; + + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_I8_F16toF16( + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, + image2d_array_t output, float eps) +{ + int gidz = get_global_id(1); + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); + int4 coord_in = 
(int4)(get_global_id(0), 0, gidz, gidz); + int2 coord_sum = (int2)(0, gidz); + int4 coord_para = coord; + vxc_char16 src0; + vxc_short8 src1, outval; + vxc_half8 scale_h, dst; + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + vxc_float4 mean_vari = (vxc_float4)(0); + + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_sum); + coord_sum.x += 4; + } + mean_vari *= inv_multiplier; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + coord_para.z = 0; + coord_para.w = 0; + int4 coord_bias = coord_para; + + int8 input_desc, scale_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.z, baseAddr_a); + + _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc)); + int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0; + _viv_asm(MOV, coord_para.w, baseAddr_c); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord.z, baseAddr); + + vxc_float4 tmpData0, tmpData1, norm; + half4 tmpVal0, tmpVal1; + + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_OP4(img_load_3d, src0, input, coord_in, 0, + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_in.y ++; + coord_para.y = coord.y; coord_bias.y = coord.y; + VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, 0, + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + bias_f0 = read_imagef(bias, coord_bias); + coord_bias.x += 4; + bias_f1 = read_imagef(bias, coord_bias); + coord_bias.x = coord.x; + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_0_4x4); + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_1_4x4); + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_0_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_1_4x4); + tmpData0 = tmpData0 - mean_vari.s0; + tmpData1 = tmpData1 - mean_vari.s0; + + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0; + _viv_asm(CONV, tmpVal0, norm); + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1; + _viv_asm(CONV, tmpVal1, norm); + + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniExtract8Data_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_I8_F32toF16( + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, + image2d_array_t output, float eps) +{ + int gidz = get_global_id(1); + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); + int2 coord_sum = (int2)(0, gidz); + int4 coord_para = coord; + vxc_char16 src0; + vxc_short8 outval; + vxc_half8 scale_h, dst; + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + vxc_float4 mean_vari = (vxc_float4)(0); + + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_sum); + coord_sum.x += 4; + } + mean_vari *= inv_multiplier; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + coord_para.z = 0; + coord_para.w = 0; + int4 coord_bias = coord_para; + 
+ int8 input_desc, scale_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.z, baseAddr_a); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord.z, baseAddr); + + vxc_float4 tmpData0, tmpData1, norm; + half4 tmpVal0, tmpVal1; + + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_OP4(img_load_3d, src0, input, coord_in, 0, + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_in.y ++; + coord_bias.y = coord.y; + + bias_f0 = read_imagef(bias, coord_bias); + scale_f0 = read_imagef(scale, coord_bias); + coord_bias.x += 4; + bias_f1 = read_imagef(bias, coord_bias); + scale_f1 = read_imagef(scale, coord_bias); + coord_bias.x = coord.x; + + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_0_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_1_4x4); + tmpData0 = tmpData0 - mean_vari.s0; + tmpData1 = tmpData1 - mean_vari.s0; + + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0; + _viv_asm(CONV, tmpVal0, norm); + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1; + _viv_asm(CONV, tmpVal1, norm); + + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniExtract8Data_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_I8_F16toU8( + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, + image2d_array_t output, float eps) +{ + int gidz = get_global_id(1); + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); + int2 coord_sum = (int2)(0, gidz); + int4 coord_para = coord; + vxc_char16 src0; + vxc_uchar16 outval; + vxc_short8 src1; + vxc_half8 scale_h; + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + vxc_float4 mean_vari = (vxc_float4)(0); + + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_sum); + coord_sum.x += 4; + } + mean_vari *= inv_multiplier; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + coord_para.z = 0; + coord_para.w = 0; + int4 coord_bias = coord_para; + + int8 input_desc, scale_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.z, baseAddr_a); + + _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc)); + int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0; + _viv_asm(MOV, coord_para.w, baseAddr_c); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord.z, baseAddr); + + vxc_float4 tmpData0, tmpData1, norm; + vxc_int4 tmpVal0, tmpVal1; + + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_OP4(img_load_3d, src0, input, coord_in, 0, + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_in.y ++; + coord_para.y = coord.y; coord_bias.y = coord.y; + VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, 0, + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + bias_f0 = read_imagef(bias, coord_bias); + coord_bias.x += 4; + bias_f1 = read_imagef(bias, coord_bias); + 
coord_bias.x = coord.x; + + _viv_asm(COPY, scale_h, src1, 16); + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_0_4x4); + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_1_4x4); + + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_0_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_1_4x4); + tmpData0 = tmpData0 - mean_vari.s0; + tmpData1 = tmpData1 - mean_vari.s0; + + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0; + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp); + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1; + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp); + + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniExtract8Data_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + } +} +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_I8_F32toU8( + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, + image2d_array_t output, float eps) +{ + int gidz = get_global_id(1); + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); + int2 coord_sum = (int2)(0, gidz); + int4 coord_para = coord; + vxc_char16 src0; + vxc_uchar16 outval; + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; + vxc_float4 mean_vari = (vxc_float4)(0); + + for(int i = 0; i < group_num; i++) + { + mean_vari += read_imagef(meanVari, coord_sum); + coord_sum.x += 4; + } + mean_vari *= inv_multiplier; + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; + mean_vari.s1 = rsqrt(mean_vari.s1); + coord_para.z = 0; + coord_para.w = 0; + int4 coord_bias = coord_para; + + int8 input_desc, scale_desc, output_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.z, baseAddr_a); + + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord.z, baseAddr); + + vxc_float4 tmpData0, tmpData1, norm; + vxc_int4 tmpVal0, tmpVal1; + + for(coord.y = 0; coord.y < height; coord.y++) + { + VXC_OP4(img_load_3d, src0, input, coord_in, 0, + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_in.y ++; + coord_bias.y = coord.y; + + bias_f0 = read_imagef(bias, coord_bias); + scale_f0 = read_imagef(scale, coord_bias); + coord_bias.x += 4; + bias_f1 = read_imagef(bias, coord_bias); + scale_f1 = read_imagef(scale, coord_bias); + coord_bias.x = coord.x; + + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_0_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_1_4x4); + tmpData0 = tmpData0 - mean_vari.s0; + tmpData1 = tmpData1 - mean_vari.s0; + + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0; + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp); + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1; + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp); + + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniExtract8Data_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + } +} \ No newline at end of file 
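The four layernorm_axis01_I8_* kernels above (and the U8 variants in layer_normalization_axis01_0.vx) share one computation: fold the per-group partial sums stored in meanVari into a mean and an inverse standard deviation, normalize each element, apply the per-channel scale and bias, and, for quantized outputs, requantize with output_scale/output_zp before the saturating VXC_DP2x8 pack. The scalar C sketch below is illustrative only and is not part of the patch; the function and parameter names (layernorm_axis01_ref, partial_sums, the element count n) are hypothetical stand-ins for the image-based inputs the kernels actually use.

#include <math.h>
#include <stddef.h>
#include <stdint.h>

/* Illustrative scalar model of the layernorm_axis01 math (not from the patch).
 * partial_sums holds {sum, sum_of_squares} pairs produced by an earlier pass
 * (the role played by the meanVari image); inv_multiplier is 1 / element_count. */
static void layernorm_axis01_ref(const float *x, size_t n,
                                 const float *partial_sums, size_t group_num,
                                 float inv_multiplier, float eps,
                                 const float *scale, const float *bias,
                                 float output_scale, float output_zp,
                                 uint8_t *out_u8)
{
    /* Stage 1: fold per-group partials into mean and 1/sqrt(var + eps). */
    float sum = 0.0f, sqr = 0.0f;
    for (size_t g = 0; g < group_num; ++g) {
        sum += partial_sums[2 * g + 0];
        sqr += partial_sums[2 * g + 1];
    }
    float mean    = sum * inv_multiplier;
    float var     = sqr * inv_multiplier - mean * mean + eps;
    float inv_std = 1.0f / sqrtf(var);           /* rsqrt(mean_vari.s1) in the kernel */

    /* Stage 2: normalize, apply scale/bias, requantize with saturation. */
    for (size_t i = 0; i < n; ++i) {
        float norm = scale[i] * inv_std * (x[i] - mean) + bias[i];
        long  q    = lrintf(norm * output_scale + output_zp); /* convert_int4_rte(...) */
        if (q < 0)   q = 0;                      /* saturating pack handled by the */
        if (q > 255) q = 255;                    /* VXC_DP2x8 extract modifier     */
        out_u8[i] = (uint8_t)q;
    }
}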
diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_axis01_2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_axis01_2.vx new file mode 100644 index 0000000..4d35266 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_axis01_2.vx @@ -0,0 +1,348 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniDataToFP32_0_4x4; +_viv_uniform VXC_512Bits uniDataToFP32_1_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform int height; +_viv_uniform uint group_num; +_viv_uniform float output_zp; +_viv_uniform float output_scale; +_viv_uniform float inv_multiplier; + +#define LAYER_NORM_AXIS01_F16_F16to16Bits(name,temp_type,dst_type,conv_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_F16_F16to##name( \ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, \ + image2d_array_t output, float eps) \ +{ \ + int gidz = get_global_id(1); \ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); \ + int2 coord_sum = (int2)(0, gidz); \ + int4 coord_para = coord; \ + vxc_short8 src0; \ + vxc_short8 src1; \ + vxc_half8 scale_h, in_h; \ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; \ + vxc_float4 mean_vari = (vxc_float4)(0); \ + \ + for(int i = 0; i < group_num; i++) \ + { \ + mean_vari += read_imagef(meanVari, coord_sum); \ + coord_sum.x += 4; \ + } \ + mean_vari *= inv_multiplier; \ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \ + mean_vari.s1 = rsqrt(mean_vari.s1); \ + \ + coord_para.z = 0; \ + coord_para.w = 0; \ + int4 coord_bias = coord_para; \ + \ + int8 input_desc, scale_desc, output_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord_in.z, baseAddr_a); \ + \ + _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc)); \ + int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0; \ + _viv_asm(MOV, coord_para.w, baseAddr_c); \ + \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord.z, baseAddr); \ + \ + vxc_float4 tmpData0, tmpData1; \ + vxc_short8 outval; \ + temp_type tmpVal0, tmpVal1; \ + dst_type dst; \ + \ + for(coord.y = 0; coord.y < height; coord.y++) \ + { \ + VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_in.y ++; \ + coord_para.y = coord.y; \ + coord_bias.y = coord.y; \ + VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + bias_f0 = read_imagef(bias, coord_bias); \ + coord_bias.x += 4; \ + bias_f1 = read_imagef(bias, coord_bias); \ + coord_bias.x = coord.x; \ + \ + _viv_asm(COPY, in_h, src0, 16); \ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_0_4x4); \ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_1_4x4); \ + \ + _viv_asm(COPY, scale_h, src1, 16); \ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_0_4x4); \ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_1_4x4); \ + \ + vxc_float4 sub, norm; \ + sub = tmpData0 - mean_vari.s0; \ + norm = scale_f0 * mean_vari.s1 * sub + bias_f0; \ + norm 
= norm * output_scale + output_zp; \ + _viv_asm(conv_type, tmpVal0, norm); \ + sub = tmpData1 - mean_vari.s0; \ + norm = scale_f1 * mean_vari.s1 * sub + bias_f1; \ + norm = norm * output_scale + output_zp; \ + _viv_asm(conv_type, tmpVal1, norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniExtract8Data_2x8); \ + _viv_asm(COPY, outval, dst, 16); \ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + } \ +} +LAYER_NORM_AXIS01_F16_F16to16Bits(F16,half4,vxc_half8,CONV) +LAYER_NORM_AXIS01_F16_F16to16Bits(I16,int4,vxc_short8,CONV_RTE) + + +#define LAYER_NORM_AXIS01_F16_F32to16Bits(name,temp_type,dst_type,conv_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_F16_F32to##name( \ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, \ + image2d_array_t output, float eps) \ +{ \ + int gidz = get_global_id(1); \ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); \ + int2 coord_sum = (int2)(0, gidz); \ + int4 coord_para = coord; \ + vxc_short8 src0; \ + vxc_half8 in_h; \ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; \ + vxc_float4 mean_vari = (vxc_float4)(0); \ + \ + for(int i = 0; i < group_num; i++) \ + { \ + mean_vari += read_imagef(meanVari, coord_sum); \ + coord_sum.x += 4; \ + } \ + mean_vari *= inv_multiplier; \ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \ + mean_vari.s1 = rsqrt(mean_vari.s1); \ + \ + coord_para.z = 0; \ + coord_para.w = 0; \ + int4 coord_bias = coord_para; \ + \ + int8 input_desc, scale_desc, output_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord_in.z, baseAddr_a); \ + \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord.z, baseAddr); \ + \ + vxc_float4 tmpData0, tmpData1; \ + vxc_short8 outval; \ + temp_type tmpVal0, tmpVal1; \ + dst_type dst; \ + \ + for(coord.y = 0; coord.y < height; coord.y++) \ + { \ + VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_in.y ++; \ + coord_bias.y = coord.y; \ + bias_f0 = read_imagef(bias, coord_bias); \ + scale_f0 = read_imagef(scale, coord_bias); \ + coord_bias.x += 4; \ + bias_f1 = read_imagef(bias, coord_bias); \ + scale_f1 = read_imagef(scale, coord_bias); \ + coord_bias.x = coord.x; \ + \ + _viv_asm(COPY, in_h, src0, 16); \ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_0_4x4); \ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_1_4x4); \ + \ + vxc_float4 sub, norm; \ + sub = tmpData0 - mean_vari.s0; \ + norm = scale_f0 * mean_vari.s1 * sub + bias_f0; \ + norm = norm * output_scale + output_zp; \ + _viv_asm(conv_type, tmpVal0, norm); \ + sub = tmpData1 - mean_vari.s0; \ + norm = scale_f1 * mean_vari.s1 * sub + bias_f1; \ + norm = norm * output_scale + output_zp; \ + _viv_asm(conv_type, tmpVal1, norm); \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniExtract8Data_2x8); \ + _viv_asm(COPY, outval, dst, 16); \ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + } \ +} 
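Each kernel in these new layer-normalization sources, quantized or not, opens with the same preamble: copy the image2d_array_t handle into an int8 descriptor with _viv_asm(COPY, ...), compute baseAddr = coord.z * desc.s4 + desc.s0, and move that into the coordinate's plane component before the img_load_3d/img_store_3d loop. Read literally, this pre-resolves the base address of the z-th plane so the per-row loop only has to bump coord_in.y. The tiny C sketch below mirrors just that address arithmetic; treating s0 as the descriptor's base field and s4 as its per-plane stride is an assumption drawn from the expression itself, not from documented ovxlib internals.

#include <stdint.h>

/* Hypothetical mirror of the kernel preamble (assumption: desc_s0 corresponds to
 * the descriptor base field input_desc.s0, desc_s4 to the per-plane stride
 * input_desc.s4). */
static int32_t resolve_plane_base(int32_t desc_s0, int32_t desc_s4, int32_t z)
{
    /* baseAddr = (int)coord.z * input_desc.s4 + input_desc.s0 */
    return z * desc_s4 + desc_s0;
}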
+LAYER_NORM_AXIS01_F16_F32to16Bits(F16,half4,vxc_half8,CONV) +LAYER_NORM_AXIS01_F16_F32to16Bits(I16,int4,vxc_short8,CONV_RTE) + +#define LAYER_NORM_AXIS01_F16_F16toQUANT(name,dst_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_F16_F16to##name( \ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, \ + image2d_array_t output, float eps) \ +{ \ + int gidz = get_global_id(1); \ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); \ + int2 coord_sum = (int2)(0, gidz); \ + int4 coord_para = coord; \ + vxc_short8 src0; \ + vxc_short8 src1; \ + vxc_half8 scale_h, in_h; \ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; \ + vxc_float4 mean_vari = (vxc_float4)(0); \ + \ + for(int i = 0; i < group_num; i++) \ + { \ + mean_vari += read_imagef(meanVari, coord_sum); \ + coord_sum.x += 4; \ + } \ + mean_vari *= inv_multiplier; \ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \ + mean_vari.s1 = rsqrt(mean_vari.s1); \ + \ + coord_para.z = 0; \ + coord_para.w = 0; \ + int4 coord_bias = coord_para; \ + \ + int8 input_desc, scale_desc, output_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord_in.z, baseAddr_a); \ + \ + _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc)); \ + int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0; \ + _viv_asm(MOV, coord_para.w, baseAddr_c); \ + \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord.z, baseAddr); \ + \ + vxc_float4 tmpData0, tmpData1; \ + dst_type outval; \ + vxc_int4 tmpVal0, tmpVal1; \ + \ + for(coord.y = 0; coord.y < height; coord.y++) \ + { \ + VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_in.y ++; \ + coord_para.y = coord.y; \ + coord_bias.y = coord.y; \ + VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + bias_f0 = read_imagef(bias, coord_bias); \ + coord_bias.x += 4; \ + bias_f1 = read_imagef(bias, coord_bias); \ + coord_bias.x = coord.x; \ + \ + _viv_asm(COPY, in_h, src0, 16); \ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_0_4x4); \ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_1_4x4); \ + \ + _viv_asm(COPY, scale_h, src1, 16); \ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_0_4x4); \ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_1_4x4); \ + \ + vxc_float4 sub, norm; \ + sub = tmpData0 - mean_vari.s0; \ + norm = scale_f0 * mean_vari.s1 * sub + bias_f0; \ + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp); \ + sub = tmpData1 - mean_vari.s0; \ + norm = scale_f1 * mean_vari.s1 * sub + bias_f1; \ + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp); \ + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniExtract8Data_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + } \ +} +LAYER_NORM_AXIS01_F16_F16toQUANT(U8,vxc_uchar16) +LAYER_NORM_AXIS01_F16_F16toQUANT(I8,vxc_char16) + +#define 
LAYER_NORM_AXIS01_F16_F32toQUANT(name,dst_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_F16_F32to##name( \ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, \ + image2d_array_t output, float eps) \ +{ \ + int gidz = get_global_id(1); \ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); \ + int2 coord_sum = (int2)(0, gidz); \ + int4 coord_para = coord; \ + vxc_short8 src0; \ + vxc_half8 in_h; \ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; \ + vxc_float4 mean_vari = (vxc_float4)(0); \ + \ + for(int i = 0; i < group_num; i++) \ + { \ + mean_vari += read_imagef(meanVari, coord_sum); \ + coord_sum.x += 4; \ + } \ + mean_vari *= inv_multiplier; \ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \ + mean_vari.s1 = rsqrt(mean_vari.s1); \ + \ + coord_para.z = 0; \ + coord_para.w = 0; \ + int4 coord_bias = coord_para; \ + \ + int8 input_desc, scale_desc, output_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord_in.z, baseAddr_a); \ + \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord.z, baseAddr); \ + \ + vxc_float4 tmpData0, tmpData1; \ + dst_type outval; \ + vxc_int4 tmpVal0, tmpVal1; \ + \ + for(coord.y = 0; coord.y < height; coord.y++) \ + { \ + VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_in.y ++; \ + coord_bias.y = coord.y; \ + \ + bias_f0 = read_imagef(bias, coord_bias); \ + scale_f0 = read_imagef(scale, coord_bias); \ + coord_bias.x += 4; \ + bias_f1 = read_imagef(bias, coord_bias); \ + scale_f1 = read_imagef(scale, coord_bias); \ + coord_bias.x = coord.x; \ + \ + _viv_asm(COPY, in_h, src0, 16); \ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_0_4x4); \ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_1_4x4); \ + \ + vxc_float4 sub, norm; \ + sub = tmpData0 - mean_vari.s0; \ + norm = scale_f0 * mean_vari.s1 * sub + bias_f0; \ + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp); \ + sub = tmpData1 - mean_vari.s0; \ + norm = scale_f1 * mean_vari.s1 * sub + bias_f1; \ + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp); \ + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniExtract8Data_2x8); \ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + } \ +} +LAYER_NORM_AXIS01_F16_F32toQUANT(U8,vxc_uchar16) +LAYER_NORM_AXIS01_F16_F32toQUANT(I8,vxc_char16) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_axis01_3.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_axis01_3.vx new file mode 100644 index 0000000..ae812a2 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_axis01_3.vx @@ -0,0 +1,178 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniDataToFP32_0_4x4; +_viv_uniform VXC_512Bits uniDataToFP32_1_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform int height; +_viv_uniform uint group_num; +_viv_uniform float output_zp; +_viv_uniform float output_scale; +_viv_uniform float inv_multiplier; + +#define 
LAYER_NORM_AXIS01_I16_F16to16Bits(name,temp_type,dst_type,conv_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_I16_F16to##name( \ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, \ + image2d_array_t output, float eps) \ +{ \ + int gidz = get_global_id(1); \ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); \ + int2 coord_sum = (int2)(0, gidz); \ + int4 coord_para = coord; \ + vxc_short8 src0, src1; \ + vxc_half8 scale_h; \ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; \ + vxc_float4 mean_vari = (vxc_float4)(0); \ + \ + for(int i = 0; i < group_num; i++) \ + { \ + mean_vari += read_imagef(meanVari, coord_sum); \ + coord_sum.x += 4; \ + } \ + mean_vari *= inv_multiplier; \ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \ + mean_vari.s1 = rsqrt(mean_vari.s1); \ + \ + coord_para.z = 0; \ + coord_para.w = 0; \ + int4 coord_bias = coord_para; \ + \ + int8 input_desc, scale_desc, output_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord_in.z, baseAddr_a); \ + \ + _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc)); \ + int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0; \ + _viv_asm(MOV, coord_para.w, baseAddr_c); \ + \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord.z, baseAddr); \ + \ + vxc_float4 tmpData0, tmpData1, norm; \ + temp_type tmpVal0, tmpVal1; \ + vxc_short8 outval; \ + dst_type dst; \ + \ + for(coord.y = 0; coord.y < height; coord.y++) \ + { \ + VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_in.y ++; \ + coord_para.y = coord.y; \ + coord_bias.y = coord.y; \ + VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + bias_f0 = read_imagef(bias, coord_bias); \ + coord_bias.x += 4; \ + bias_f1 = read_imagef(bias, coord_bias); \ + coord_bias.x = coord.x; \ + \ + _viv_asm(COPY, scale_h, src1, 16); \ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_0_4x4); \ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_1_4x4); \ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_0_4x4); \ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_1_4x4); \ + tmpData0 = tmpData0 - mean_vari.s0; \ + tmpData1 = tmpData1 - mean_vari.s0; \ + \ + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0; \ + norm = norm * output_scale + output_zp; \ + _viv_asm(conv_type, tmpVal0, norm); \ + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1; \ + norm = norm * output_scale + output_zp; \ + _viv_asm(conv_type, tmpVal1, norm); \ + \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniExtract8Data_2x8); \ + _viv_asm(COPY, outval, dst, 16); \ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + } \ +} +LAYER_NORM_AXIS01_I16_F16to16Bits(F16,half4,vxc_half8,CONV) +LAYER_NORM_AXIS01_I16_F16to16Bits(I16,int4,vxc_short8,CONV_RTE) + + +#define 
LAYER_NORM_AXIS01_I16_F32to16Bits(name,temp_type,dst_type,conv_type) \ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_I16_F32to##name( \ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, \ + image2d_array_t output, float eps) \ +{ \ + int gidz = get_global_id(1); \ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); \ + int2 coord_sum = (int2)(0, gidz); \ + int4 coord_para = coord; \ + vxc_short8 src0; \ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; \ + vxc_float4 mean_vari = (vxc_float4)(0); \ + \ + for(int i = 0; i < group_num; i++) \ + { \ + mean_vari += read_imagef(meanVari, coord_sum); \ + coord_sum.x += 4; \ + } \ + mean_vari *= inv_multiplier; \ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \ + mean_vari.s1 = rsqrt(mean_vari.s1); \ + \ + coord_para.z = 0; \ + coord_para.w = 0; \ + int4 coord_bias = coord_para; \ + \ + int8 input_desc, scale_desc, output_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord_in.z, baseAddr_a); \ + \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord.z, baseAddr); \ + \ + vxc_float4 tmpData0, tmpData1, norm; \ + temp_type tmpVal0, tmpVal1; \ + vxc_short8 outval; \ + dst_type dst; \ + \ + for(coord.y = 0; coord.y < height; coord.y++) \ + { \ + VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord_in.y ++; \ + coord_bias.y = coord.y; \ + bias_f0 = read_imagef(bias, coord_bias); \ + scale_f0 = read_imagef(scale, coord_bias); \ + coord_bias.x += 4; \ + bias_f1 = read_imagef(bias, coord_bias); \ + scale_f1 = read_imagef(scale, coord_bias); \ + coord_bias.x = coord.x; \ + \ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_0_4x4); \ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniDataToFP32_1_4x4); \ + tmpData0 = tmpData0 - mean_vari.s0; \ + tmpData1 = tmpData1 - mean_vari.s0; \ + \ + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0; \ + norm = norm * output_scale + output_zp; \ + _viv_asm(conv_type, tmpVal0, norm); \ + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1; \ + norm = norm * output_scale + output_zp; \ + _viv_asm(conv_type, tmpVal1, norm); \ + \ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniExtract8Data_2x8); \ + _viv_asm(COPY, outval, dst, 16); \ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ + } \ +} +LAYER_NORM_AXIS01_I16_F32to16Bits(F16,half4,vxc_half8,CONV) +LAYER_NORM_AXIS01_I16_F32to16Bits(I16,int4,vxc_short8,CONV_RTE) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_axis01_sum.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_axis01_sum.vx new file mode 100644 index 0000000..9e87880 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_axis01_sum.vx @@ -0,0 +1,228 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniSumX_16x1; +_viv_uniform VXC_512Bits uniSumX2_16x1; +_viv_uniform VXC_512Bits uniSum_X_X2_8x2; +_viv_uniform int width; +_viv_uniform int height; + + + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void 
layernorm_axis01_sums_F16toF32( + image2d_array_t input, image2d_t output) +{ + int gidx = get_global_id(0) << 3; + int lidx = get_local_id(0); + int gidz = get_global_id(1); + int4 coord = (int4)(gidx, 0, gidz, 0); + vxc_short8 src0; + vxc_half8 in_h; + vxc_float4 sumsqr; + vxc_float4 tmpSumSqr = (vxc_float4)(0); + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.w, baseAddr_a); + + if(gidx < width) + { + for(coord.y = 0; coord.y < height;) + { + VXC_OP4(img_load_3d, src0, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.y++; + _viv_asm(COPY, in_h, src0, 16); + VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniSum_X_X2_8x2); + tmpSumSqr += sumsqr; + } + } + + lcl_sum[lidx] = tmpSumSqr.x; + lcl_sqr[lidx] = tmpSumSqr.y; + barrier(CLK_LOCAL_MEM_FENCE); + + int2 coord_out = (int2)(get_group_id(0) << 2, gidz); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + + float sum = 0; + float sqr = 0; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + + float4 data = (float4)(sum, sqr, 0, 0); + write_imagef(output, coord_out, data); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_sums_I16toF32( + image2d_array_t input, image2d_t output) +{ + int gidx = get_global_id(0) << 3; + int lidx = get_local_id(0); + int gidz = get_global_id(1); + int4 coord = (int4)(gidx, 0, gidz, 0); + vxc_short8 src0; + float4 tmpSumSqr = (float4)(0); + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.w, baseAddr_a); + + if(gidx < width) + { + for(coord.y = 0; coord.y < height;) + { + VXC_OP4(img_load_3d, src0, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.y++; + vxc_float4 sumsqr; + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\ + uniSum_X_X2_8x2); + tmpSumSqr += sumsqr; + } + } + lcl_sum[lidx] = tmpSumSqr.x; + lcl_sqr[lidx] = tmpSumSqr.y; + barrier(CLK_LOCAL_MEM_FENCE); + + int2 coord_out = (int2)(get_group_id(0) << 2, gidz); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + float4 data = (float4)(0); + for(int i = 0; i < 4; i++) + { + data.x += dot(tmp_sum[i], one); + data.y += dot(tmp_sqr[i], one); + } + write_imagef(output, coord_out, data); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_sums_U8toF32( + image2d_array_t input, image2d_t output) +{ + int gidx = get_global_id(0) << 4; + int lidx = get_local_id(0); + int gidz = get_global_id(1); + int4 coord = (int4)(gidx, 0, gidz, 0); + vxc_uchar16 src0; + float sum = 0, sqr = 0; + int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.w, baseAddr_a); + + if(gidx < width) + { + for(coord.y = 0; 
coord.y < height;) + { + VXC_OP4(img_load_3d, src0, input, coord.xywz, 0, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord.y++; + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX_16x1); + tmpSum += (tmpSum1); + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX2_16x1); + tmpSqr += (tmpSqr1); + } + sqr += convert_float(tmpSqr); + sum = convert_float(tmpSum); + } + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int2 coord_out = (int2)(get_group_id(0) << 2, gidz); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + float4 data = (float4)(sum, sqr, 0, 0); + write_imagef(output, coord_out, data); + } +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_sums_I8toF32( + image2d_array_t input, image2d_t output) +{ + int gidx = get_global_id(0) << 4; + int lidx = get_local_id(0); + int gidz = get_global_id(1); + int4 coord = (int4)(gidx, 0, gidz, 0); + vxc_char16 src0; + float sum = 0, sqr = 0; + int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1; + + __local float lcl_sum[16]; + __local float lcl_sqr[16]; + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord.w, baseAddr_a); + + if(gidx < width) + { + for(coord.y = 0; coord.y < height;) + { + VXC_OP4(img_load_3d, src0, input, coord.xywz, 0, + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + coord.y++; + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX_16x1); + tmpSum += (tmpSum1); + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX2_16x1); + tmpSqr += (tmpSqr1); + } + sqr += convert_float(tmpSqr); + sum = convert_float(tmpSum); + } + lcl_sum[lidx] = sum; + lcl_sqr[lidx] = sqr; + barrier(CLK_LOCAL_MEM_FENCE); + + int2 coord_out = (int2)(get_group_id(0) << 2, gidz); + if(lidx == 0) + { + float4 one = (float4)(1, 1, 1, 1); + __local float4* tmp_sum = (__local float4*)lcl_sum; + __local float4* tmp_sqr = (__local float4*)lcl_sqr; + sum = 0; sqr = 0; + for(int i = 0; i < 4; i++) + { + sum += dot(tmp_sum[i], one); + sqr += dot(tmp_sqr[i], one); + } + float4 data = (float4)(sum, sqr, 0, 0); + write_imagef(output, coord_out, data); + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/log_softmax_exceed_axis0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/log_softmax_exceed_axis0.vx new file mode 100644 index 0000000..0f673cb --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/log_softmax_exceed_axis0.vx @@ -0,0 +1,190 @@ +#include "cl_viv_vx_ext.h" +_viv_uniform float rlogE; +_viv_uniform int axisSize; +_viv_uniform float betaValue; +_viv_uniform float scaleLogE; +_viv_uniform float outputScale; +_viv_uniform float output_offset_asymmetric; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform int height; +_viv_uniform int inputWidth; +_viv_uniform int inputWidthRemain4; +_viv_uniform VXC_512Bits uniGetSubData0to3_4x4; +_viv_uniform VXC_512Bits uniGetSubData4to7_4x4; +_viv_uniform VXC_512Bits uniPackMaxData_2x8; + +#define LOGSOFTMAX_PROCESS_EXCEED_AXIS0(read_fun, vert_max_fun, horz_max_fun) \ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(-16, 0), VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, val, val0, 16); \ + for (coord.y = 0; coord.y < height; coord.y++) \ + { \ + for (coord.x = 16;coord.x < (axisSize + 16);coord.x+=32) \ + { \ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(-16, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, img_val0, val0, 16); \ + read_fun(val1, input, coord, VXC_5BITOFFSET_XY(-8, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, img_val1, val1, 16); \ + read_fun(val2, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, img_val2, val2, 16); \ + read_fun(val3, input, coord, VXC_5BITOFFSET_XY(8, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, img_val3, val3, 16); \ + vert_max_fun(val, img_val0, img_val1, val, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + vert_max_fun(val, img_val2, img_val3, val, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ + } \ + horz_max_fun(val, val, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(val, val, val, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniPackMaxData_2x8); \ + horz_max_fun(val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + \ + vxc_float4 prob; \ + float fProbSum = 0; \ + const float4 one4 = (float4)(1.0, 1.0, 1.0, 1.0); \ + for (coord.y = 0; coord.y < height; coord.y++) \ + { \ + for (coord.x = 0;coord.x < inputWidth;coord.x+=4) \ + { \ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, img_val0, val0, 16); \ + VXC_DP4x4(prob, img_val0, val,\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubData0to3_4x4); \ + prob *= scaleLogE; \ + prob = exp2(prob); \ + fProbSum += dot(prob, one4); \ + } \ + } \ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, img_val0, val0, 16); \ + VXC_DP4x4(prob, img_val0, val, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubData0to3_4x4); \ + prob *= scaleLogE; \ + if(inputWidthRemain4 == 1) \ + { \ + prob.x = exp2(prob.x); \ + prob.yzw = 0; \ + fProbSum += dot(prob, one4); \ + } \ + else if(inputWidthRemain4 == 2) \ + { \ + prob.x = exp2(prob.x); \ + prob.y = exp2(prob.y); \ + prob.zw = 0; \ + fProbSum += dot(prob, one4); \ + } \ + else if(inputWidthRemain4 == 3) \ + { \ + prob.x = exp2(prob.x); \ + prob.y = exp2(prob.y); \ + prob.z = exp2(prob.z); \ + prob.w = 0; \ + fProbSum += dot(prob, one4); \ + } \ + vxc_float4 probSum_log; \ + probSum_log.x = log2(fProbSum) * rlogE; + +#define LOGSOFTMAX_PROCESS_EXCEED_AXIS0_SAVE(dst_type, \ + save_type, conv_mode, OUT_SCALE, OUT_OFFSET, read_fun, write_fun) \ + for (coord.y = 0; coord.y < height; coord.y++) \ + { \ + for (coord.x = 0; coord.x < axisSize; coord.x += 8) \ + { \ + dst_type vec0, vec1; \ + save_type dst; \ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, img_val0, val0, 16); \ + VXC_DP4x4(prob, img_val0, val, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubData0to3_4x4); \ + prob = prob * betaValue - probSum_log.xxxx; \ + prob = prob * OUT_SCALE + OUT_OFFSET; \ + _viv_asm(conv_mode, vec0, prob); \ + VXC_DP4x4(prob, img_val0, val, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubData4to7_4x4); \ + prob = prob * betaValue - probSum_log.xxxx; \ + prob = prob * OUT_SCALE + OUT_OFFSET; \ + _viv_asm(conv_mode, vec1, prob); \ + VXC_DP2x8(dst, vec0, vec1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); \ + write_fun(output, coord, dst, VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 0)); \ + } \ + } + +#define LOGSOFTMAX_EXCEED_AXIS0(src_name, dst_name, src_type, copy_type, dst_type,\ + save_type, conv_mode, OUT_SCALE, OUT_OFFSET, vert_max_fun, horz_max_fun) \ +__kernel void log_softmax_exceed_axis0_##src_name##to##dst_name \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float input_Scale, \ + int axisVal \ + ) \ +{ \ + int4 coord = (int4)(16, 0, get_global_id(1), 0); \ + src_type img_val0, img_val1, img_val2, img_val3; \ + copy_type val0, val1, val2, val3; \ + src_type val; \ + LOGSOFTMAX_PROCESS_EXCEED_AXIS0(VXC_ReadImage2DArray, vert_max_fun, horz_max_fun) \ + LOGSOFTMAX_PROCESS_EXCEED_AXIS0_SAVE(dst_type, save_type, conv_mode,\ + OUT_SCALE, OUT_OFFSET, VXC_ReadImage2DArray, VXC_WriteImage2DArray); \ +} + +LOGSOFTMAX_EXCEED_AXIS0(F16, F16, vxc_half8, vxc_short8, half4, vxc_short8,\ +CONV, 1, 0, VXC_VertMax3_Half, VXC_HorzMax3_Half) +LOGSOFTMAX_EXCEED_AXIS0(F16, I16, vxc_half8, vxc_short8, short4, vxc_short8,\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Half, VXC_HorzMax3_Half) +LOGSOFTMAX_EXCEED_AXIS0(F16, I8, vxc_half8, vxc_short8, char4, vxc_char8,\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Half, VXC_HorzMax3_Half) +LOGSOFTMAX_EXCEED_AXIS0(F16, U8, vxc_half8, vxc_short8, uchar4, vxc_uchar8,\ + CONV_SAT_RTE, outputScale, output_offset_asymmetric, VXC_VertMax3_Half, VXC_HorzMax3_Half) +LOGSOFTMAX_EXCEED_AXIS0(I16, I16, vxc_short8, vxc_short8, short4, vxc_short8,\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer) +LOGSOFTMAX_EXCEED_AXIS0(I16, F16, vxc_short8, vxc_short8, half4, vxc_short8,\ +CONV, 1, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer) +LOGSOFTMAX_EXCEED_AXIS0(I8, I8, vxc_char16, vxc_char16, char4, vxc_char8,\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer) +LOGSOFTMAX_EXCEED_AXIS0(I8, F16, vxc_char16, vxc_char16, half4, vxc_short8,\ +CONV, 1, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer) +LOGSOFTMAX_EXCEED_AXIS0(U8, U8, vxc_uchar16, vxc_uchar16, uchar4, vxc_uchar8,\ +CONV_SAT_RTE, outputScale, output_offset_asymmetric, VXC_VertMax3_Integer, VXC_HorzMax3_Integer) +LOGSOFTMAX_EXCEED_AXIS0(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_short8,\ +CONV, 1, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer) + + +#define LOGSOFTMAX_PROCESS_EXCEED_AXIS0_TOF32_SAVE(read_fun) \ + for (coord.y = 0; coord.y < height; coord.y++) \ + { \ + for (coord.x = 0; coord.x < axisSize; ) \ + { \ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, img_val0, val0, 16); \ + VXC_DP4x4(prob, img_val0, val, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubData0to3_4x4); \ + prob = prob * betaValue - probSum_log.xxxx; \ + write_imagef(output, coord, prob); \ + coord.x += 4; \ + VXC_DP4x4(prob, img_val0, val, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubData4to7_4x4); \ + prob = prob * betaValue - probSum_log.xxxx; \ + write_imagef(output, coord, prob); \ + coord.x += 4; \ + } \ + } + +#define LOGSOFTMAX_EXCEED_AXIS0_TOF32(src_name, src_type, copy_type, vert_max_fun, horz_max_fun) \ +__kernel void log_softmax_exceed_axis0_##src_name##toF32 \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float input_Scale, \ + int axisVal \ + ) \ +{ \ + int4 coord = (int4)(16, 0, get_global_id(1), 0); \ + src_type img_val0, img_val1, img_val2, img_val3; \ + copy_type val0, val1, val2, val3; \ + src_type val; \ + LOGSOFTMAX_PROCESS_EXCEED_AXIS0(VXC_ReadImage2DArray, vert_max_fun, horz_max_fun) 
\ + LOGSOFTMAX_PROCESS_EXCEED_AXIS0_TOF32_SAVE(VXC_ReadImage2DArray) \ +} + +LOGSOFTMAX_EXCEED_AXIS0_TOF32(F16, vxc_half8, vxc_short8, VXC_VertMax3_Half, VXC_HorzMax3_Half) +LOGSOFTMAX_EXCEED_AXIS0_TOF32(I16, vxc_short8, vxc_short8, VXC_VertMax3_Integer, VXC_HorzMax3_Integer) +LOGSOFTMAX_EXCEED_AXIS0_TOF32(I8, vxc_char16, vxc_char16, VXC_VertMax3_Integer, VXC_HorzMax3_Integer) +LOGSOFTMAX_EXCEED_AXIS0_TOF32(U8, vxc_uchar16, vxc_uchar16, VXC_VertMax3_Integer, VXC_HorzMax3_Integer) + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/log_softmax_exceed_axis0_BF16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/log_softmax_exceed_axis0_BF16.vx new file mode 100644 index 0000000..45f904f --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/log_softmax_exceed_axis0_BF16.vx @@ -0,0 +1,187 @@ +#include "cl_viv_vx_ext.h" +_viv_uniform float rlogE; +_viv_uniform int axisSize; +_viv_uniform float betaValue; +_viv_uniform float scaleLogE; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; + +_viv_uniform int height; +_viv_uniform int inputWidth; +_viv_uniform int inputWidthRemain4; +_viv_uniform VXC_512Bits uniPackMaxData_2x8; +_viv_uniform VXC_512Bits uniExtractHalf4_4x4; + + +#define LOGSOFTMAX_PROCESS_EXCEED_AXIS0_BF16(read_fun) \ + vxc_half8 img_val0, img_val1, img_val2, img_val3; \ + vxc_short8 val0, val1, val2, val3; \ + vxc_half8 val; \ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(-16, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, val, val0, 16); \ + for (coord.y = 0; coord.y < height; coord.y++) \ + { \ + for (coord.x = 16; coord.x < (axisSize + 16);) \ + { \ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(-16, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, img_val0, val0, 16); \ + read_fun(val1, input, coord, VXC_5BITOFFSET_XY(-8, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, img_val1, val1, 16); \ + read_fun(val2, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, img_val2, val2, 16); \ + read_fun(val3, input, coord, VXC_5BITOFFSET_XY(8, 0), \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, img_val3, val3, 16); \ + coord.x += 32; \ + VXC_VertMax3_Half(val, img_val0, img_val1, val, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_VertMax3_Half(val, img_val2, img_val3, val, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ + } \ + VXC_HorzMax3_Half(val, val, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(val, val, val, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniPackMaxData_2x8); \ + VXC_HorzMax3_Half(val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + \ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \ + vxc_ushort8 bf_val_tmp; \ + vxc_float4 vecA; \ + _viv_asm(COPY, bf_val_tmp, val, 16); \ + VXC_DP2x8(bf_val_tmp, bf_val_tmp, zero,\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, vecA, bf_val_tmp, 16); \ + vxc_float4 prob; \ + float fProbSum = 0; \ + const float4 one4 = (float4)(1.0, 1.0, 1.0, 1.0); \ + float max_value = vecA.x * scaleLogE; \ + float max_value_orig = vecA.x; \ + for (coord.y = 0; coord.y < height; coord.y++) \ + { \ + for (coord.x = 0; coord.x < inputWidth; ) \ + { \ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(bf_val_tmp, val0, zero,\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, prob, bf_val_tmp, 16); \ + prob = prob * scaleLogE - max_value; \ + prob 
= exp2(prob); \ + fProbSum += dot(prob, one4); \ + coord.x += 4; \ + } \ + } \ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(bf_val_tmp, val0, zero,\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, prob, bf_val_tmp, 16); \ + prob = prob * scaleLogE - max_value; \ + if(inputWidthRemain4 == 1) \ + { \ + prob.x = exp2(prob.x); \ + prob.yzw = 0; \ + fProbSum += dot(prob, one4); \ + } \ + else if(inputWidthRemain4 == 2) \ + { \ + prob.x = exp2(prob.x); \ + prob.y = exp2(prob.y); \ + prob.zw = 0; \ + fProbSum += dot(prob, one4); \ + } \ + else if(inputWidthRemain4 == 3) \ + { \ + prob.x = exp2(prob.x); \ + prob.y = exp2(prob.y); \ + prob.z = exp2(prob.z); \ + prob.w = 0; \ + fProbSum += dot(prob, one4); \ + } \ + vxc_float4 probSum_log; \ + probSum_log.x = log2(fProbSum) * rlogE; + +#define LOGSOFTMAX_PROCESS_EXCEED_AXIS0_BF16TOBF16_SAVE(read_fun, write_fun) \ + for (coord.y = 0; coord.y < height; coord.y++) \ + { \ + for (coord.x = 0; coord.x < axisSize; ) \ + { \ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(bf_val_tmp, val0, zero,\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, prob, bf_val_tmp, 16); \ + prob = prob - max_value_orig; \ + prob = prob * betaValue - probSum_log.xxxx; \ + vxc_ushort8 tmp, dst; \ + _viv_asm(COPY, tmp, prob, 16); \ + dst.s0123 = tmp.s1357; \ + write_fun(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + coord.x += 4; \ + } \ + } + +#define LOGSOFTMAX_PROCESS_EXCEED_AXIS0_BF16TOF16_SAVE(read_fun, write_fun) \ + for (coord.y = 0; coord.y < height; coord.y++) \ + { \ + for (coord.x = 0; coord.x < axisSize; ) \ + { \ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(bf_val_tmp, val0, zero,\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, prob, bf_val_tmp, 16); \ + prob = prob - max_value_orig; \ + prob = prob * betaValue - probSum_log.xxxx; \ + half4 vec; \ + vxc_half4 tmp; \ + vxc_short4 dst; \ + _viv_asm(CONV, vec, prob); \ + VXC_DP4x4(tmp, vec, vec, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf4_4x4); \ + _viv_asm(COPY, dst, tmp, 8); \ + write_fun(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + coord.x += 4; \ + } \ + } + +#define LOGSOFTMAX_PROCESS_EXCEED_AXIS0_BF16TOF32_SAVE(read_fun) \ + for (coord.y = 0; coord.y < height; coord.y++) \ + { \ + for (coord.x = 0; coord.x < axisSize; ) \ + { \ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(bf_val_tmp, val0, zero,\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, prob, bf_val_tmp, 16); \ + prob = prob - max_value_orig; \ + prob = prob * betaValue - probSum_log.xxxx; \ + write_imagef(output, coord, prob); \ + coord.x += 4; \ + } \ + } + +__kernel void log_softmax_exceed_axis0_BF16toBF16( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float input_Scale, + int axisVal ) +{ + int4 coord = (int4)(16, 0, get_global_id(1), 0); + LOGSOFTMAX_PROCESS_EXCEED_AXIS0_BF16(VXC_ReadImage2DArray) + LOGSOFTMAX_PROCESS_EXCEED_AXIS0_BF16TOBF16_SAVE(VXC_ReadImage2DArray, VXC_WriteImage2DArray) +} +__kernel void log_softmax_exceed_axis0_BF16toF16( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float input_Scale, + int axisVal ) +{ + int4 coord = (int4)(16, 
0, get_global_id(1), 0); + LOGSOFTMAX_PROCESS_EXCEED_AXIS0_BF16(VXC_ReadImage2DArray) + LOGSOFTMAX_PROCESS_EXCEED_AXIS0_BF16TOF16_SAVE(VXC_ReadImage2DArray, VXC_WriteImage2DArray) +} +__kernel void log_softmax_exceed_axis0_BF16toF32( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float input_Scale, + int axisVal ) +{ + int4 coord = (int4)(16, 0, get_global_id(1), 0); + LOGSOFTMAX_PROCESS_EXCEED_AXIS0_BF16(VXC_ReadImage2DArray) + LOGSOFTMAX_PROCESS_EXCEED_AXIS0_BF16TOF32_SAVE(VXC_ReadImage2DArray) +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/log_softmax_exceed_axis1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/log_softmax_exceed_axis1.vx new file mode 100644 index 0000000..179735c --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/log_softmax_exceed_axis1.vx @@ -0,0 +1,172 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float rlogE; +_viv_uniform int depth; +_viv_uniform int axisSize; +_viv_uniform float betaValue; +_viv_uniform float scaleLogE; +_viv_uniform float outputScale; +_viv_uniform float output_offset_asymmetric; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform VXC_512Bits uniGetSubLoData_4x4; +_viv_uniform VXC_512Bits uniGetSubHiData_4x4; + +#define LOGSOFTMAX_PROCESS_EXCEED_AXIS1(read_fun, vert_max_fun) \ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, max, in0, 16); \ + for (coord.z = 0; coord.z < depth; coord.z ++) \ + { \ + for (coord.y = 0; coord.y < axisSize;) \ + { \ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, vec0, in0, 16); \ + vert_max_fun(max, max, max, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.y++; \ + } \ + } \ + coord.y = 0; \ + sum0 = 0; \ + sum1 = 0; \ + for (coord.z = 0; coord.z < depth; coord.z ++) \ + { \ + for (coord.y = 0; coord.y < axisSize;coord.y++) \ + { \ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, vec0, in0, 16); \ + VXC_DP4x4(data0, vec0, max, VXC_MODIFIER(0, 3, 0, \ + VXC_RM_TowardZero, 0), uniGetSubLoData_4x4); \ + data0 *= scaleLogE; \ + data0 = exp2(data0); \ + sum0 += data0; \ + VXC_DP4x4(data0, vec0, max, VXC_MODIFIER(0, 3, 0, \ + VXC_RM_TowardZero, 0), uniGetSubHiData_4x4); \ + data0 *= scaleLogE; \ + data0 = exp2(data0); \ + sum1 += data0; \ + } \ + } \ + sum0 = log2(sum0) * rlogE; \ + sum1 = log2(sum1) * rlogE; + +#define LOGSOFTMAX_PROCESS_EXCEED_AXIS1_SAVE(dst_type, save_type, conv_mode,\ + OUT_SCALE, OUT_OFFSET, read_fun, write_fun) \ + coord.y = 0; \ + dst_type dst0, dst1; \ + save_type vect; \ + for (coord.z = 0; coord.z < depth; coord.z ++) \ + { \ + for (coord.y = 0; coord.y < axisSize;) \ + { \ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, vec0, in0, 16); \ + VXC_DP4x4(data0, vec0, max, VXC_MODIFIER(0, 3, 0, \ + VXC_RM_TowardZero, 0), uniGetSubLoData_4x4); \ + data0 = data0 * betaValue - sum0; \ + data0 = data0 * OUT_SCALE + OUT_OFFSET; \ + _viv_asm(conv_mode, dst0, data0); \ + VXC_DP4x4(data0, vec0, max, VXC_MODIFIER(0, 3, 0, \ + VXC_RM_TowardZero, 0), uniGetSubHiData_4x4); \ + data0 = data0 * betaValue - sum1; \ + data0 = data0 * OUT_SCALE + OUT_OFFSET; \ + _viv_asm(conv_mode, dst1, data0); \ + VXC_DP2x8(vect, dst0, dst1, VXC_MODIFIER(0, 7, 0, \ + VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); \ + write_fun(output, coord, vect, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.y++; \ + } \ + } \ + +#define 
LOGSOFTMAX_EXCEED_AXIS1(src_name, dst_name, src_type, copy_type, dst_type,\ +save_type, conv_mode, OUT_SCALE, OUT_OFFSET, vert_max_fun) \ +__kernel void log_softmax_exceed_axis1_##src_name##to##dst_name \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float input_Scale, \ + int axisVal \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), 0, 0, 0); \ + src_type vec0, max; \ + copy_type in0; \ + vxc_float4 data0; \ + vxc_float4 sum0, sum1; \ + LOGSOFTMAX_PROCESS_EXCEED_AXIS1(VXC_ReadImage2DArray, vert_max_fun) \ + LOGSOFTMAX_PROCESS_EXCEED_AXIS1_SAVE(dst_type, save_type, conv_mode,\ + OUT_SCALE, OUT_OFFSET, VXC_ReadImage2DArray, VXC_WriteImage2DArray); \ +} + + + +LOGSOFTMAX_EXCEED_AXIS1(F16, F16, vxc_half8, vxc_short8, half4, vxc_short8,\ +CONV, 1, 0, VXC_VertMax3_Half) +LOGSOFTMAX_EXCEED_AXIS1(F16, I16, vxc_half8, vxc_short8, short4, vxc_short8,\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Half) +LOGSOFTMAX_EXCEED_AXIS1(F16, I8, vxc_half8, vxc_short8, char4, vxc_char8,\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Half) +LOGSOFTMAX_EXCEED_AXIS1(F16, U8, vxc_half8, vxc_short8, uchar4,\ +vxc_uchar8, CONV_SAT_RTE, outputScale, output_offset_asymmetric, VXC_VertMax3_Half) + +LOGSOFTMAX_EXCEED_AXIS1(I16, I16, vxc_short8, vxc_short8, short4, vxc_short8,\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Integer) +LOGSOFTMAX_EXCEED_AXIS1(I16, F16, vxc_short8, vxc_short8, half4, vxc_short8,\ +CONV, 1, 0, VXC_VertMax3_Integer) + +LOGSOFTMAX_EXCEED_AXIS1(I8, I8, vxc_char16, vxc_char16, char4, vxc_char8,\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Integer) +LOGSOFTMAX_EXCEED_AXIS1(I8, F16, vxc_char16, vxc_char16, half4, vxc_short8,\ +CONV, 1, 0, VXC_VertMax3_Integer) + +LOGSOFTMAX_EXCEED_AXIS1(U8, U8, vxc_uchar16, vxc_uchar16, uchar4,\ +vxc_uchar8, CONV_SAT_RTE, outputScale, output_offset_asymmetric, VXC_VertMax3_Integer) +LOGSOFTMAX_EXCEED_AXIS1(U8, F16, vxc_uchar16, vxc_uchar16, half4, \ +vxc_short8, CONV, 1, 0, VXC_VertMax3_Integer) + + + +#define LOGSOFTMAX_EXCEED_AXIS1_TOF32(src_name, src_type, copy_type, vert_max_fun) \ +__kernel void log_softmax_exceed_axis1_##src_name##toF32 \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + float input_Scale, \ + int axisVal \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), 0, 0, 0); \ + src_type vec0, max; \ + copy_type in0; \ + vxc_float4 data0; \ + vxc_float4 sum0, sum1; \ + LOGSOFTMAX_PROCESS_EXCEED_AXIS1(VXC_ReadImage2DArray, vert_max_fun) \ + coord.y = 0; \ + for (coord.z = 0; coord.z < depth; coord.z ++) \ + { \ + for (coord.y = 0; coord.y < axisSize;) \ + { \ + VXC_ReadImage2DArray(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, vec0, in0, 16); \ + VXC_DP4x4(data0, vec0, max, VXC_MODIFIER(0, 3, 0, \ + VXC_RM_TowardZero, 0), uniGetSubLoData_4x4); \ + data0 = data0 * betaValue - sum0; \ + write_imagef(output, coord, data0); \ + coord.x += 4; \ + VXC_DP4x4(data0, vec0, max, VXC_MODIFIER(0, 3, 0, \ + VXC_RM_TowardZero, 0), uniGetSubHiData_4x4); \ + data0 = data0 * betaValue - sum1; \ + write_imagef(output, coord, data0); \ + coord.x -= 4; \ + coord.y++; \ + } \ + } \ +} + +LOGSOFTMAX_EXCEED_AXIS1_TOF32(F16, vxc_half8, \ +vxc_short8, VXC_VertMax3_Half) +LOGSOFTMAX_EXCEED_AXIS1_TOF32(I16, vxc_short8, \ +vxc_short8, VXC_VertMax3_Integer) +LOGSOFTMAX_EXCEED_AXIS1_TOF32(I8, vxc_char16, \ +vxc_char16, VXC_VertMax3_Integer) +LOGSOFTMAX_EXCEED_AXIS1_TOF32(U8, vxc_uchar16, \ +vxc_uchar16, VXC_VertMax3_Integer) \ No newline at end of file diff --git 
a/src/tim/vx/internal/src/libnnext/ops/vx/log_softmax_exceed_axis1_BF16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/log_softmax_exceed_axis1_BF16.vx new file mode 100644 index 0000000..f592e31 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/log_softmax_exceed_axis1_BF16.vx @@ -0,0 +1,180 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float rlogE; +_viv_uniform int depth; +_viv_uniform int axisSize; +_viv_uniform float betaValue; +_viv_uniform float scaleLogE; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; + +_viv_uniform VXC_512Bits uniExtractHalf8_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; + +#define LOGSOFTMAX_PROCESS_EXCEED_AXIS1_BF16(read_fun) \ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, max, in0, 16); \ + for (coord.z = 0; coord.z < depth; coord.z ++) \ + { \ + for (coord.y = 0; coord.y < axisSize;) \ + { \ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, vec0, in0, 16); \ + VXC_VertMax3_Half(max, max, max, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.y++; \ + } \ + } \ + _viv_asm(COPY, tmp0, max, 16); \ + VXC_DP2x8(tmp1, tmp0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, max_lo, tmp1, 16); \ + VXC_DP2x8(tmp1, tmp0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \ + _viv_asm(COPY, max_hi, tmp1, 16); \ + coord.y = 0; \ + sum0 = 0; \ + sum1 = 0; \ + for (coord.z = 0; coord.z < depth; coord.z ++) \ + { \ + for (coord.y = 0; coord.y < axisSize;) \ + { \ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, data0, tmp1, 16); \ + data0 = data0 - max_lo; \ + data0 *= scaleLogE; \ + sum0 += exp2(data0); \ + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part1_2x8); \ + _viv_asm(COPY, data0, tmp1, 16); \ + data0 = data0 - max_hi; \ + data0 *= scaleLogE; \ + sum1 += exp2(data0); \ + coord.y++; \ + } \ + } \ + sum0 = log2(sum0) * rlogE; \ + sum1 = log2(sum1) * rlogE; + +__kernel void log_softmax_exceed_axis1_BF16toBF16( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float input_Scale, + int axisVal ) +{ + int4 coord = (int4)(get_global_id(0), 0, 0, 0); + vxc_short8 in0; + vxc_half8 vec0, max; + vxc_float4 data0; + vxc_float4 sum0, sum1; + vxc_float4 max_lo, max_hi; + vxc_ushort8 tmp0, tmp1; + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + + LOGSOFTMAX_PROCESS_EXCEED_AXIS1_BF16(VXC_ReadImage2DArray) + + coord.y = 0; + vxc_ushort8 dst0, dst1, dst; + for (coord.z = 0; coord.z < depth; coord.z ++) + { + for (coord.y = 0; coord.y < axisSize;) + { + VXC_ReadImage2DArray(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, data0, tmp1, 16); + data0 = data0 - max_lo; + data0 = data0 * betaValue - sum0; + _viv_asm(COPY, dst0, data0, 16); + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, data0, tmp1, 16); + data0 = data0 - max_hi; + data0 = data0 * betaValue - sum1; + _viv_asm(COPY, dst1, data0, 16); + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), 
uniExtractOddData_2x8); + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.y++; + } + } +} + +__kernel void log_softmax_exceed_axis1_BF16toF16( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float input_Scale, + int axisVal ) +{ + int4 coord = (int4)(get_global_id(0), 0, 0, 0); + vxc_short8 in0; + vxc_half8 vec0, max; + vxc_float4 data0; + vxc_float4 sum0, sum1; + vxc_float4 max_lo, max_hi; + vxc_ushort8 tmp0, tmp1; + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + + LOGSOFTMAX_PROCESS_EXCEED_AXIS1_BF16(VXC_ReadImage2DArray) + + coord.y = 0; + half4 dst0, dst1; + for (coord.z = 0; coord.z < depth; coord.z ++) + { + for (coord.y = 0; coord.y < axisSize;) + { + VXC_ReadImage2DArray(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, data0, tmp1, 16); + data0 = data0 - max_lo; + data0 = data0 * betaValue - sum0; + _viv_asm(CONV, dst0, data0); + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, data0, tmp1, 16); + data0 = data0 - max_hi; + data0 = data0 * betaValue - sum1; + _viv_asm(CONV, dst1, data0); + VXC_DP2x8(vec0, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8); + vxc_short8 vect; + _viv_asm(COPY, vect, vec0, 16); + VXC_WriteImage2DArray(output, coord, vect, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.y++; + } + } +} + +__kernel void log_softmax_exceed_axis1_BF16toF32( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + float input_Scale, + int axisVal ) +{ + int4 coord = (int4)(get_global_id(0), 0, 0, 0); + vxc_short8 in0; + vxc_half8 vec0, max; + vxc_float4 data0; + vxc_float4 sum0, sum1; + vxc_float4 max_lo, max_hi; + vxc_ushort8 tmp0, tmp1; + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + + LOGSOFTMAX_PROCESS_EXCEED_AXIS1_BF16(VXC_ReadImage2DArray) + + coord.y = 0; + for (coord.z = 0; coord.z < depth; coord.z ++) + { + for (coord.y = 0; coord.y < axisSize;) + { + VXC_ReadImage2DArray(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + _viv_asm(COPY, data0, tmp1, 16); + data0 = data0 - max_lo; + data0 = data0 * betaValue - sum0; + write_imagef(output, coord, data0); + coord.x += 4; + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, data0, tmp1, 16); + data0 = data0 - max_hi; + data0 = data0 * betaValue - sum1; + write_imagef(output, coord, data0); + coord.x -= 4; + coord.y++; + } + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_rggb_copy.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_rggb_copy.vx new file mode 100644 index 0000000..ae92f69 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_rggb_copy.vx @@ -0,0 +1,111 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int bOrder; +_viv_uniform int rOrder; + +_viv_uniform float outputScaleVar_b; +_viv_uniform float outputScaleVar_g; +_viv_uniform float outputScaleVar_r; + +_viv_uniform float bMeanScaleVarZp; +_viv_uniform float gMeanScaleVarZp; +_viv_uniform float rMeanScaleVarZp; + +_viv_uniform VXC_512Bits uniConvertNV12toB_4x4; +_viv_uniform VXC_512Bits uniConvertNV12toG_4x4; +_viv_uniform VXC_512Bits 
uniConvertNV12toR_4x4; + +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uniExtractUVtoCharSub128_2x8; +_viv_uniform VXC_512Bits uniExtractYtoShortSub16_2x8; +_viv_uniform VXC_512Bits uniConvertUchartoFp32_4x4; + +#define NV12_RGGB_COPY_SH_IMPL(name, dst_type, conv_type, save_type, copy_bytes) \ +__kernel void pre_process_nv12_rggb_copy_##name \ + ( \ + __read_only image2d_array_t y_img, \ + __read_only image2d_array_t uv_img, \ + __write_only image2d_array_t output, \ + global int* xRatio, \ + global int* yRatio, \ + global int* xOffset, \ + global int* yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float r_scale, \ + int reverse_channel, \ + int trans, \ + int nv_type, \ + float g_scale, \ + float b_scale \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + \ + int sy = gidy + (*yOffset); \ + int sx = gidx + (*xOffset); \ + int uvX = sx & 0xfffffffe; \ + int uvY = sy >> 1; \ + \ + vxc_uchar16 Y, UV; \ + \ + VXC_ReadImage(Y, y_img, (int2)(sx,sy), 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(UV, uv_img,(int2)(uvX,uvY), 0,VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + if (nv_type == 3) \ + { \ + UV.s0123 = UV.s1032; \ + } \ + \ + vxc_short8 tmpY; \ + vxc_char16 tmpUV; \ + short tmpVal = 16; \ + VXC_DP2x8(tmpY, Y, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractYtoShortSub16_2x8); \ + tmpVal = 128; \ + VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractUVtoCharSub128_2x8); \ + \ + float4 tmpDstB, tmpDstG, tmpDstR; \ + vxc_uchar4 DstB_uchar, DstG_uchar, DstR_uchar; \ + VXC_DP4x4(DstB_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toB_4x4); \ + VXC_DP4x4(DstG_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toG_4x4); \ + VXC_DP4x4(DstR_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toR_4x4); \ + VXC_DP4x4(tmpDstB, DstB_uchar, DstB_uchar, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \ + VXC_DP4x4(tmpDstG, DstG_uchar, DstG_uchar, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \ + VXC_DP4x4(tmpDstR, DstR_uchar, DstR_uchar, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \ + \ + conv_type result; \ + dst_type dst0; \ + save_type dst; \ + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \ + tmpDstB = tmpDstB * outputScaleVar_b + bMeanScaleVarZp; \ + _viv_asm(CONV_RTE, result, tmpDstB); \ + dstPos.z = bOrder; \ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst0, copy_bytes); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + tmpDstG = tmpDstG * outputScaleVar_g + gMeanScaleVarZp; \ + _viv_asm(CONV_RTE, result, tmpDstG); \ + dstPos.z = 1; \ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst0, copy_bytes); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + dstPos.z = 2; \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + tmpDstR = tmpDstR * outputScaleVar_r + rMeanScaleVarZp; \ + _viv_asm(CONV_RTE, result, tmpDstR); \ + dstPos.z = rOrder; \ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, 
dst, dst0, copy_bytes); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +NV12_RGGB_COPY_SH_IMPL(U8toU8, vxc_uchar8, int4, vxc_uchar8, 8) +NV12_RGGB_COPY_SH_IMPL(U8toI8, vxc_char8, int4, vxc_char8, 8) +NV12_RGGB_COPY_SH_IMPL(U8toI16, vxc_short8, int4, vxc_short8, 16) +NV12_RGGB_COPY_SH_IMPL(U8toF16, vxc_half8, half4, vxc_short8, 16) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_rggb_scale.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_rggb_scale.vx new file mode 100644 index 0000000..ade2a15 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_rggb_scale.vx @@ -0,0 +1,247 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int bOrder; +_viv_uniform int rOrder; + +_viv_uniform float outputScaleVar_b; +_viv_uniform float outputScaleVar_g; +_viv_uniform float outputScaleVar_r; + +_viv_uniform float bMeanScaleVarZp; +_viv_uniform float gMeanScaleVarZp; +_viv_uniform float rMeanScaleVarZp; + +_viv_uniform uint xrIntFloat_16; +_viv_uniform uint yrIntFloat_16; + +_viv_uniform VXC_512Bits uniConvertNV12toB_4x4; +_viv_uniform VXC_512Bits uniConvertNV12toG_4x4; +_viv_uniform VXC_512Bits uniConvertNV12toR_4x4; + +_viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8; + +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uniConvertUVtoCharSub128_2x8; +_viv_uniform VXC_512Bits uniConvertYtoShortSub16_2x8; + +_viv_uniform VXC_512Bits uniCalculateYShift_2x8; +_viv_uniform VXC_512Bits uniCalculateUVShift_2x8; +_viv_uniform VXC_512Bits uniConvertUchartoFp32_4x4; + +#define NV12_RGGB_OPT_SH_IMPL(name, dst_type, conv_type, save_type, copy_bytes) \ +__kernel void pre_process_nv12_rggb_scale_##name##_gq \ + ( \ + __read_only image2d_array_t y_img, \ + __read_only image2d_array_t uv_img, \ + __write_only image2d_array_t output, \ + global int* xRatio, \ + global int* yRatio, \ + global int* xOffset, \ + global int* yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float r_scale, \ + int reverse_channel, \ + int trans, \ + int nv_type, \ + float g_scale, \ + float b_scale \ + ) \ +{ \ + uint4 gidx = get_global_id(0); \ + uint gidy = get_global_id(1); \ + gidx += (uint4)(0, 1, 2, 3); \ + \ + uint dy = (gidy * yrIntFloat_16) >> 16; \ + uint4 dx = (gidx * xrIntFloat_16) >> 16; \ + int sy = convert_int(dy) + (*yOffset); \ + int4 sx = convert_int4(dx) + (*xOffset); \ + int4 uvX = sx & 0xfffffffe; \ + int uvY = sy >> 1; \ + \ + vxc_uchar16 Y, UV; \ + int2 coord = (int2)(sx.x, sy); \ + int2 coord_uv = (int2)(uvX.x, uvY); \ + VXC_ReadImage(Y, y_img, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + if (nv_type == 3) \ + { \ + UV.s0123456789abcdef = UV.s1032547698badcfe; \ + } \ + \ + vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; \ + vxc_uchar16 maskShiftUv = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; \ + int4 offsetUV = uvX - uvX.x; \ + \ + vxc_ushort8 diffY, diffUV; \ + _viv_asm(COPY, diffY, sx, 16); \ + _viv_asm(COPY, diffUV, offsetUV, 16); \ + \ + vxc_ushort8 constData = 8; \ + VXC_DP2x8(maskShift, diffY, constData, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniCalculateYShift_2x8); \ + VXC_DP2x8(maskShiftUv, diffUV, constData, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniCalculateUVShift_2x8); \ + VXC_BitExtract(Y, Y, Y, maskShift, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_BitExtract(UV, UV, UV, maskShiftUv, 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + vxc_short8 tmpY; \ + vxc_char16 tmpUV; \ + short tmpVal = 16; \ + VXC_DP2x8(tmpY, Y, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertYtoShortSub16_2x8); \ + tmpVal = 128; \ + VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); \ + \ + float4 tmpDstB, tmpDstG, tmpDstR; \ + vxc_uchar4 DstB_uchar, DstG_uchar, DstR_uchar; \ + VXC_DP4x4(DstB_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toB_4x4); \ + VXC_DP4x4(DstG_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toG_4x4); \ + VXC_DP4x4(DstR_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toR_4x4); \ + VXC_DP4x4(tmpDstB, DstB_uchar, DstB_uchar, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \ + VXC_DP4x4(tmpDstG, DstG_uchar, DstG_uchar, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \ + VXC_DP4x4(tmpDstR, DstR_uchar, DstR_uchar, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \ + \ + conv_type result; \ + dst_type dst0; \ + save_type dst; \ + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \ + tmpDstB = tmpDstB * outputScaleVar_b + bMeanScaleVarZp; \ + _viv_asm(CONV_RTE, result, tmpDstB); \ + dstPos.z = bOrder; \ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst0, copy_bytes); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + tmpDstG = tmpDstG * outputScaleVar_g + gMeanScaleVarZp; \ + _viv_asm(CONV_RTE, result, tmpDstG); \ + dstPos.z = 1; \ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst0, copy_bytes); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + dstPos.z = 2; \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + tmpDstR = tmpDstR * outputScaleVar_r + rMeanScaleVarZp; \ + _viv_asm(CONV_RTE, result, tmpDstR); \ + dstPos.z = rOrder; \ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst0, copy_bytes); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +NV12_RGGB_OPT_SH_IMPL(U8toU8, vxc_uchar8, int4, vxc_uchar8, 8) +NV12_RGGB_OPT_SH_IMPL(U8toI8, vxc_char8, int4, vxc_char8, 8) +NV12_RGGB_OPT_SH_IMPL(U8toI16, vxc_short8, int4, vxc_short8, 16) +NV12_RGGB_OPT_SH_IMPL(U8toF16, vxc_half8, half4, vxc_short8, 16) + +#define NV12_RGGB_SH_IMPL(name, dst_type, conv_type, save_type, copy_bytes) \ +__kernel void pre_process_nv12_rggb_scale_##name \ + ( \ + __read_only image2d_array_t y_img, \ + __read_only image2d_array_t uv_img, \ + __write_only image2d_array_t output, \ + global int* xRatio, \ + global int* yRatio, \ + global int* xOffset, \ + global int* yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float r_scale, \ + int reverse_channel, \ + int trans, \ + int nv_type, \ + float g_scale, \ + float b_scale \ + ) \ +{ \ + uint4 gidx = get_global_id(0); \ + uint gidy = get_global_id(1); \ + gidx += (uint4)(0, 1, 2, 3); \ + \ + uint dy = (gidy * yrIntFloat_16) >> 16; \ + uint4 dx = (gidx * xrIntFloat_16) >> 16; \ + int sy = convert_int(dy) + (*yOffset); \ + int4 sx = convert_int4(dx) + (*xOffset); \ + int4 
uvX = sx & 0xfffffffe; \ + int uvY = sy >> 1; \ + \ + vxc_uchar16 Y, UV; \ + int2 coord = (int2)(sx.x, sy); \ + int2 coord_uv = (int2)(uvX.x, uvY); \ + \ + VXC_ReadImage(Y, y_img, coord, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + coord.x = sx.y; \ + VXC_ReadImage(Y, y_img, coord, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ + coord.x = sx.z; \ + VXC_ReadImage(Y, y_img, coord, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \ + coord.x = sx.w; \ + VXC_ReadImage(Y, y_img, coord, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_uv.x = uvX.y; \ + VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_uv.x = uvX.z; \ + VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_uv.x = uvX.w; \ + VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + if (nv_type == 3) \ + { \ + UV.s01234567 = UV.s10325476; \ + } \ + \ + vxc_short8 tmpY; \ + vxc_char16 tmpUV; \ + short tmpVal = 16; \ + VXC_DP2x8(tmpY, Y, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertYtoShortSub16_2x8); \ + tmpVal = 128; \ + VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); \ + \ + float4 tmpDstB, tmpDstG, tmpDstR; \ + vxc_uchar4 DstB_uchar, DstG_uchar, DstR_uchar; \ + VXC_DP4x4(DstB_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toB_4x4); \ + VXC_DP4x4(DstG_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toG_4x4); \ + VXC_DP4x4(DstR_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toR_4x4); \ + VXC_DP4x4(tmpDstB, DstB_uchar, DstB_uchar, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \ + VXC_DP4x4(tmpDstG, DstG_uchar, DstG_uchar, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \ + VXC_DP4x4(tmpDstR, DstR_uchar, DstR_uchar, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \ + \ + conv_type result; \ + dst_type dst0; \ + save_type dst; \ + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \ + tmpDstB = tmpDstB * outputScaleVar_b + bMeanScaleVarZp; \ + _viv_asm(CONV_RTE, result, tmpDstB); \ + dstPos.z = bOrder; \ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst0, copy_bytes); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + tmpDstG = tmpDstG * outputScaleVar_g + gMeanScaleVarZp; \ + _viv_asm(CONV_RTE, result, tmpDstG); \ + dstPos.z = 1; \ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst0, copy_bytes); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + dstPos.z = 2; \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + tmpDstR = tmpDstR * outputScaleVar_r + rMeanScaleVarZp; \ + _viv_asm(CONV_RTE, result, tmpDstR); \ + dstPos.z = rOrder; \ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst0, copy_bytes); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +NV12_RGGB_SH_IMPL(U8toU8, vxc_uchar8, int4, vxc_uchar8, 8) 
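The scale variants above perform the same per-pixel NV12/NV21 to RGB conversion as the copy variants, only with source coordinates derived from the fixed-point ratios (xrIntFloat_16/yrIntFloat_16) plus xOffset/yOffset; the UV byte swap guarded by nv_type presumably handles VU-ordered (NV21) chroma. The colour math itself lives in the uniConvertNV12toB/G/R_4x4 dot-product tables programmed on the host side, so the constants in the scalar C sketch below are only the usual BT.601 video-range coefficients, assumed for illustration; the helper names are likewise hypothetical.

#include <math.h>

/* Scalar sketch of the per-pixel NV12 -> RGB step performed by the kernel:
 * subtract the video-range offsets (16 for Y, 128 for U/V, matching
 * uniConvertYtoShortSub16_2x8 / uniConvertUVtoCharSub128_2x8), apply
 * BT.601-style coefficients (assumed here; the real values sit in the
 * uniConvertNV12toB/G/R_4x4 tables), and clamp to 8 bits. */
static unsigned char clamp_u8(float v)
{
    return (unsigned char)fminf(fmaxf(v, 0.0f), 255.0f);
}

static void nv12_pixel_to_rgb(unsigned char y, unsigned char u, unsigned char v,
                              unsigned char *r, unsigned char *g, unsigned char *b)
{
    float yf = (float)y - 16.0f;
    float uf = (float)u - 128.0f;
    float vf = (float)v - 128.0f;
    *r = clamp_u8(1.164f * yf + 1.596f * vf);
    *g = clamp_u8(1.164f * yf - 0.392f * uf - 0.813f * vf);
    *b = clamp_u8(1.164f * yf + 2.017f * uf);
}

After this step the kernel folds quantization into the store: each channel is scaled by outputScaleVar_b/g/r and offset by the matching *MeanScaleVarZp term before being packed to the destination type, and the G value is written to both plane 1 and plane 2, which is what makes the output layout RGGB rather than plain RGB.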
+NV12_RGGB_SH_IMPL(U8toI8, vxc_char8, int4, vxc_char8, 8) +NV12_RGGB_SH_IMPL(U8toI16, vxc_short8, int4, vxc_short8, 16) +NV12_RGGB_SH_IMPL(U8toF16, vxc_half8, half4, vxc_short8, 16) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_F16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_F16.vx index 2546ca5..5c09554 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_F16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_F16.vx @@ -89,80 +89,79 @@ __kernel void resize_bilinear_F16toF16_DOWN VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); } -__kernel void resize_bilinear_F16toU8_DOWN - ( - __read_only image2d_array_t input, - __write_only image2d_array_t output, - int align_corners, - int half_pixel_centers - ) -{ - int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3); - float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx - half_pixel_value; - float4 left_x_f = floor(in_x); - float4 x_lerp = in_x - left_x_f; - int4 left_x_idx = convert_int4(left_x_f); - float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value; - float top_y_f = floor(in_y); - float y_lerp = in_y - top_y_f; - int top_y_idx = convert_int(top_y_f); - - vxc_short8 top_short, bottom_short; - vxc_half8 top, bottom; - int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0); - - int8 input_desc; - _viv_asm(COPY, input_desc, input, sizeof(input_desc)); - int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; - _viv_asm(MOV, coord_in.w, baseAddr); - VXC_OP4(img_load_3d, top_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, bottom_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), - VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - coord_in.x = left_x_idx.y; - VXC_OP4(img_load_3d, top_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, bottom_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), - VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); - coord_in.x = left_x_idx.z; - VXC_OP4(img_load_3d, top_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, bottom_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), - VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); - coord_in.x = left_x_idx.w; - VXC_OP4(img_load_3d, top_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), - VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); - VXC_OP4(img_load_3d, bottom_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), - VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, top, top_short, 16); - _viv_asm(COPY, bottom, bottom_short, 16); - - float4 left4; - float4 right4; - float4 top4; - float4 bottom4; - VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4); - VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); - top4 = right4 * x_lerp + left4; - VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4); - VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); - bottom4 = right4 * x_lerp + left4; - bottom4 -= top4; - float4 dst4 = bottom4 * y_lerp + top4; - dst4 = dst4 * uint8Scale + output_ZP; - int4 dst = convert_int4_rte(dst4); - vxc_uchar8 
dst_uchar; - VXC_DP2x8(dst_uchar, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); - - int8 output_desc; - _viv_asm(COPY, output_desc, output, sizeof(output_desc)); - baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; - _viv_asm(MOV, coord_out.w, baseAddr); - - VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst_uchar, - VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); +#define RESIZE_BILINEAR_F16TOQINT_DOWN(out_name, dst_type) \ +__kernel void resize_bilinear_F16to##out_name##_DOWN( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int align_corners, \ + int half_pixel_centers \ + ) \ +{ \ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3); \ + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx - half_pixel_value; \ + float4 left_x_f = floor(in_x); \ + float4 x_lerp = in_x - left_x_f; \ + int4 left_x_idx = convert_int4(left_x_f); \ + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value; \ + float top_y_f = floor(in_y); \ + float y_lerp = in_y - top_y_f; \ + int top_y_idx = convert_int(top_y_f); \ + \ + vxc_short8 top_short, bottom_short; \ + vxc_half8 top, bottom; \ + int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0); \ + \ + int8 input_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord_in.w, baseAddr); \ + VXC_OP4(img_load_3d, top_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = left_x_idx.y; \ + VXC_OP4(img_load_3d, top_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = left_x_idx.z; \ + VXC_OP4(img_load_3d, top_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_in.x = left_x_idx.w; \ + VXC_OP4(img_load_3d, top_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, top, top_short, 16); \ + _viv_asm(COPY, bottom, bottom_short, 16); \ + \ + float4 left4, right4, top4, bottom4; \ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4); \ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \ + top4 = right4 * x_lerp + left4; \ + VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4); \ + VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \ + bottom4 = right4 * x_lerp + left4; \ + bottom4 -= top4; \ + float4 dst4 = bottom4 * y_lerp + top4; \ + dst4 = dst4 * uint8Scale + output_ZP; \ + int4 dst = convert_int4_rte(dst4); \ + dst_type dst_uchar; \ + VXC_DP2x8(dst_uchar, 
dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \ + \ + int8 output_desc; \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); \ + \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst_uchar, \ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ } +RESIZE_BILINEAR_F16TOQINT_DOWN(U8, vxc_uchar8) +RESIZE_BILINEAR_F16TOQINT_DOWN(U16, vxc_ushort8) __kernel void resize_bilinear_F16toF16_UP ( diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_I16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_I16.vx index 8f4735b..b195ee7 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_I16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_I16.vx @@ -1,13 +1,15 @@ #include "cl_viv_vx_ext.h" +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_left_4x4; +_viv_uniform VXC_512Bits uniU8RightSubLeft_4x4; _viv_uniform VXC_512Bits uniExtact8Bit_2x8; _viv_uniform float2 scale_xy; _viv_uniform int depth; +_viv_uniform int input_ZP; +_viv_uniform float uint8Scale; +_viv_uniform float output_ZP; _viv_uniform VXC_512Bits uniConvertI32toI16_2x8; _viv_uniform VXC_512Bits uniGetMaskShift_2x8; -_viv_uniform VXC_512Bits uniDFPtoFp32_left_4x4; -_viv_uniform VXC_512Bits uniRightSubLeft_4x4; -_viv_uniform float dfpScale; _viv_uniform float half_pixel_value; __kernel void resize_bilinear_I16toI16_UP @@ -56,23 +58,23 @@ __kernel void resize_bilinear_I16toI16_UP VXC_OP4(img_load_3d, src3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 1), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + short inputZP; + _viv_asm(COPY, inputZP, input_ZP, 4); + vxc_ushort8 bitextract_p0; vxc_uchar16 maskShift = {16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16}; VXC_DP2x8(bitextract_p0, left_x_idx, right_x_idx, \ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertI32toI16_2x8); + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertI32toI16_2x8); vxc_ushort8 constData = 16; VXC_DP2x8(maskShift, bitextract_p0, constData, \ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8); + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8); int8 output_desc; _viv_asm(COPY, output_desc, output, sizeof(output_desc)); baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; _viv_asm(MOV, coord_out.w, baseAddr); - float4 left4; - float4 right4; - float4 top4; - float4 bottom4; + float4 left4, right4, top4, bottom4; int loop = depth - 1; while (coord_in.z < loop) @@ -91,18 +93,18 @@ __kernel void resize_bilinear_I16toI16_UP VXC_OP4(img_load_3d, src3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 1), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); - VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); top4 = right4 * x_lerp + left4; - VXC_DP4x4(left4, bottom, bottom, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); + VXC_DP4x4(left4, bottom, inputZP, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); VXC_DP4x4(right4, bottom, bottom, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); + VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); bottom4 = right4 * x_lerp + left4; bottom4 -= top4; float4 dst4 = bottom4 * y_lerp + top4; - dst4 = dst4 * dfpScale; + dst4 = dst4 * uint8Scale + output_ZP; int4 dst = convert_int4_rte(dst4); VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); @@ -115,17 +117,17 @@ __kernel void resize_bilinear_I16toI16_UP VXC_BitExtract(dst1, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); _viv_asm(COPY, top, dst0, 16); _viv_asm(COPY, bottom, dst1, 16); - VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); - VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); top4 = right4 * x_lerp + left4; - VXC_DP4x4(left4, bottom, bottom, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); + VXC_DP4x4(left4, bottom, inputZP, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); VXC_DP4x4(right4, bottom, bottom, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); bottom4 = right4 * x_lerp + left4; bottom4 -= top4; float4 dst4 = bottom4 * y_lerp + top4; - dst4 = dst4 * dfpScale; + dst4 = dst4 * uint8Scale + output_ZP; int4 dst = convert_int4_rte(dst4); VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); @@ -180,25 +182,25 @@ __kernel void resize_bilinear_I16toI16_DOWN VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); - float4 left4; - float4 right4; - float4 top4; - float4 bottom4; + short inputZP; + _viv_asm(COPY, inputZP, input_ZP, 4); - VXC_DP4x4(left4, top, top, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); - VXC_DP4x4(right4, top, top, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); + float4 left4, right4, top4, bottom4; + + VXC_DP4x4(left4, top, inputZP, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); + VXC_DP4x4(right4, top, inputZP, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); top4 = right4 * x_lerp + left4; - VXC_DP4x4(left4, bottom, bottom, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); - VXC_DP4x4(right4, bottom, bottom, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); + VXC_DP4x4(left4, bottom, inputZP, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); + VXC_DP4x4(right4, bottom, inputZP, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); bottom4 = right4 * x_lerp + left4; bottom4 -= top4; float4 dst4 = bottom4 * y_lerp + top4; - dst4 = dst4 * dfpScale; + dst4 = dst4 * uint8Scale + output_ZP; int4 dst = convert_int4_rte(dst4); VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); @@ -209,6 +211,6 @@ __kernel void resize_bilinear_I16toI16_DOWN _viv_asm(MOV, coord_out.w, baseAddr); VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result, - VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); 
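This patch reworks the I16 (and, below, I8) bilinear kernels from the symmetric dfpScale path to asymmetric quantization: samples are dequantized by subtracting input_ZP (uniU8SubZPtoFp32_left_4x4), interpolated in float, then requantized with uint8Scale plus output_ZP. A scalar C sketch of one output sample, assuming uint8Scale is the input/output scale ratio the host packs into that uniform (function and parameter names are illustrative):

#include <math.h>

/* One output pixel of the asymmetric-quantized bilinear path:
 * dequantize with the input zero point, lerp horizontally then
 * vertically in float, requantize with the combined scale and the
 * output zero point. */
static int bilinear_quant_sample(int tl, int tr, int bl, int br,
                                 float x_lerp, float y_lerp,
                                 int input_zp, float scale_io, float output_zp)
{
    float top    = (float)(tl - input_zp) + (float)(tr - tl) * x_lerp;  /* left4 + right4 * x_lerp */
    float bottom = (float)(bl - input_zp) + (float)(br - bl) * x_lerp;
    float value  = top + (bottom - top) * y_lerp;                       /* bottom4 * y_lerp + top4 */
    return (int)lroundf(value * scale_io + output_zp);                  /* dst4 * uint8Scale + output_ZP */
}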
} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_I8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_I8.vx index bcb465e..1364370 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_I8.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_I8.vx @@ -1,13 +1,15 @@ #include "cl_viv_vx_ext.h" +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_left_4x4; +_viv_uniform VXC_512Bits uniU8RightSubLeft_4x4; _viv_uniform VXC_512Bits uniExtact8Bit_2x8; _viv_uniform float2 scale_xy; _viv_uniform int depth; +_viv_uniform int input_ZP; +_viv_uniform float uint8Scale; +_viv_uniform float output_ZP; _viv_uniform VXC_512Bits uniConvertI32toI16_2x8; _viv_uniform VXC_512Bits uniGetMaskShift_2x8; -_viv_uniform VXC_512Bits uniDFPtoFp32_left_4x4; -_viv_uniform VXC_512Bits uniRightSubLeft_4x4; -_viv_uniform float dfpScale; _viv_uniform float half_pixel_value; __kernel void resize_bilinear_I8toI8_UP @@ -52,13 +54,16 @@ __kernel void resize_bilinear_I8toI8_UP VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + short inputZP; + _viv_asm(COPY, inputZP, input_ZP, 4); + vxc_ushort8 bitextract_p0; vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; VXC_DP2x8(bitextract_p0, left_x_idx, right_x_idx, \ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertI32toI16_2x8); + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertI32toI16_2x8); vxc_ushort8 constData = 8; VXC_DP2x8(maskShift, bitextract_p0, constData, \ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8); + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8); int8 output_desc; _viv_asm(COPY, output_desc, output, sizeof(output_desc)); @@ -84,22 +89,22 @@ __kernel void resize_bilinear_I8toI8_UP VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_DP4x4(left4, top, top, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); + VXC_DP4x4(left4, top, inputZP, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); VXC_DP4x4(right4, top, top, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); top4 = right4 * x_lerp + left4; - VXC_DP4x4(left4, bottom, bottom, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); + VXC_DP4x4(left4, bottom, inputZP, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); VXC_DP4x4(right4, bottom, bottom, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); bottom4 = right4 * x_lerp + left4; bottom4 -= top4; float4 dst4 = bottom4 * y_lerp + top4; - dst4 = dst4 * dfpScale; + dst4 = dst4 * uint8Scale + output_ZP; int4 dst = convert_int4_rte(dst4); VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); @@ -111,19 +116,19 @@ __kernel void resize_bilinear_I8toI8_UP VXC_BitExtract(dst1, src1, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); _viv_asm(COPY, top, dst0, 16); _viv_asm(COPY, bottom, dst1, 16); - VXC_DP4x4(left4, top, top, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); + VXC_DP4x4(left4, top, inputZP, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); VXC_DP4x4(right4, top, top, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniRightSubLeft_4x4); + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); top4 = right4 * x_lerp + left4; - VXC_DP4x4(left4, bottom, bottom, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); + VXC_DP4x4(left4, bottom, inputZP, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); VXC_DP4x4(right4, bottom, bottom, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); bottom4 = right4 * x_lerp + left4; bottom4 -= top4; float4 dst4 = bottom4 * y_lerp + top4; - dst4 = dst4 * dfpScale; + dst4 = dst4 * uint8Scale + output_ZP; int4 dst = convert_int4_rte(dst4); VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); @@ -174,25 +179,28 @@ __kernel void resize_bilinear_I8toI8_DOWN VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + short inputZP; + _viv_asm(COPY, inputZP, input_ZP, 4); + float4 left4; float4 right4; float4 top4; float4 bottom4; - VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); - VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); + VXC_DP4x4(right4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); top4 = right4 * x_lerp + left4; - VXC_DP4x4(left4, bottom, bottom, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); - VXC_DP4x4(right4, bottom, bottom, \ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); + VXC_DP4x4(left4, bottom, inputZP, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); + VXC_DP4x4(right4, bottom, inputZP, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); bottom4 = right4 * x_lerp + left4; bottom4 -= top4; float4 dst4 = bottom4 * y_lerp + top4; - dst4 = dst4 * dfpScale; + dst4 = dst4 * uint8Scale + output_ZP; int4 dst = convert_int4_rte(dst4); diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U16.vx new file mode 100644 index 0000000..46cdb40 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U16.vx @@ -0,0 +1,278 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_left_4x4; +_viv_uniform VXC_512Bits uniU8RightSubLeft_4x4; +_viv_uniform VXC_512Bits uniExtact8Bit_2x8; +_viv_uniform float2 scale_xy; +_viv_uniform int depth; +_viv_uniform int input_ZP; +_viv_uniform float uint8Scale; +_viv_uniform float output_ZP; +_viv_uniform VXC_512Bits uniConvertI32toI16_2x8; +_viv_uniform VXC_512Bits uniGetMaskShift_2x8; +_viv_uniform float half_pixel_value; + +__kernel void resize_bilinear_U16toF16_DOWN + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3); + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx - half_pixel_value; + float4 left_x_f = floor(in_x); + float4 x_lerp = in_x - left_x_f; + int4 left_x_idx = convert_int4(left_x_f); + 
float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value; + float top_y_f = floor(in_y); + float y_lerp = in_y - top_y_f; + int top_y_idx = convert_int(top_y_f); + vxc_ushort8 top, bottom; + int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0); + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.y; + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.z; + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.w; + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + + float4 left4, right4, top4, bottom4; + + short inputZP; + _viv_asm(COPY, inputZP, input_ZP, 4); + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); + VXC_DP4x4(right4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); + top4 = right4 * x_lerp + left4; + + VXC_DP4x4(left4, bottom, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); + VXC_DP4x4(right4, bottom, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); + bottom4 = right4 * x_lerp + left4; + + bottom4 -= top4; + float4 dst4 = bottom4 * y_lerp + top4; + + dst4 *= uint8Scale; + + half4 dst; + _viv_asm(CONV, dst, dst4); + + vxc_short8 dst_short; + _viv_asm(COPY, dst_short, dst, 16); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst_short.s0246, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void resize_bilinear_U16toU16_UP + ( + image2d_array_t input, + image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3); + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx - half_pixel_value; + + float4 left_x_f = floor(in_x); + float4 x_lerp = in_x - left_x_f; + int4 left_x_idx = convert_int4(left_x_f); + float4 right_x_f = ceil(in_x); + int4 right_x_idx = convert_int4(right_x_f); + + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value; + + float top_y_f = floor(in_y); + float y_lerp = in_y - top_y_f; + int top_y_idx = convert_int(top_y_f); + + vxc_ushort8 src0, src1, src2, src3; + + vxc_ushort8 top; + vxc_ushort8 bottom; + + int4 coord_in = 
(int4)(left_x_idx.x, top_y_idx, coord_out.z, 0); + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src2, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 1), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + short inputZP; + _viv_asm(COPY, inputZP, input_ZP, 4); + + vxc_ushort8 bitextract_p0; + vxc_uchar16 maskShift = {16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16}; + VXC_DP2x8(bitextract_p0, left_x_idx, right_x_idx, \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertI32toI16_2x8); + vxc_ushort8 constData = 16; + VXC_DP2x8(maskShift, bitextract_p0, constData, \ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + + float4 left4, right4, top4, bottom4; + + int loop = depth - 1; + while (coord_in.z < loop) + { + VXC_BitExtract(top, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_BitExtract(bottom, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_in.zw += (int2)(1, input_desc.s4); + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src2, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, src3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 1), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); + top4 = right4 * x_lerp + left4; + + VXC_DP4x4(left4, bottom, inputZP, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); + VXC_DP4x4(right4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); + bottom4 = right4 * x_lerp + left4; + bottom4 -= top4; + float4 dst4 = bottom4 * y_lerp + top4; + dst4 = dst4 * uint8Scale + output_ZP; + int4 dst = convert_int4_rte(dst4); + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); + coord_out.zw += (int2)(1, output_desc.s4); + } + + VXC_BitExtract(top, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_BitExtract(bottom, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); + top4 = right4 * x_lerp + left4; + + VXC_DP4x4(left4, bottom, inputZP, \ + 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); + VXC_DP4x4(right4, bottom, bottom, \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); + bottom4 = right4 * x_lerp + left4; + bottom4 -= top4; + float4 dst4 = bottom4 * y_lerp + top4; + dst4 = dst4 * uint8Scale + output_ZP; + int4 dst = convert_int4_rte(dst4); + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); +} + +__kernel void resize_bilinear_U16toU16_DOWN + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3); + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx - half_pixel_value; + float4 left_x_f = floor(in_x); + float4 x_lerp = in_x - left_x_f; + int4 left_x_idx = convert_int4(left_x_f); + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value; + float top_y_f = floor(in_y); + float y_lerp = in_y - top_y_f; + int top_y_idx = convert_int(top_y_f); + vxc_ushort8 top, bottom, result; + int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0); + + int8 input_desc; + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; + _viv_asm(MOV, coord_in.w, baseAddr); + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.y; + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.z; + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); + coord_in.x = left_x_idx.w; + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); + + float4 left4, right4, top4, bottom4; + + short inputZP; + _viv_asm(COPY, inputZP, input_ZP, 4); + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); + VXC_DP4x4(right4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); + top4 = right4 * x_lerp + left4; + + VXC_DP4x4(left4, bottom, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); + VXC_DP4x4(right4, bottom, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); + bottom4 = right4 * x_lerp + left4; + + bottom4 -= top4; + float4 dst4 = bottom4 * y_lerp + top4; + + dst4 = dst4 * uint8Scale + output_ZP; + int4 dst = convert_int4_rte(dst4); + + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); + + int8 output_desc; + _viv_asm(COPY, output_desc, output, 
sizeof(output_desc)); + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; + _viv_asm(MOV, coord_out.w, baseAddr); + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result, + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8.vx index 88f0cd5..12ae503 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_U8.vx @@ -133,6 +133,9 @@ __kernel void resize_bilinear_U8toU8_UP VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + short inputZP; + _viv_asm(COPY, inputZP, input_ZP, 4); + vxc_ushort8 bitextract_p0; vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; VXC_DP2x8(bitextract_p0, left_x_idx, right_x_idx, \ @@ -163,8 +166,6 @@ __kernel void resize_bilinear_U8toU8_UP VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - unsigned char inputZP; - _viv_asm(COPY, inputZP, input_ZP, 4); VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); top4 = right4 * x_lerp + left4; @@ -185,8 +186,7 @@ __kernel void resize_bilinear_U8toU8_UP VXC_BitExtract(top, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); VXC_BitExtract(bottom, src1, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - unsigned char inputZP; - _viv_asm(COPY, inputZP, input_ZP, 4); + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); top4 = right4 * x_lerp + left4; diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_cubic.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_cubic.vx new file mode 100644 index 0000000..4424c94 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_cubic.vx @@ -0,0 +1,270 @@ +#pragma OPENCL EXTENSION cl_viv_vx_extension : enable + +#include "cl_viv_vx_ext.h" + +_viv_uniform float input_scale; +_viv_uniform float input_tail; +_viv_uniform float output_scale; +_viv_uniform float output_tail; +_viv_uniform VXC_512Bits uniFp16ToFp32_4x4; +_viv_uniform VXC_512Bits uniExtractHalf8_2x8; +_viv_uniform VXC_512Bits uniExtract8Bit_2x8; + +#define RESIZE_CUBIC_PART0 \ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + int4 coord_index = coord_out; \ + int2 coord_scalew = (int2)(4 * get_global_id(0), 0); \ + int2 coord_scaleh = (int2)(4 * get_global_id(1), 0); \ + float4 cubic_coeffs_y; \ + float4 cubic_coeffs_x; \ + int4 coord_in = (int4)(0, 0, coord_out.z, 0); \ + float4 src0_f,src1_f,src2_f,src3_f; \ + float4 dst = (float4)(0,0,0,0); \ + float sum[4]; \ + int i = 0; \ + \ + Image scalew = create_image_from_image2d(scale_w, 4); \ + Image scaleh = create_image_from_image2d(scale_h, 4); \ + \ + uchar* scale_w_ptr = get_image_ptr_from_coord(scalew, coord_scalew); \ + __global float* scale_x = (__global float*)scale_w_ptr; \ + \ + uchar* scale_h_ptr = get_image_ptr_from_coord(scaleh, coord_scaleh); \ + __global float* scale_y = (__global float*)scale_h_ptr; \ + cubic_coeffs_y = vload4(0, scale_y); \ + \ + int index_y = read_imagei(index_h, coord_index.yw).x; 
\ + coord_in.y = index_y; \ + int8 input_desc, output_desc; \ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \ + _viv_asm(MOV, coord_in.w, baseAddr); \ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \ + _viv_asm(MOV, coord_out.w, baseAddr); + +#define RESIZE_CUBIC_16Bitsto16Bits(name,src_type,dst_type,temp_type) \ +__kernel void resize_cubic_##name( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + __read_only image2d_t scale_w, \ + __read_only image2d_t scale_h, \ + __read_only image2d_t index_w, \ + __read_only image2d_t index_h \ + ) \ +{ \ + RESIZE_CUBIC_PART0; \ + src_type src0_h,src1_h,src2_h,src3_h; \ + vxc_short4 src0,src1,src2,src3; \ + for (i = 0; i < 4; i++) \ + { \ + coord_in.x = read_imagei(index_w, coord_index.xw).x; \ + cubic_coeffs_x = vload4(i, scale_x); \ + coord_index.x = coord_index.x + 1; \ + \ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, src2, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, src3, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + _viv_asm(COPY, src0_h, src0, 8); \ + _viv_asm(COPY, src1_h, src1, 8); \ + _viv_asm(COPY, src2_h, src2, 8); \ + _viv_asm(COPY, src3_h, src3, 8); \ + \ + VXC_DP4x4(src0_f, src0_h, src0_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \ + VXC_DP4x4(src1_f, src1_h, src1_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \ + VXC_DP4x4(src2_f, src2_h, src2_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \ + VXC_DP4x4(src3_f, src3_h, src3_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \ + \ + dst = src0_f * cubic_coeffs_y.x \ + + src1_f * cubic_coeffs_y.y \ + + src2_f * cubic_coeffs_y.z \ + + src3_f * cubic_coeffs_y.w; \ + sum[i] = dot(dst, cubic_coeffs_x); \ + } \ + float4 sum_f = (float4)(sum[0],sum[1],sum[2],sum[3]); \ + temp_type tmpout; \ + _viv_asm(CONV,tmpout,sum_f); \ + dst_type out_h; \ + vxc_short4 out; \ + VXC_DP2x8(out_h, tmpout, tmpout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8); \ + _viv_asm(COPY, out, out_h, 8); \ + \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, out, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +RESIZE_CUBIC_16Bitsto16Bits(F16toF16,vxc_half4, vxc_half4, half4) +RESIZE_CUBIC_16Bitsto16Bits(I16toI16,vxc_short4,vxc_short4,short4) +RESIZE_CUBIC_16Bitsto16Bits(F16toI16,vxc_half4, vxc_short4,short4) +RESIZE_CUBIC_16Bitsto16Bits(I16toF16,vxc_short4,vxc_half4, half4) + + +#define RESIZE_CUBIC_Quant8toQuant8(name,data_type) \ +__kernel void resize_cubic_##name( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + __read_only image2d_t scale_w, \ + __read_only image2d_t scale_h, \ + __read_only image2d_t index_w, \ + __read_only image2d_t index_h \ + ) \ +{ \ + RESIZE_CUBIC_PART0; \ + data_type src0,src1,src2,src3; \ + for (i = 0; i < 4; i++) \ + { \ + coord_in.x = read_imagei(index_w, coord_index.xw).x; \ + cubic_coeffs_x = vload4(i, scale_x); \ + coord_index.x = coord_index.x + 1; \ + \ + VXC_OP4(img_load_3d, src0, input, 
coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, src2, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, src3, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_DP4x4(src0_f, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \ + VXC_DP4x4(src1_f, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \ + VXC_DP4x4(src2_f, src2, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \ + VXC_DP4x4(src3_f, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \ + src0_f = src0_f * input_scale + input_tail; \ + src1_f = src1_f * input_scale + input_tail; \ + src2_f = src2_f * input_scale + input_tail; \ + src3_f = src3_f * input_scale + input_tail; \ + \ + dst = src0_f * cubic_coeffs_y.x \ + + src1_f * cubic_coeffs_y.y \ + + src2_f * cubic_coeffs_y.z \ + + src3_f * cubic_coeffs_y.w; \ + sum[i] = dot(dst, cubic_coeffs_x); \ + sum[i] = sum[i] * output_scale + output_tail; \ + } \ + float4 sum_f = (float4)(sum[0],sum[1],sum[2],sum[3]); \ + int4 tmpout; \ + _viv_asm(CONV,tmpout,sum_f); \ + data_type out; \ + VXC_DP2x8(out, tmpout, tmpout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtract8Bit_2x8); \ + \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, out, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +RESIZE_CUBIC_Quant8toQuant8(U8toU8,vxc_uchar4) +RESIZE_CUBIC_Quant8toQuant8(I8toI8,vxc_char4 ) + +#define RESIZE_CUBIC_F16toQuant8(name,dst_type) \ +__kernel void resize_cubic_##name( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + __read_only image2d_t scale_w, \ + __read_only image2d_t scale_h, \ + __read_only image2d_t index_w, \ + __read_only image2d_t index_h \ + ) \ +{ \ + RESIZE_CUBIC_PART0; \ + vxc_half4 src0_h,src1_h,src2_h,src3_h; \ + vxc_short4 src0,src1,src2,src3; \ + for (i = 0; i < 4; i++) \ + { \ + coord_in.x = read_imagei(index_w, coord_index.xw).x; \ + cubic_coeffs_x = vload4(i, scale_x); \ + coord_index.x = coord_index.x + 1; \ + \ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, src2, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, src3, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + _viv_asm(COPY, src0_h, src0, 8); \ + _viv_asm(COPY, src1_h, src1, 8); \ + _viv_asm(COPY, src2_h, src2, 8); \ + _viv_asm(COPY, src3_h, src3, 8); \ + \ + VXC_DP4x4(src0_f, src0_h, src0_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \ + VXC_DP4x4(src1_f, src1_h, src1_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \ + VXC_DP4x4(src2_f, src2_h, src2_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \ + VXC_DP4x4(src3_f, src3_h, src3_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \ + \ + dst = src0_f * cubic_coeffs_y.x \ + + src1_f * cubic_coeffs_y.y \ + + src2_f * cubic_coeffs_y.z \ + + src3_f * cubic_coeffs_y.w; \ 
+ sum[i] = dot(dst, cubic_coeffs_x); \ + sum[i] = sum[i] * output_scale + output_tail; \ + } \ + float4 sum_f = (float4)(sum[0],sum[1],sum[2],sum[3]); \ + int4 tmpout; \ + _viv_asm(CONV,tmpout,sum_f); \ + dst_type out; \ + VXC_DP2x8(out, tmpout, tmpout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtract8Bit_2x8); \ + \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, out, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +RESIZE_CUBIC_F16toQuant8(F16toU8,vxc_uchar4) +RESIZE_CUBIC_F16toQuant8(F16toI8,vxc_char4) + +#define RESIZE_CUBIC_Quant8toF16(name,src_type) \ +__kernel void resize_cubic_##name( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + __read_only image2d_t scale_w, \ + __read_only image2d_t scale_h, \ + __read_only image2d_t index_w, \ + __read_only image2d_t index_h \ + ) \ +{ \ + RESIZE_CUBIC_PART0; \ + src_type src0,src1,src2,src3; \ + for (i = 0; i < 4; i++) \ + { \ + coord_in.x = read_imagei(index_w, coord_index.xw).x; \ + cubic_coeffs_x = vload4(i, scale_x); \ + coord_index.x = coord_index.x + 1; \ + \ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, src2, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_OP4(img_load_3d, src3, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3), \ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_DP4x4(src0_f, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \ + VXC_DP4x4(src1_f, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \ + VXC_DP4x4(src2_f, src2, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \ + VXC_DP4x4(src3_f, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \ + \ + src0_f = src0_f * input_scale + input_tail; \ + src1_f = src1_f * input_scale + input_tail; \ + src2_f = src2_f * input_scale + input_tail; \ + src3_f = src3_f * input_scale + input_tail; \ + \ + dst = src0_f * cubic_coeffs_y.x \ + + src1_f * cubic_coeffs_y.y \ + + src2_f * cubic_coeffs_y.z \ + + src3_f * cubic_coeffs_y.w; \ + sum[i] = dot(dst, cubic_coeffs_x); \ + } \ + float4 sum_f = (float4)(sum[0],sum[1],sum[2],sum[3]); \ + half4 tmpout; \ + _viv_asm(CONV,tmpout,sum_f); \ + vxc_half4 out_h; \ + vxc_short4 out; \ + VXC_DP2x8(out_h, tmpout, tmpout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8); \ + _viv_asm(COPY, out, out_h, 8); \ + \ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, out, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +RESIZE_CUBIC_Quant8toF16(U8toF16,vxc_uchar4) +RESIZE_CUBIC_Quant8toF16(I8toF16,vxc_char4) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_reduction.vx b/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_reduction.vx new file mode 100644 index 0000000..a10393f --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_reduction.vx @@ -0,0 +1,259 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int update_width; +_viv_uniform int output_width; + +_viv_uniform int4 coord_stride; +_viv_uniform int4 coord_stride1; + +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; + +_viv_uniform int input_zp; +_viv_uniform float input_scale; +_viv_uniform int update_zp; +_viv_uniform float update_scale; 
+_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; +_viv_uniform VXC_512Bits uniConvert2ndU8SubZpToFp32_4x4; + +inline void AtomicAdd_float(volatile __global float *source, const float operand) +{ + union + { + unsigned int intVal; + float floatVal; + } newVal; + union + { + unsigned int intVal; + float floatVal; + } prevVal; + do + { + prevVal.floatVal = *source; + newVal.floatVal = prevVal.floatVal + operand; + } while(atomic_cmpxchg((volatile __global unsigned int *)source, + prevVal.intVal, newVal.intVal) != prevVal.intVal); +} + +inline void AtomicMul_float(volatile __global float *source, const float operand) +{ + union + { + unsigned int intVal; + float floatVal; + } newVal; + union + { + unsigned int intVal; + float floatVal; + } prevVal; + do + { + prevVal.floatVal = *source; + newVal.floatVal = prevVal.floatVal * operand; + } while(atomic_cmpxchg((volatile __global unsigned int *)source, + prevVal.intVal, newVal.intVal) != prevVal.intVal); +} + +inline void AtomicMax_float(volatile __global float *source, const float operand) +{ + union + { + unsigned int intVal; + float floatVal; + } newVal; + union + { + unsigned int intVal; + float floatVal; + } prevVal; + do + { + prevVal.floatVal = *source; + newVal.floatVal = fmax(prevVal.floatVal, operand); + } while(atomic_cmpxchg((volatile __global unsigned int *)source, + prevVal.intVal, newVal.intVal) != prevVal.intVal); +} + +inline void AtomicMin_float(volatile __global float *source, const float operand) +{ + union + { + unsigned int intVal; + float floatVal; + } newVal; + union + { + unsigned int intVal; + float floatVal; + } prevVal; + do + { + prevVal.floatVal = *source; + newVal.floatVal = fmin(prevVal.floatVal, operand); + } while(atomic_cmpxchg((volatile __global unsigned int *)source, + prevVal.intVal, newVal.intVal) != prevVal.intVal); +} + +#define SCATTER_REDUCTION_PREPROCESS(name0, ptr0, type0, len0, size0, ptr2) \ +__kernel void scatter_nd_update_reduction_preprocess_##name0( \ + __read_only image2d_t input_ref, \ + image2d_t temp_buf_float, \ + int length, int res) \ +{ \ + int gidx = get_global_id(0); \ + Image img1 = create_image_from_image2d(input_ref, size0); \ + Image img2 = create_image_from_image2d(temp_buf_float, 4); \ + __global float* tmp_ref_ptr = (__global float*)img2.ptr; \ + type0 src; \ + float4 tmpDst0, tmpDst1; \ + short zp = input_zp; \ + if(length > 0) \ + { \ + __global ptr0* input_ptr = (__global ptr0*)img1.ptr; \ + ptr0 tmpData = input_ptr[gidx]; \ + int loc2 = gidx * 8; \ + _viv_asm(COPY, src, tmpData, len0); \ + VXC_DP4x4(tmpDst0, src, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniConvert1stUint8SubZpToFp32_4x4); \ + VXC_DP4x4(tmpDst1, src, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniConvert2ndU8SubZpToFp32_4x4); \ + tmpDst0 *= input_scale; \ + tmpDst1 *= input_scale; \ + vstore4(tmpDst0, 0, tmp_ref_ptr + loc2); \ + vstore4(tmpDst1, 1, tmp_ref_ptr + loc2); \ + } \ + __global ptr2* input_ptr1 = (__global ptr2*)img1.ptr; \ + for(int i = gidx; i < res; i += get_global_size(0)) \ + { \ + ptr2 tmpData1 = input_ptr1[length + i]; \ + _viv_asm(COPY, src, tmpData1, 4); \ + VXC_DP4x4(tmpDst0, src, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniConvert1stUint8SubZpToFp32_4x4); \ + tmp_ref_ptr[length + i] = tmpDst0.x; \ + } \ +} +SCATTER_REDUCTION_PREPROCESS(U8, vxc_uchar8, vxc_uchar8, 8, 1, uchar) +SCATTER_REDUCTION_PREPROCESS(I8, vxc_char8, vxc_char8, 8, 1, char) +SCATTER_REDUCTION_PREPROCESS(I16, vxc_short8, vxc_short8, 16, 2, short) 
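The Atomic*_float helpers above emulate floating-point atomics with atomic_cmpxchg by treating the 32-bit value as an unsigned integer and retrying until no other work-item has modified it between the read and the swap; the reduction kernels further down use them to accumulate dequantized updates into the float temp buffer produced by the preprocess kernels. A minimal host-side C11 sketch of the same compare-and-swap idiom (the function name and the use of <stdatomic.h> are illustrative, not part of the patch):

#include <stdatomic.h>
#include <stdint.h>
#include <string.h>

/* Add `operand` to *source atomically by retrying a compare-and-swap on
 * the raw 32-bit pattern, mirroring AtomicAdd_float in the kernel above;
 * Mul/Max/Min follow by swapping the combining operation. */
static void atomic_add_float(_Atomic uint32_t *source, float operand)
{
    uint32_t prev_bits, new_bits;
    float prev_val, new_val;
    do {
        prev_bits = atomic_load(source);
        memcpy(&prev_val, &prev_bits, sizeof prev_val);  /* reinterpret bits as float */
        new_val = prev_val + operand;
        memcpy(&new_bits, &new_val, sizeof new_bits);
    } while (!atomic_compare_exchange_weak(source, &prev_bits, new_bits));
}

Because the comparison is on bit patterns, any concurrent update (or a spurious weak-CAS failure) simply forces another pass through the loop, which is the same behaviour the kernel's do/while relies on.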
+SCATTER_REDUCTION_PREPROCESS(F16, vxc_short8, vxc_half8, 16, 2, short) + +#define SCATTER_ND_REDUCTION_PROCESS_F16(name0, func) \ +__kernel void scatter_nd_update_reduction_##name0##_F16( \ + __read_only image2d_t index, \ + __read_only image2d_t update, \ + image2d_t temp_buf_float, \ + image2d_t link_buffer0, \ + int width, int area, int vol, int val4, \ + int val5, int val6, int val7, int coord_dim) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + Image img1 = create_image_from_image2d(index, 4); \ + Image img2 = create_image_from_image2d(update, 2); \ + Image img3 = create_image_from_image2d(temp_buf_float, 4); \ + __global int* index_ptr = (__global int*)img1.ptr; \ + __global short* update_ptr = (__global short*)img2.ptr; \ + __global float* output_ptr = (__global float*)img3.ptr; \ + half src; \ + \ + int4 indice = vload4(0, index_ptr + gidy * coord_dim); \ + int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \ + short tmpData = update_ptr[gidy * update_width + gidx]; \ + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; \ + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; \ + int loc = idx * output_width + gidx; \ + _viv_asm(COPY, src, tmpData, 4); \ + float data; \ + _viv_asm(CONV, data, src); \ + func(output_ptr + loc, data); \ +} +SCATTER_ND_REDUCTION_PROCESS_F16(Add, AtomicAdd_float) +SCATTER_ND_REDUCTION_PROCESS_F16(Mul, AtomicMul_float) +SCATTER_ND_REDUCTION_PROCESS_F16(Max, AtomicMax_float) +SCATTER_ND_REDUCTION_PROCESS_F16(Min, AtomicMin_float) + +#define SCATTER_ND_REDUCTION_PROCESS_BF16(name0, func) \ +__kernel void scatter_nd_update_reduction_##name0##_BF16( \ + __read_only image2d_t index, \ + __read_only image2d_t update, \ + image2d_t temp_buf_float, \ + image2d_t link_buffer0, \ + int width, int area, int vol, int val4, \ + int val5, int val6, int val7, int coord_dim) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + Image img1 = create_image_from_image2d(index, 4); \ + Image img2 = create_image_from_image2d(update, 2); \ + Image img3 = create_image_from_image2d(temp_buf_float, 4); \ + __global int* index_ptr = (__global int*)img1.ptr; \ + __global short* update_ptr = (__global short*)img2.ptr; \ + __global float* output_ptr = (__global float*)img3.ptr; \ + half src; \ + \ + int4 indice = vload4(0, index_ptr + gidy * coord_dim); \ + int4 indice1 = coord_dim < 5 ? 
(int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \ + short tmpData = update_ptr[gidy * update_width + gidx]; \ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \ + vxc_short8 src0, src1; \ + float data; \ + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; \ + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; \ + int loc = idx * output_width + gidx; \ + _viv_asm(COPY, src0, tmpData, 4); \ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \ + uniConvBF16toF32_Part0_2x8); \ + _viv_asm(COPY, data, src1, 4); \ + func(output_ptr + loc, data); \ +} +SCATTER_ND_REDUCTION_PROCESS_BF16(Add, AtomicAdd_float) +SCATTER_ND_REDUCTION_PROCESS_BF16(Mul, AtomicMul_float) +SCATTER_ND_REDUCTION_PROCESS_BF16(Max, AtomicMax_float) +SCATTER_ND_REDUCTION_PROCESS_BF16(Min, AtomicMin_float) + +#define SCATTER_ND_UPDATE_PROCESS_QINT(name0, src0_type, data_type, ptr_type, element_size, func) \ +__kernel void scatter_nd_update_reduction_##name0##_##src0_type( \ + __read_only image2d_t index, \ + __read_only image2d_t update, \ + image2d_t temp_buf_float, \ + image2d_t link_buffer0, \ + int width, int area, int vol, int val4, \ + int val5, int val6, int val7, int coord_dim) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + Image img1 = create_image_from_image2d(index, 4); \ + Image img2 = create_image_from_image2d(update, element_size); \ + Image img3 = create_image_from_image2d(temp_buf_float, 4); \ + __global int* index_ptr = (__global int*)img1.ptr; \ + __global ptr_type* update_ptr = (__global ptr_type*)img2.ptr; \ + __global float* output_ptr = (__global float*)img3.ptr; \ + data_type src; \ + \ + int4 indice = vload4(0, index_ptr + gidy * coord_dim); \ + int4 indice1 = coord_dim < 5 ? 
(int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \ + ptr_type tmpData = update_ptr[gidy * update_width + gidx]; \ + short zp = update_zp; \ + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; \ + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; \ + int loc = idx * output_width + gidx; \ + _viv_asm(COPY, src, tmpData, 4); \ + vxc_float4 data; \ + VXC_DP4x4(data, src, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniConvert1stUint8SubZpToFp32_4x4); \ + data.x *= update_scale; \ + func(output_ptr + loc, data.x); \ +} +SCATTER_ND_UPDATE_PROCESS_QINT(Add, U8, vxc_uchar8, uchar, 1, AtomicAdd_float) +SCATTER_ND_UPDATE_PROCESS_QINT(Mul, U8, vxc_uchar8, uchar, 1, AtomicMul_float) +SCATTER_ND_UPDATE_PROCESS_QINT(Max, U8, vxc_uchar8, uchar, 1, AtomicMax_float) +SCATTER_ND_UPDATE_PROCESS_QINT(Min, U8, vxc_uchar8, uchar, 1, AtomicMin_float) +SCATTER_ND_UPDATE_PROCESS_QINT(Add, I8, vxc_char8, char, 1, AtomicAdd_float) +SCATTER_ND_UPDATE_PROCESS_QINT(Mul, I8, vxc_char8, char, 1, AtomicMul_float) +SCATTER_ND_UPDATE_PROCESS_QINT(Max, I8, vxc_char8, char, 1, AtomicMax_float) +SCATTER_ND_UPDATE_PROCESS_QINT(Min, I8, vxc_char8, char, 1, AtomicMin_float) +SCATTER_ND_UPDATE_PROCESS_QINT(Add, I16, vxc_short8, short, 2, AtomicAdd_float) +SCATTER_ND_UPDATE_PROCESS_QINT(Mul, I16, vxc_short8, short, 2, AtomicMul_float) +SCATTER_ND_UPDATE_PROCESS_QINT(Max, I16, vxc_short8, short, 2, AtomicMax_float) +SCATTER_ND_UPDATE_PROCESS_QINT(Min, I16, vxc_short8, short, 2, AtomicMin_float) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_reduction_conv.vx b/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_reduction_conv.vx new file mode 100644 index 0000000..e027a2f --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_reduction_conv.vx @@ -0,0 +1,110 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform float output_scale; +_viv_uniform float output_zp; + +_viv_uniform VXC_512Bits uniExtractOddData_2x8; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; +_viv_uniform VXC_512Bits uniExtractHalf8_2x8; + +#define SCATTER_ND_UPDATE_CONV(src0_type, ptr_type, element_size, ptr_type1, conv_func) \ +__kernel void scatter_nd_update_reduction_conv_##src0_type( \ + __read_only image2d_t temp_buf_float, \ + __read_only image2d_t link_buf, \ + image2d_t output, \ + int length, int res) \ +{ \ + int gidx = get_global_id(0); \ + Image img1 = create_image_from_image2d(temp_buf_float, 4); \ + Image img2 = create_image_from_image2d(output, element_size); \ + __global float* input_ptr = (__global float*)img1.ptr; \ + if(length > 0) \ + { \ + __global ptr_type* output_ptr = (__global ptr_type*)img2.ptr; \ + float4 src0 = vload4(0, input_ptr + gidx * 8); \ + float4 src1 = vload4(1, input_ptr + gidx * 8); \ + int4 data0 = convert_int4_rte(src0 * output_scale + output_zp); \ + int4 data1 = convert_int4_rte(src1 * output_scale + output_zp); \ + ptr_type dst; \ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + output_ptr[gidx] = dst; \ + } \ + __global ptr_type1* output_ptr1 = (__global ptr_type1*)img2.ptr; \ + for(int i = gidx; i < res; i += get_global_size(0)) \ + { \ + float src = input_ptr[length + i]; \ + int data = convert_int_rte(src * output_scale + output_zp); \ + output_ptr1[length + i] = conv_func(data); \ + } \ +} +SCATTER_ND_UPDATE_CONV(U8, vxc_uchar8, 1, uchar, convert_uchar) +SCATTER_ND_UPDATE_CONV(I8, vxc_char8, 1, char, convert_char) +SCATTER_ND_UPDATE_CONV(I16, vxc_short8, 2, 
short, convert_short) + +__kernel void scatter_nd_update_reduction_conv_F16( + __read_only image2d_t temp_buf_float, + __read_only image2d_t link_buf, + image2d_t output, + int length, int res) +{ + int gidx = get_global_id(0); + Image img1 = create_image_from_image2d(temp_buf_float, 4); + Image img2 = create_image_from_image2d(output, 2); + __global float* input_ptr = (__global float*)img1.ptr; + if(length > 0) + { + __global vxc_short8* output_ptr = (__global vxc_short8*)img2.ptr; + float4 src0 = vload4(0, input_ptr + gidx * 8); + float4 src1 = vload4(1, input_ptr + gidx * 8); + half4 data0, data1; + _viv_asm(CONV, data0, src0); + _viv_asm(CONV, data1, src1); + vxc_half8 tmp; + vxc_short8 dst; + VXC_DP2x8(tmp, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtractHalf8_2x8); + _viv_asm(COPY, dst, tmp, 16); + output_ptr[gidx] = dst; + } + __global short* output_ptr1 = (__global short*)img2.ptr; + for(int i = gidx; i < res; i += get_global_size(0)) + { + float src = input_ptr[length + i]; + half data; + _viv_asm(CONV, data, src); + short dst; + _viv_asm(COPY, dst, data, 4); + output_ptr1[length + i] = dst; + } +} + +__kernel void scatter_nd_update_reduction_conv_BF16( + __read_only image2d_t temp_buf_float, + __read_only image2d_t link_buf, + image2d_t output, + int length, int res) +{ + int gidx = get_global_id(0); + Image img1 = create_image_from_image2d(temp_buf_float, 4); + Image img2 = create_image_from_image2d(output, 2); + __global float* input_ptr = (__global float*)img1.ptr; + if(length > 0) + { + __global vxc_short8* output_ptr = (__global vxc_short8*)img2.ptr; + float4 src0 = vload4(0, input_ptr + gidx * 8); + float4 src1 = vload4(1, input_ptr + gidx * 8); + vxc_short8 dst0, dst1, dst; + _viv_asm(COPY, dst0, src0, 16); + _viv_asm(COPY, dst1, src1, 16); + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + output_ptr[gidx] = dst; + } + __global short* output_ptr1 = (__global short*)img2.ptr; + for(int i = gidx; i < res; i += get_global_size(0)) + { + float src = input_ptr[length + i]; + vxc_short8 data; + _viv_asm(COPY, data, src, 4); + output_ptr1[length + i] = data.x; + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_header.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_header.vx index 8f47577..c9dda26 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_header.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_header.vx @@ -110,10 +110,10 @@ do\ #define VXC_VertMax3_Integer(dst, src0, src1, src2, info)\ do\ {\ - int startBin = (info & VXC_START_BIN_BITMASK) >> 12;\ - int endBin = (info & VXC_END_BIN_BITMASK) >> 8;\ - int sourceBin = (info & VXC_SOURCE_BIN_BITMASK) >> 4;\ - int mod1 = VXC_MODIFIER_CLAMP(startBin, endBin, sourceBin, 0);\ + constant int startBin = (info & VXC_START_BIN_BITMASK) >> 12;\ + constant int endBin = (info & VXC_END_BIN_BITMASK) >> 8;\ + constant int sourceBin = (info & VXC_SOURCE_BIN_BITMASK) >> 4;\ + constant int mod1 = VXC_MODIFIER_CLAMP(startBin, endBin, sourceBin, 0);\ typeof (dst) tmp;\ tmp = max(src0, src1);\ tmp = max(src2, tmp);\ @@ -138,10 +138,10 @@ do\ #define VXC_HorzMax3_Integer(dst, src0, info)\ do\ {\ - int startBin = (info & VXC_START_BIN_BITMASK) >> 12;\ - int endBin = (info & VXC_END_BIN_BITMASK) >> 8;\ - int sourceBin = (info & VXC_SOURCE_BIN_BITMASK) >> 4;\ - int clamp = (info & VXC_CLAMP_BITMASK) >> 22;\ + constant int startBin = (info & VXC_START_BIN_BITMASK) >> 12;\ + constant int endBin = (info & 
VXC_END_BIN_BITMASK) >> 8;\ + constant int sourceBin = (info & VXC_SOURCE_BIN_BITMASK) >> 4;\ + constant int clamp = (info & VXC_CLAMP_BITMASK) >> 22;\ int mod1 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Max, clamp);\ VXC_OP4(filter, dst, src0, src0, src0, mod1);\ } while (0) @@ -149,12 +149,12 @@ do\ #define VXC_HorzMax3_Half(dst, src0, info)\ do\ {\ - int startBin = (info & VXC_START_BIN_BITMASK) >> 12;\ - int endBin = (info & VXC_END_BIN_BITMASK) >> 8;\ - int sourceBin = (info & VXC_SOURCE_BIN_BITMASK) >> 4;\ - int clamp = (info & VXC_CLAMP_BITMASK) >> 22;\ - int mod1 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Max, clamp);\ - int mod2 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Min, clamp);\ + constant int startBin = (info & VXC_START_BIN_BITMASK) >> 12;\ + constant int endBin = (info & VXC_END_BIN_BITMASK) >> 8;\ + constant int sourceBin = (info & VXC_SOURCE_BIN_BITMASK) >> 4;\ + constant int clamp = (info & VXC_CLAMP_BITMASK) >> 22;\ + constant int mod1 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Max, clamp);\ + constant int mod2 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Min, clamp);\ vxc_short8 val0, minVal, maxVal;\ _viv_asm(COPY, val0, src0, 16);\ VXC_OP4(filter, maxVal, val0, val0, val0, mod1);\ @@ -166,24 +166,24 @@ do\ #define VXC_HorzMin3_Integer(dst, src0, info)\ do\ {\ - int startBin = (info & VXC_START_BIN_BITMASK) >> 12;\ - int endBin = (info & VXC_END_BIN_BITMASK) >> 8;\ - int sourceBin = (info & VXC_SOURCE_BIN_BITMASK) >> 4;\ - int clamp = (info & VXC_CLAMP_BITMASK) >> 22;\ - int mod1 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Min, clamp);\ + constant int startBin = (info & VXC_START_BIN_BITMASK) >> 12;\ + constant int endBin = (info & VXC_END_BIN_BITMASK) >> 8;\ + constant int sourceBin = (info & VXC_SOURCE_BIN_BITMASK) >> 4;\ + constant int clamp = (info & VXC_CLAMP_BITMASK) >> 22;\ + constant int mod1 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Min, clamp);\ VXC_OP4(filter, dst, src0, src0, src0, mod1);\ } while (0) #define VXC_HorzMin3_Half(dst, src0, info)\ do\ {\ - int startBin = (info & VXC_START_BIN_BITMASK) >> 12;\ - int endBin = (info & VXC_END_BIN_BITMASK) >> 8;\ - int sourceBin = (info & VXC_SOURCE_BIN_BITMASK) >> 4;\ - int clamp = (info & VXC_CLAMP_BITMASK) >> 22;\ - int mod1 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Max, clamp);\ - int mod2 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Min, clamp);\ - int mod3 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Median, clamp);\ + constant int startBin = (info & VXC_START_BIN_BITMASK) >> 12;\ + constant int endBin = (info & VXC_END_BIN_BITMASK) >> 8;\ + constant int sourceBin = (info & VXC_SOURCE_BIN_BITMASK) >> 4;\ + constant int clamp = (info & VXC_CLAMP_BITMASK) >> 22;\ + constant int mod1 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Max, clamp);\ + constant int mod2 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Min, clamp);\ + constant int mod3 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Median, clamp);\ vxc_short8 val0, minVal, maxVal, midVal;\ _viv_asm(COPY, val0, src0, 16);\ VXC_OP4(filter, maxVal, val0, val0, val0, mod1);\ diff --git a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c index dd10737..68763cc 100644 --- a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c +++ b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c @@ -4242,6 
+4242,557 @@ __kernel void conv1d_U8U8I32toU8_K1024_LARGE(\n\ \n\ "; /* end of conv1d_ovxlib_k1024_vx*/ +static const char crop_and_resize_bilinear_vx[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ +\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float inOutScale;\n\ +_viv_uniform float inOutTile;\n\ +_viv_uniform float width_scale;\n\ +_viv_uniform float height_scale;\n\ +_viv_uniform int image_width;\n\ +_viv_uniform int image_height;\n\ +_viv_uniform VXC_512Bits uniRightToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniLeftToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Bit_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractHalf8_2x8;\n\ +\n\ +#define CROP_AND_RESIZE_PART0 \\\n\ + int i = 0; \\\n\ + int bb = get_global_id(2); \\\n\ + int y = get_global_id(1); \\\n\ + int4 x = (int4)(get_global_id(0),get_global_id(0) + 1, get_global_id(0) + 2, get_global_id(0) + 3); \\\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + int2 coord_box_ind = (int2)(bb, 0); \\\n\ + int b = read_imagei(box_ind, coord_box_ind).x; \\\n\ + float4 xy, in_x; \\\n\ + float in_y; \\\n\ + float4 x_lerp, y_lerp; \\\n\ + int d = 0; \\\n\ + Image img_boxes = create_image_from_image2d(boxes, 2); \\\n\ + __global half* boxes_ptr = (__global half*)img_boxes.ptr; \\\n\ + xy = vload_half4(bb, boxes_ptr); \\\n\ + float _width_scale = convert_float(xy.w - xy.y) * width_scale; \\\n\ + float _height_scale = convert_float(xy.z - xy.x) * height_scale; \\\n\ + \\\n\ + if (_width_scale == 0) xy.y = 0.5 * (xy.y + xy.w); \\\n\ + if (_height_scale == 0) xy.x = 0.5 * (xy.x + xy.z); \\\n\ + in_y = xy.x * convert_float(image_height - 1) + convert_float(y) * _height_scale; \\\n\ + y_lerp.x = in_y - floor(in_y); \\\n\ + y_lerp.yzw = y_lerp.xxx;\n\ +\n\ +#define CROP_AND_RESIZE_PART1 \\\n\ + int4 coord = (int4)(0, in_y, d + b * ori_depth, 0); \\\n\ + int8 input_desc, output_desc; \\\n\ + \\\n\ + coord_out.z = d + coord_out.z * ori_depth; \\\n\ + \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr = (int)coord.z * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord.w, baseAddr); \\\n\ + \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr); \\\n\ + in_x.x = xy.y * convert_float(image_width - 1); \\\n\ + in_x.yzw = in_x.xxx; \\\n\ + in_x = in_x + convert_float4(x) * _width_scale; \\\n\ + x_lerp = in_x - floor(in_x); \\\n\ + coord.x = floor(in_x.x); \\\n\ + VXC_OP4(img_load_3d, src0, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, src1, input, coord.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x = floor(in_x.y); \\\n\ + VXC_OP4(img_load_3d, src0, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, src1, input, coord.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x = floor(in_x.z); \\\n\ + VXC_OP4(img_load_3d, src0, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, src1, input, coord.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x = floor(in_x.w); \\\n\ + VXC_OP4(img_load_3d, src0, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + 
VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, src1, input, coord.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +#define CROP_AND_RESIZE_BILINEAR_Quant8toQuant8(name,src_type,dst_type) \\\n\ +__kernel void crop_and_resize_bilinear_##name \\\n\ +( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t boxes, \\\n\ + __read_only image2d_t box_ind, \\\n\ + __write_only image2d_array_t output, \\\n\ + uint ori_depth, \\\n\ + uint ori_batchout \\\n\ +) \\\n\ +{ \\\n\ + CROP_AND_RESIZE_PART0; \\\n\ + for (d = 0; d < ori_depth; d++) \\\n\ + { \\\n\ + src_type src0, src1; \\\n\ + CROP_AND_RESIZE_PART1; \\\n\ + \\\n\ + float4 top, bottom, value; \\\n\ + float4 top_left4,top_right4,bottom_left4,bottom_right4; \\\n\ + dst_type data; \\\n\ + int4 tmpout; \\\n\ + \\\n\ + VXC_DP4x4(top_left4, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniLeftToFp32_4x4); \\\n\ + VXC_DP4x4(top_right4, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightToFp32_4x4); \\\n\ + VXC_DP4x4(bottom_left4, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniLeftToFp32_4x4); \\\n\ + VXC_DP4x4(bottom_right4, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightToFp32_4x4); \\\n\ + \\\n\ + top = top_left4 + (top_right4 - top_left4) * x_lerp; \\\n\ + bottom = bottom_left4 + (bottom_right4 - bottom_left4) * x_lerp; \\\n\ + value = top + (bottom - top) * y_lerp; \\\n\ + value = value * inOutScale + inOutTile; \\\n\ + _viv_asm(CONV, tmpout, value); \\\n\ + VXC_DP2x8(data, tmpout, tmpout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtract8Bit_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +CROP_AND_RESIZE_BILINEAR_Quant8toQuant8(U8toU8,vxc_uchar8, vxc_uchar4)\n\ +CROP_AND_RESIZE_BILINEAR_Quant8toQuant8(I8toI8,vxc_char8, vxc_char4)\n\ +\n\ +#define CROP_AND_RESIZE_BILINEAR_16Bitsto16Bits(name,src_type,dst_type,tmp_type) \\\n\ +__kernel void crop_and_resize_bilinear_##name \\\n\ +( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t boxes, \\\n\ + __read_only image2d_t box_ind, \\\n\ + __write_only image2d_array_t output, \\\n\ + uint ori_depth, \\\n\ + uint ori_batchout \\\n\ +) \\\n\ +{ \\\n\ + CROP_AND_RESIZE_PART0; \\\n\ + for (d = 0; d < ori_depth; d++) \\\n\ + { \\\n\ + vxc_short8 src0, src1; \\\n\ + src_type src0_temp, src1_temp; \\\n\ + CROP_AND_RESIZE_PART1; \\\n\ + \\\n\ + _viv_asm(COPY, src0_temp, src0, 16); \\\n\ + _viv_asm(COPY, src1_temp, src1, 16); \\\n\ + float4 top, bottom, value; \\\n\ + float4 top_left4,top_right4,bottom_left4,bottom_right4; \\\n\ + dst_type data; \\\n\ + vxc_short4 out; \\\n\ + tmp_type tmpout; \\\n\ + \\\n\ + VXC_DP4x4(top_left4, src0_temp, src0_temp, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniLeftToFp32_4x4); \\\n\ + VXC_DP4x4(top_right4, src0_temp, src0_temp, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightToFp32_4x4); \\\n\ + VXC_DP4x4(bottom_left4, src1_temp, src1_temp, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniLeftToFp32_4x4); \\\n\ + VXC_DP4x4(bottom_right4, src1_temp, src1_temp, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightToFp32_4x4); \\\n\ + \\\n\ + top = top_left4 + (top_right4 - top_left4) * x_lerp; \\\n\ + bottom = bottom_left4 + (bottom_right4 - bottom_left4) * x_lerp; \\\n\ + value = top + (bottom - top) * y_lerp; \\\n\ + value = value * inOutScale + inOutTile; \\\n\ + 
_viv_asm(CONV, tmpout, value); \\\n\ + VXC_DP2x8(data, tmpout, tmpout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8); \\\n\ + _viv_asm(COPY, out, data, 8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, out, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +CROP_AND_RESIZE_BILINEAR_16Bitsto16Bits(I16toI16, vxc_short8, vxc_short4, short4)\n\ +CROP_AND_RESIZE_BILINEAR_16Bitsto16Bits(I16toF16, vxc_short8, vxc_half4, half4)\n\ +CROP_AND_RESIZE_BILINEAR_16Bitsto16Bits(F16toF16, vxc_half8, vxc_half4, half4)\n\ +CROP_AND_RESIZE_BILINEAR_16Bitsto16Bits(F16toI16, vxc_half8, vxc_short4, short4)\n\ +\n\ +#define CROP_AND_RESIZE_BILINEAR_F16toQuant8(name,dst_type) \\\n\ +__kernel void crop_and_resize_bilinear_F16to##name \\\n\ +( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t boxes, \\\n\ + __read_only image2d_t box_ind, \\\n\ + __write_only image2d_array_t output, \\\n\ + uint ori_depth, \\\n\ + uint ori_batchout \\\n\ +) \\\n\ +{ \\\n\ + CROP_AND_RESIZE_PART0; \\\n\ + for (d = 0; d < ori_depth; d++) \\\n\ + { \\\n\ + vxc_short8 src0, src1; \\\n\ + vxc_half8 src0_temp, src1_temp; \\\n\ + CROP_AND_RESIZE_PART1; \\\n\ + \\\n\ + _viv_asm(COPY, src0_temp, src0, 16); \\\n\ + _viv_asm(COPY, src1_temp, src1, 16); \\\n\ + float4 top, bottom, value; \\\n\ + float4 top_left4,top_right4,bottom_left4,bottom_right4; \\\n\ + dst_type data; \\\n\ + int4 tmpout; \\\n\ + \\\n\ + VXC_DP4x4(top_left4, src0_temp, src0_temp, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniLeftToFp32_4x4); \\\n\ + VXC_DP4x4(top_right4, src0_temp, src0_temp, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightToFp32_4x4); \\\n\ + VXC_DP4x4(bottom_left4, src1_temp, src1_temp, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniLeftToFp32_4x4); \\\n\ + VXC_DP4x4(bottom_right4, src1_temp, src1_temp, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightToFp32_4x4); \\\n\ + \\\n\ + top = top_left4 + (top_right4 - top_left4) * x_lerp; \\\n\ + bottom = bottom_left4 + (bottom_right4 - bottom_left4) * x_lerp; \\\n\ + value = top + (bottom - top) * y_lerp; \\\n\ + value = value * inOutScale + inOutTile; \\\n\ + _viv_asm(CONV, tmpout, value); \\\n\ + VXC_DP2x8(data, tmpout, tmpout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtract8Bit_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, data, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +CROP_AND_RESIZE_BILINEAR_F16toQuant8(U8, vxc_uchar4)\n\ +CROP_AND_RESIZE_BILINEAR_F16toQuant8(I8, vxc_char4)\n\ +\n\ +#define CROP_AND_RESIZE_BILINEAR_Quant8toF16(name,src_type) \\\n\ +__kernel void crop_and_resize_bilinear_##name##toF16 \\\n\ +( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t boxes, \\\n\ + __read_only image2d_t box_ind, \\\n\ + __write_only image2d_array_t output, \\\n\ + uint ori_depth, \\\n\ + uint ori_batchout \\\n\ +) \\\n\ +{ \\\n\ + CROP_AND_RESIZE_PART0; \\\n\ + for (d = 0; d < ori_depth; d++) \\\n\ + { \\\n\ + src_type src0, src1; \\\n\ + CROP_AND_RESIZE_PART1; \\\n\ + \\\n\ + float4 top, bottom, value; \\\n\ + float4 top_left4,top_right4,bottom_left4,bottom_right4; \\\n\ + vxc_half4 data; \\\n\ + vxc_short4 out; \\\n\ + half4 tmpout; \\\n\ + \\\n\ + VXC_DP4x4(top_left4, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniLeftToFp32_4x4); \\\n\ + VXC_DP4x4(top_right4, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightToFp32_4x4); \\\n\ + VXC_DP4x4(bottom_left4, src1, src1, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniLeftToFp32_4x4); \\\n\ + VXC_DP4x4(bottom_right4, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightToFp32_4x4); \\\n\ + \\\n\ + top = top_left4 + (top_right4 - top_left4) * x_lerp; \\\n\ + bottom = bottom_left4 + (bottom_right4 - bottom_left4) * x_lerp; \\\n\ + value = top + (bottom - top) * y_lerp; \\\n\ + value = value * inOutScale + inOutTile; \\\n\ + _viv_asm(CONV, tmpout, value); \\\n\ + VXC_DP2x8(data, tmpout, tmpout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8); \\\n\ + _viv_asm(COPY, out, data, 8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, out, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +CROP_AND_RESIZE_BILINEAR_Quant8toF16(U8, vxc_uchar8)\n\ +CROP_AND_RESIZE_BILINEAR_Quant8toF16(I8, vxc_char8)\n\ +\n\ +\n\ +"; /* end of crop_and_resize_bilinear_vx*/ + +static const char crop_and_resize_nearest_neighbor_vx[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ +\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float inOutScale;\n\ +_viv_uniform float inOutTile;\n\ +_viv_uniform float width_scale;\n\ +_viv_uniform float height_scale;\n\ +_viv_uniform int image_width;\n\ +_viv_uniform int image_height;\n\ +_viv_uniform VXC_512Bits uniConvertFstToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertSecToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Bit_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractHalf8_2x8;\n\ +\n\ +#define IMG_LOAD(src_type) \\\n\ + src_type src; \\\n\ + int4 coord = (int4)(0, in_y, d + b * ori_depth, 0); \\\n\ + int8 input_desc, output_desc; \\\n\ + \\\n\ + coord_out.z = d + coord_out.z * ori_depth; \\\n\ + \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr = (int)coord.z * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord.w, baseAddr); \\\n\ + \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr); \\\n\ + \\\n\ + in_x = convert_int(round(xy.y * convert_float(image_width - 1) + convert_float(x) * _width_scale)); \\\n\ + coord.x = in_x; \\\n\ + VXC_OP4(img_load_3d, src, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + in_x = convert_int(round(xy.y * convert_float(image_width - 1) + convert_float(x + 1) * _width_scale)); \\\n\ + coord.x = in_x; \\\n\ + VXC_OP4(img_load_3d, src, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + in_x = convert_int(round(xy.y * convert_float(image_width - 1) + convert_float(x + 2) * _width_scale)); \\\n\ + coord.x = in_x; \\\n\ + VXC_OP4(img_load_3d, src, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ + in_x = convert_int(round(xy.y * convert_float(image_width - 1) + convert_float(x + 3) * _width_scale)); \\\n\ + coord.x = in_x; \\\n\ + VXC_OP4(img_load_3d, src, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + in_x = convert_int(round(xy.y * convert_float(image_width - 1) + convert_float(x + 4) * _width_scale)); \\\n\ + coord.x = in_x; \\\n\ + VXC_OP4(img_load_3d, src, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); \\\n\ + in_x = convert_int(round(xy.y * convert_float(image_width - 1) + convert_float(x + 5) * _width_scale)); \\\n\ + coord.x = in_x; \\\n\ + 
VXC_OP4(img_load_3d, src, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + in_x = convert_int(round(xy.y * convert_float(image_width - 1) + convert_float(x + 6) * _width_scale)); \\\n\ + coord.x = in_x; \\\n\ + VXC_OP4(img_load_3d, src, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); \\\n\ + in_x = convert_int(round(xy.y * convert_float(image_width - 1) + convert_float(x + 7) * _width_scale)); \\\n\ + coord.x = in_x; \\\n\ + VXC_OP4(img_load_3d, src, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +\n\ +#define CROP_AND_RESIZE_Quant8toQuant8(name, data_type) \\\n\ +__kernel void crop_and_resize_nearest_neighbor_##name \\\n\ +( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t boxes, \\\n\ + __read_only image2d_t box_ind, \\\n\ + __write_only image2d_array_t output, \\\n\ + uint ori_depth, \\\n\ + uint ori_batchout \\\n\ +) \\\n\ +{ \\\n\ + int bb = get_global_id(2); \\\n\ + int y = get_global_id(1); \\\n\ + int x = get_global_id(0); \\\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + int2 coord_box_ind = (int2)(bb, 0); \\\n\ + int b = read_imagei(box_ind, coord_box_ind).x; \\\n\ + float4 xy; \\\n\ + int in_x, in_y; \\\n\ + int d = 0; \\\n\ + Image img_boxes = create_image_from_image2d(boxes, 2); \\\n\ + __global half* boxes_ptr = (__global half*)img_boxes.ptr; \\\n\ + xy = vload_half4(bb, boxes_ptr); \\\n\ + float _width_scale = convert_float(xy.w - xy.y) * width_scale; \\\n\ + float _height_scale = convert_float(xy.z - xy.x) * height_scale; \\\n\ + \\\n\ + if (_width_scale == 0) xy.y = 0.5 * (xy.y + xy.w); \\\n\ + if (_height_scale == 0) xy.x = 0.5 * (xy.x + xy.z); \\\n\ + in_y = convert_int(round(xy.x * convert_float(image_height - 1) + convert_float(y) * _height_scale)); \\\n\ + \\\n\ + for (d = 0; d < ori_depth; d++) \\\n\ + { \\\n\ + data_type data; \\\n\ + int4 tmpout0, tmpout1; \\\n\ + float4 tmpdata0, tmpdata1; \\\n\ + IMG_LOAD(data_type); \\\n\ + \\\n\ + VXC_DP4x4(tmpdata0, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstToFp32_4x4); \\\n\ + VXC_DP4x4(tmpdata1, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecToFp32_4x4); \\\n\ + \\\n\ + tmpdata0 = tmpdata0 * inOutScale + inOutTile; \\\n\ + tmpdata1 = tmpdata1 * inOutScale + inOutTile; \\\n\ + _viv_asm(CONV, tmpout0, tmpdata0); \\\n\ + _viv_asm(CONV, tmpout1, tmpdata1); \\\n\ + \\\n\ + VXC_DP2x8(data, tmpout0, tmpout1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Bit_2x8); \\\n\ + \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \\\n\ + data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +CROP_AND_RESIZE_Quant8toQuant8(U8toU8, vxc_uchar8)\n\ +CROP_AND_RESIZE_Quant8toQuant8(I8toI8, vxc_char8)\n\ +\n\ +#define CROP_AND_RESIZE_Quant8toF16(name, src_type) \\\n\ +__kernel void crop_and_resize_nearest_neighbor_##name##toF16 \\\n\ +( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t boxes, \\\n\ + __read_only image2d_t box_ind, \\\n\ + __write_only image2d_array_t output, \\\n\ + uint ori_depth, \\\n\ + uint ori_batchout \\\n\ +) \\\n\ +{ \\\n\ + int bb = get_global_id(2); \\\n\ + int y = get_global_id(1); \\\n\ + int x = get_global_id(0); \\\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + int2 coord_box_ind = (int2)(bb, 0); \\\n\ + int b = 
read_imagei(box_ind, coord_box_ind).x; \\\n\ + float4 xy; \\\n\ + int in_x, in_y; \\\n\ + int d = 0; \\\n\ + Image img_boxes = create_image_from_image2d(boxes, 2); \\\n\ + __global half* boxes_ptr = (__global half*)img_boxes.ptr; \\\n\ + xy = vload_half4(bb, boxes_ptr); \\\n\ + float _width_scale = convert_float(xy.w - xy.y) * width_scale; \\\n\ + float _height_scale = convert_float(xy.z - xy.x) * height_scale; \\\n\ + \\\n\ + if (_width_scale == 0) xy.y = 0.5 * (xy.y + xy.w); \\\n\ + if (_height_scale == 0) xy.x = 0.5 * (xy.x + xy.z); \\\n\ + in_y = convert_int(round(xy.x * convert_float(image_height - 1) + convert_float(y) * _height_scale)); \\\n\ + \\\n\ + for (d = 0; d < ori_depth; d++) \\\n\ + { \\\n\ + vxc_short8 out; \\\n\ + vxc_half8 data; \\\n\ + half4 tmpout0, tmpout1; \\\n\ + float4 tmpdata0, tmpdata1; \\\n\ + IMG_LOAD(src_type); \\\n\ + \\\n\ + VXC_DP4x4(tmpdata0, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstToFp32_4x4); \\\n\ + VXC_DP4x4(tmpdata1, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecToFp32_4x4); \\\n\ + \\\n\ + tmpdata0 = tmpdata0 * inOutScale + inOutTile; \\\n\ + tmpdata1 = tmpdata1 * inOutScale + inOutTile; \\\n\ + _viv_asm(CONV, tmpout0, tmpdata0); \\\n\ + _viv_asm(CONV, tmpout1, tmpdata1); \\\n\ + \\\n\ + VXC_DP2x8(data, tmpout0, tmpout1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8); \\\n\ + _viv_asm(COPY, out, data, 16); \\\n\ + \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, out, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +CROP_AND_RESIZE_Quant8toF16(U8, vxc_uchar8)\n\ +CROP_AND_RESIZE_Quant8toF16(I8, vxc_char8)\n\ +\n\ +#define CROP_AND_RESIZE_NEAREST_F16toQuant8(name, dst_type) \\\n\ +__kernel void crop_and_resize_nearest_neighbor_F16to##name \\\n\ +( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t boxes, \\\n\ + __read_only image2d_t box_ind, \\\n\ + __write_only image2d_array_t output, \\\n\ + uint ori_depth, \\\n\ + uint ori_batchout \\\n\ +) \\\n\ +{ \\\n\ + int bb = get_global_id(2); \\\n\ + int y = get_global_id(1); \\\n\ + int x = get_global_id(0); \\\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + int2 coord_box_ind = (int2)(bb, 0); \\\n\ + int b = read_imagei(box_ind, coord_box_ind).x; \\\n\ + float4 xy; \\\n\ + int in_x, in_y; \\\n\ + int d = 0; \\\n\ + Image img_boxes = create_image_from_image2d(boxes, 2); \\\n\ + __global half* boxes_ptr = (__global half*)img_boxes.ptr; \\\n\ + xy = vload_half4(bb, boxes_ptr); \\\n\ + float _width_scale = convert_float(xy.w - xy.y) * width_scale; \\\n\ + float _height_scale = convert_float(xy.z - xy.x) * height_scale; \\\n\ + \\\n\ + if (_width_scale == 0) xy.y = 0.5 * (xy.y + xy.w); \\\n\ + if (_height_scale == 0) xy.x = 0.5 * (xy.x + xy.z); \\\n\ + in_y = convert_int(round(xy.x * convert_float(image_height - 1) + convert_float(y) * _height_scale)); \\\n\ + \\\n\ + for (d = 0; d < ori_depth; d++) \\\n\ + { \\\n\ + dst_type data; \\\n\ + int4 tmpout0, tmpout1; \\\n\ + float4 tmpdata0, tmpdata1; \\\n\ + IMG_LOAD(vxc_short8); \\\n\ + vxc_half8 src_half; \\\n\ + _viv_asm(COPY, src_half, src, 16); \\\n\ + \\\n\ + VXC_DP4x4(tmpdata0, src_half, src_half, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstToFp32_4x4); \\\n\ + VXC_DP4x4(tmpdata1, src_half, src_half, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecToFp32_4x4); \\\n\ + \\\n\ + tmpdata0 = tmpdata0 * inOutScale + inOutTile; \\\n\ + tmpdata1 = 
tmpdata1 * inOutScale + inOutTile; \\\n\ + _viv_asm(CONV, tmpout0, tmpdata0); \\\n\ + _viv_asm(CONV, tmpout1, tmpdata1); \\\n\ + \\\n\ + VXC_DP2x8(data, tmpout0, tmpout1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Bit_2x8); \\\n\ + \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +CROP_AND_RESIZE_NEAREST_F16toQuant8(U8, vxc_uchar8)\n\ +CROP_AND_RESIZE_NEAREST_F16toQuant8(I8, vxc_char8)\n\ +\n\ +#define CROP_AND_RESIZE_16Bitsto16Bits(name,src_type,dst_type,temp_type) \\\n\ +__kernel void crop_and_resize_nearest_neighbor_##name \\\n\ +( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t boxes, \\\n\ + __read_only image2d_t box_ind, \\\n\ + __write_only image2d_array_t output, \\\n\ + uint ori_depth, \\\n\ + uint ori_batchout \\\n\ +) \\\n\ +{ \\\n\ + int bb = get_global_id(2); \\\n\ + int y = get_global_id(1); \\\n\ + int x = get_global_id(0); \\\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + int2 coord_box_ind = (int2)(bb, 0); \\\n\ + int b = read_imagei(box_ind, coord_box_ind).x; \\\n\ + float4 xy; \\\n\ + int in_x, in_y; \\\n\ + int d = 0; \\\n\ + Image img_boxes = create_image_from_image2d(boxes, 2); \\\n\ + __global half* boxes_ptr = (__global half*)img_boxes.ptr; \\\n\ + xy = vload_half4(bb, boxes_ptr); \\\n\ + float _width_scale = convert_float(xy.w - xy.y) * width_scale; \\\n\ + float _height_scale = convert_float(xy.z - xy.x) * height_scale; \\\n\ + \\\n\ + if (_width_scale == 0) xy.y = 0.5 * (xy.y + xy.w); \\\n\ + if (_height_scale == 0) xy.x = 0.5 * (xy.x + xy.z); \\\n\ + in_y = convert_int(round(xy.x * convert_float(image_height - 1) + convert_float(y) * _height_scale)); \\\n\ + \\\n\ + for (d = 0; d < ori_depth; d++) \\\n\ + { \\\n\ + vxc_short8 out; \\\n\ + dst_type data; \\\n\ + temp_type tmpout0, tmpout1; \\\n\ + float4 tmpdata0, tmpdata1; \\\n\ + IMG_LOAD(vxc_short8); \\\n\ + src_type src_temp; \\\n\ + _viv_asm(COPY, src_temp, src, 16); \\\n\ + \\\n\ + VXC_DP4x4(tmpdata0, src_temp, src_temp, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFstToFp32_4x4); \\\n\ + VXC_DP4x4(tmpdata1, src_temp, src_temp, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertSecToFp32_4x4); \\\n\ + \\\n\ + _viv_asm(CONV, tmpout0, tmpdata0); \\\n\ + _viv_asm(CONV, tmpout1, tmpdata1); \\\n\ + \\\n\ + VXC_DP2x8(data, tmpout0, tmpout1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8); \\\n\ + _viv_asm(COPY, out, data, 16); \\\n\ + \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, out, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +CROP_AND_RESIZE_16Bitsto16Bits \\\n\ +(F16toF16, vxc_half8, vxc_half8, half4)\n\ +CROP_AND_RESIZE_16Bitsto16Bits \\\n\ +(F16toI16, vxc_half8, vxc_short8, short4)\n\ +CROP_AND_RESIZE_16Bitsto16Bits \\\n\ +(I16toF16, vxc_short8, vxc_half8, half4)\n\ +CROP_AND_RESIZE_16Bitsto16Bits \\\n\ +(I16toI16, vxc_short8, vxc_short8,short4)\n\ +"; /* end of crop_and_resize_nearest_neighbor_vx*/ + static const char cumsum_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;\n\ @@ -9243,6 +9794,11 @@ float4 eltwise_unary_acosh(float4 val)\n\ return acosh(val);\n\ }\n\ \n\ +float4 eltwise_unary_tan(float4 val)\n\ +{\n\ + return native_tan(val);\n\ +}\n\ +\n\ _viv_uniform float inputScale;\n\ _viv_uniform float inputTail;\n\ _viv_uniform float outputScale;\n\ @@ -9360,7 +9916,8 @@ ADD_ELTSISE_UNARY_2D(atan)\n\ 
ADD_ELTSISE_UNARY_2D(atanh)\n\ //ACOSH\n\ ADD_ELTSISE_UNARY_2D(acosh)\n\ -\n\ +//TAN\n\ +ADD_ELTSISE_UNARY_2D(tan)\n\ "; /* end of eltwise_unary_2d_1_vx*/ static const char eltwise_unary_3d_0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -9714,6 +10271,11 @@ float4 eltwise_unary_acosh(float4 val)\n\ return acosh(val);\n\ }\n\ \n\ +float4 eltwise_unary_tan(float4 val)\n\ +{\n\ + return native_tan(val);\n\ +}\n\ +\n\ _viv_uniform float inputScale;\n\ _viv_uniform float inputTail;\n\ _viv_uniform float outputScale;\n\ @@ -9830,6 +10392,8 @@ ADD_ELTSISE_UNARY_3D(atan)\n\ ADD_ELTSISE_UNARY_3D(atanh)\n\ //ACOSH\n\ ADD_ELTSISE_UNARY_3D(acosh)\n\ +//TAN\n\ +ADD_ELTSISE_UNARY_3D(tan)\n\ "; /* end of eltwise_unary_3d_1_vx*/ static const char erf_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -10262,7 +10826,8 @@ __kernel void gather_I8toI8(\n\ __write_only image2d_t output,\n\ int block_size,\n\ int block_num,\n\ - int axis_num\n\ + int axis_num,\n\ + int is_array\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ @@ -10287,7 +10852,8 @@ __kernel void gather_U8toU8(\n\ __write_only image2d_t output,\n\ int block_size,\n\ int block_num,\n\ - int axis_num\n\ + int axis_num,\n\ + int is_array\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ @@ -10312,7 +10878,8 @@ __kernel void gather_I16toI16(\n\ __write_only image2d_t output,\n\ int block_size,\n\ int block_num,\n\ - int axis_num\n\ + int axis_num,\n\ + int is_array\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ @@ -10338,7 +10905,8 @@ __kernel void gather_F16toF16(\n\ __write_only image2d_t output,\n\ int block_size,\n\ int block_num,\n\ - int axis_num\n\ + int axis_num,\n\ + int is_array\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ @@ -10363,7 +10931,8 @@ __kernel void gather_I8toI8_axis0(\n\ __write_only image2d_t output,\n\ int block_size,\n\ int block_num,\n\ - int axis_num\n\ + int axis_num,\n\ + int is_array\n\ )\n\ {\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ @@ -10390,7 +10959,8 @@ __kernel void gather_U8toU8_axis0(\n\ __write_only image2d_t output,\n\ int block_size,\n\ int block_num,\n\ - int axis_num\n\ + int axis_num,\n\ + int is_array\n\ )\n\ {\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ @@ -10417,7 +10987,8 @@ __kernel void gather_I16toI16_axis0(\n\ __write_only image2d_t output,\n\ int block_size,\n\ int block_num,\n\ - int axis_num\n\ + int axis_num,\n\ + int is_array\n\ )\n\ {\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ @@ -10444,7 +11015,8 @@ __kernel void gather_F16toF16_axis0(\n\ __write_only image2d_t output,\n\ int block_size,\n\ int block_num,\n\ - int axis_num\n\ + int axis_num,\n\ + int is_array\n\ )\n\ {\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ @@ -10479,7 +11051,8 @@ __kernel void gather_I8toI8_array(\n\ __write_only image2d_t output,\n\ int block_size,\n\ int block_num,\n\ - int axis_num\n\ + int axis_num,\n\ + int is_array\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ @@ -10493,13 +11066,29 @@ __kernel void gather_I8toI8_array(\n\ \n\ Image img1 = create_image_from_image2d(input0, 1);\n\ Image img2 = create_image_from_image2d(output, 1);\n\ - uchar* input_ptr = get_image_ptr_from_coord(img1, coord_in.zw);\n\ - __global vxc_char16* data_ptr = (__global vxc_char16*)input_ptr;\n\ - vxc_char16 src = data_ptr[0];\n\ +\n\ int2 coord = (int2)(gidx, gidz * indices_num + gidy);\n\ +\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord_in.zw);\n\ uchar* output_ptr = 
get_image_ptr_from_coord(img2, coord);\n\ - __global vxc_char16* dst_ptr = (__global vxc_char16*)output_ptr;\n\ - dst_ptr[0] = src;\n\ +\n\ + if (gidx == ((block_size >> 4) * 16))\n\ + {\n\ + __global char* data_ptr = (__global char*)input_ptr;\n\ + __global char* dst_ptr = (__global char*)output_ptr;\n\ + int i = 0;\n\ + for (i = 0; i < block_size - gidx; i ++)\n\ + {\n\ + dst_ptr[i] = data_ptr[i];\n\ + }\n\ + }\n\ + else\n\ + {\n\ + __global vxc_char16* data_ptr = (__global vxc_char16*)input_ptr;\n\ + vxc_char16 src = data_ptr[0];\n\ + __global vxc_char16* dst_ptr = (__global vxc_char16*)output_ptr;\n\ + dst_ptr[0] = src;\n\ + }\n\ }\n\ \n\ __kernel void gather_U8toU8_array(\n\ @@ -10508,7 +11097,8 @@ __kernel void gather_U8toU8_array(\n\ __write_only image2d_t output,\n\ int block_size,\n\ int block_num,\n\ - int axis_num\n\ + int axis_num,\n\ + int is_array\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ @@ -10522,13 +11112,29 @@ __kernel void gather_U8toU8_array(\n\ \n\ Image img1 = create_image_from_image2d(input0, 1);\n\ Image img2 = create_image_from_image2d(output, 1);\n\ - uchar* input_ptr = get_image_ptr_from_coord(img1, coord_in.zw);\n\ - __global vxc_uchar16* data_ptr = (__global vxc_uchar16*)input_ptr;\n\ - vxc_uchar16 src = data_ptr[0];\n\ +\n\ int2 coord = (int2)(gidx, gidz * indices_num + gidy);\n\ +\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord_in.zw);\n\ uchar* output_ptr = get_image_ptr_from_coord(img2, coord);\n\ - __global vxc_uchar16* dst_ptr = (__global vxc_uchar16*)output_ptr;\n\ - dst_ptr[0] = src;\n\ +\n\ + if (gidx == ((block_size >> 4) * 16))\n\ + {\n\ + __global uchar* data_ptr = (__global uchar*)input_ptr;\n\ + __global uchar* dst_ptr = (__global uchar*)output_ptr;\n\ + int i = 0;\n\ + for (i = 0; i < block_size - gidx; i ++)\n\ + {\n\ + dst_ptr[i] = data_ptr[i];\n\ + }\n\ + }\n\ + else\n\ + {\n\ + __global vxc_uchar16* data_ptr = (__global vxc_uchar16*)input_ptr;\n\ + vxc_uchar16 src = data_ptr[0];\n\ + __global vxc_uchar16* dst_ptr = (__global vxc_uchar16*)output_ptr;\n\ + dst_ptr[0] = src;\n\ + }\n\ }\n\ \n\ __kernel void gather_I16toI16_array(\n\ @@ -10537,7 +11143,8 @@ __kernel void gather_I16toI16_array(\n\ __write_only image2d_t output,\n\ int block_size,\n\ int block_num,\n\ - int axis_num\n\ + int axis_num,\n\ + int is_array\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ @@ -10552,13 +11159,29 @@ __kernel void gather_I16toI16_array(\n\ \n\ Image img1 = create_image_from_image2d(input0, 2);\n\ Image img2 = create_image_from_image2d(output, 2);\n\ - uchar* input_ptr = get_image_ptr_from_coord(img1, coord_in.zw);\n\ - __global vxc_short8* data_ptr = (__global vxc_short8*)input_ptr;\n\ - vxc_short8 src = data_ptr[0];\n\ +\n\ int2 coord = (int2)(gidx, gidz * indices_num + gidy);\n\ +\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord_in.zw);\n\ uchar* output_ptr = get_image_ptr_from_coord(img2, coord);\n\ - __global vxc_short8* dst_ptr = (__global vxc_short8*)output_ptr;\n\ - dst_ptr[0] = src;\n\ +\n\ + if (gidx == ((block_size >> 3) * 8))\n\ + {\n\ + __global short* data_ptr = (__global short*)input_ptr;\n\ + __global short* dst_ptr = (__global short*)output_ptr;\n\ + int i = 0;\n\ + for (i = 0; i < block_size - gidx; i ++)\n\ + {\n\ + dst_ptr[i] = data_ptr[i];\n\ + }\n\ + }\n\ + else\n\ + {\n\ + __global vxc_short8* data_ptr = (__global vxc_short8*)input_ptr;\n\ + vxc_short8 src = data_ptr[0];\n\ + __global vxc_short8* dst_ptr = (__global vxc_short8*)output_ptr;\n\ + dst_ptr[0] = src;\n\ + }\n\ }\n\ \n\ 
__kernel void gather_F16toF16_array(\n\ @@ -10567,7 +11190,8 @@ __kernel void gather_F16toF16_array(\n\ __write_only image2d_t output,\n\ int block_size,\n\ int block_num,\n\ - int axis_num\n\ + int axis_num,\n\ + int is_array\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ @@ -10582,13 +11206,29 @@ __kernel void gather_F16toF16_array(\n\ \n\ Image img1 = create_image_from_image2d(input0, 2);\n\ Image img2 = create_image_from_image2d(output, 2);\n\ - uchar* input_ptr = get_image_ptr_from_coord(img1, coord_in.zw);\n\ - __global vxc_short8* data_ptr = (__global vxc_short8*)input_ptr;\n\ - vxc_short8 src = data_ptr[0];\n\ +\n\ int2 coord = (int2)(gidx, gidz * indices_num + gidy);\n\ +\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord_in.zw);\n\ uchar* output_ptr = get_image_ptr_from_coord(img2, coord);\n\ - __global vxc_short8* dst_ptr = (__global vxc_short8*)output_ptr;\n\ - dst_ptr[0] = src;\n\ +\n\ + if (gidx == ((block_size >> 3) * 8))\n\ + {\n\ + __global short* data_ptr = (__global short*)input_ptr;\n\ + __global short* dst_ptr = (__global short*)output_ptr;\n\ + int i = 0;\n\ + for (i = 0; i < block_size - gidx; i ++)\n\ + {\n\ + dst_ptr[i] = data_ptr[i];\n\ + }\n\ + }\n\ + else\n\ + {\n\ + __global vxc_short8* data_ptr = (__global vxc_short8*)input_ptr;\n\ + vxc_short8 src = data_ptr[0];\n\ + __global vxc_short8* dst_ptr = (__global vxc_short8*)output_ptr;\n\ + dst_ptr[0] = src;\n\ + }\n\ }\n\ \n\ #define GATHER_AXIS0_ARRAY(src0_type_name, read_type, data_type, write_type) \\\n\ @@ -10598,7 +11238,8 @@ __kernel void gather_##src0_type_name##to##src0_type_name##_axis0_array( \\\n\ __write_only image2d_t output, \\\n\ int block_size, \\\n\ int block_num, \\\n\ - int axis_num \\\n\ + int axis_num, \\\n\ + int is_array \\\n\ ) \\\n\ { \\\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \\\n\ @@ -10664,7 +11305,8 @@ __kernel void gather_batch_I8toI8(\n\ __write_only image2d_array_t output,\n\ int block_size,\n\ int block_num,\n\ - int axis_num\n\ + int axis_num,\n\ + int is_array\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ @@ -10695,7 +11337,8 @@ __kernel void gather_batch_U8toU8(\n\ __write_only image2d_array_t output,\n\ int block_size,\n\ int block_num,\n\ - int axis_num\n\ + int axis_num,\n\ + int is_array\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ @@ -10726,7 +11369,8 @@ __kernel void gather_batch_I16toI16(\n\ __write_only image2d_array_t output,\n\ int block_size,\n\ int block_num,\n\ - int axis_num\n\ + int axis_num,\n\ + int is_array\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ @@ -10757,7 +11401,8 @@ __kernel void gather_batch_F16toF16(\n\ __write_only image2d_array_t output,\n\ int block_size,\n\ int block_num,\n\ - int axis_num\n\ + int axis_num,\n\ + int is_array\n\ )\n\ {\n\ int gidx = get_global_id(0); // block_size\n\ @@ -10788,7 +11433,8 @@ __kernel void gather_batch_I8toI8_axis0(\n\ __write_only image2d_array_t output,\n\ int block_size,\n\ int block_num,\n\ - int axis_num\n\ + int axis_num,\n\ + int is_array\n\ )\n\ {\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ @@ -10817,7 +11463,8 @@ __kernel void gather_batch_U8toU8_axis0(\n\ __write_only image2d_array_t output,\n\ int block_size,\n\ int block_num,\n\ - int axis_num\n\ + int axis_num,\n\ + int is_array\n\ )\n\ {\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ @@ -10846,7 +11493,8 @@ __kernel void gather_batch_I16toI16_axis0(\n\ __write_only image2d_array_t 
output,\n\ int block_size,\n\ int block_num,\n\ - int axis_num\n\ + int axis_num,\n\ + int is_array\n\ )\n\ {\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ @@ -10875,7 +11523,8 @@ __kernel void gather_batch_F16toF16_axis0(\n\ __write_only image2d_array_t output,\n\ int block_size,\n\ int block_num,\n\ - int axis_num\n\ + int axis_num,\n\ + int is_array\n\ )\n\ {\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ @@ -11215,7 +11864,8 @@ __kernel void gather_##src0_type_name##toF16( \\\n\ __write_only image2d_t output, \\\n\ int block_size, \\\n\ int block_num, \\\n\ - int axis_num \\\n\ + int axis_num, \\\n\ + int is_array \\\n\ ) \\\n\ { \\\n\ int gidx = get_global_id(0); \\\n\ @@ -11252,7 +11902,8 @@ __kernel void gather_F16to##src1_type_name( \\\n\ __write_only image2d_t output, \\\n\ int block_size, \\\n\ int block_num, \\\n\ - int axis_num \\\n\ + int axis_num, \\\n\ + int is_array \\\n\ ) \\\n\ { \\\n\ int gidx = get_global_id(0); \\\n\ @@ -11285,7 +11936,8 @@ __kernel void gather_I16toF16(\n\ __write_only image2d_t output,\n\ int block_size,\n\ int block_num,\n\ - int axis_num\n\ + int axis_num,\n\ + int is_array\n\ )\n\ {\n\ int gidx = get_global_id(0);\n\ @@ -11320,7 +11972,8 @@ __kernel void gather_##src0_type_name##toF16_axis0( \\\n\ __write_only image2d_t output, \\\n\ int block_size, \\\n\ int block_num, \\\n\ - int axis_num \\\n\ + int axis_num, \\\n\ + int is_array \\\n\ ) \\\n\ { \\\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ @@ -11352,7 +12005,8 @@ __kernel void gather_F16to##src1_type_name##_axis0( \\\n\ __write_only image2d_t output, \\\n\ int block_size, \\\n\ int block_num, \\\n\ - int axis_num \\\n\ + int axis_num, \\\n\ + int is_array \\\n\ ) \\\n\ { \\\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ @@ -11384,7 +12038,8 @@ __kernel void gather_I16toF16_axis0(\n\ __write_only image2d_t output,\n\ int block_size,\n\ int block_num,\n\ - int axis_num\n\ + int axis_num,\n\ + int is_array\n\ )\n\ {\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ @@ -11429,7 +12084,8 @@ __kernel void gather_batch_##src0_type_name##toF16( \\\n\ __write_only image2d_t output, \\\n\ int block_size, \\\n\ int block_num, \\\n\ - int axis_num \\\n\ + int axis_num, \\\n\ + int is_array \\\n\ ) \\\n\ { \\\n\ int gidx = get_global_id(0); \\\n\ @@ -11476,7 +12132,8 @@ __kernel void gather_batch_F16to##src1_type_name( \\\n\ __write_only image2d_t output, \\\n\ int block_size, \\\n\ int block_num, \\\n\ - int axis_num \\\n\ + int axis_num, \\\n\ + int is_array \\\n\ ) \\\n\ { \\\n\ int gidx = get_global_id(0); \\\n\ @@ -11517,7 +12174,8 @@ __kernel void gather_batch_I16toF16(\n\ __write_only image2d_t output,\n\ int block_size,\n\ int block_num,\n\ - int axis_num\n\ + int axis_num,\n\ + int is_array\n\ )\n\ {\n\ int gidx = get_global_id(0);\n\ @@ -11556,7 +12214,8 @@ __kernel void gather_batch_##src0_type_name##toF16_axis0( \\\n\ __write_only image2d_t output, \\\n\ int block_size, \\\n\ int block_num, \\\n\ - int axis_num \\\n\ + int axis_num, \\\n\ + int is_array \\\n\ ) \\\n\ { \\\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ @@ -11591,7 +12250,8 @@ __kernel void gather_batch_F16to##src1_type_name##_axis0( \\\n\ __write_only image2d_t output, \\\n\ int block_size, \\\n\ int block_num, \\\n\ - int axis_num \\\n\ + int axis_num, \\\n\ + int is_array \\\n\ ) \\\n\ { \\\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), 
get_global_id(2), 0); \\\n\ @@ -11626,7 +12286,8 @@ __kernel void gather_batch_I16toF16_axis0(\n\ __write_only image2d_t output,\n\ int block_size,\n\ int block_num,\n\ - int axis_num\n\ + int axis_num,\n\ + int is_array\n\ )\n\ {\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ @@ -13891,6 +14552,9 @@ static const char grucell_activation_z_h_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ _viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ _viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ \n\ +_viv_uniform float output_scale1;\n\ +_viv_uniform float output_zp1;\n\ +\n\ float4 sigmoid_func(float4 x)\n\ {\n\ x *= -logE;\n\ @@ -14002,13 +14666,15 @@ __kernel void grucell_activation_z_h_##name0##_F16to##name1##_##act_name( \\\n\ VXC_DP4x4(h_tm, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \\\n\ h_tm = h_tm * hstate_in_scale + hstate_in_tail; \\\n\ float4 result = (1 - z) * h + z * h_tm; \\\n\ - result = result * output_scale + output_zp; \\\n\ - int4 dst0; \\\n\ - _viv_asm(CONV_RTE, dst0, result); \\\n\ + float4 out0 = result * output_scale + output_zp; \\\n\ + float4 out1 = result * output_scale1 + output_zp1; \\\n\ + int4 dst0, dst1; \\\n\ + _viv_asm(CONV_RTE, dst0, out0); \\\n\ + _viv_asm(CONV_RTE, dst1, out1); \\\n\ dst_type dst; \\\n\ - VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ - VXC_WriteImage(hstate_out, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(hstate_out, coord_in, dst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ GRUCELL_QNT_F16TO_QNT(U8, U8, SIGMOID, sigmoid_func, vxc_uchar8, vxc_uchar8)\n\ GRUCELL_QNT_F16TO_QNT(I8, I8, SIGMOID, sigmoid_func, vxc_char8, vxc_char8)\n\ @@ -15371,6 +16037,11 @@ float4 tanh_func(float4 x)\n\ x = 1.0f / x;\n\ return 2 * x - 1;\n\ }\n\ +float4 relu_func(float4 x)\n\ +{\n\ + x = x > 0 ? 
x : 0;\n\ + return x;\n\ +}\n\ \n\ _viv_uniform VXC_512Bits uniF16PlusF16_0_4x4;\n\ _viv_uniform VXC_512Bits uniF16PlusF16_1_4x4;\n\ @@ -15434,6 +16105,8 @@ __kernel void grucell_reset_after_activation_F16_F16toF16_##act_name##_##rec_act }\n\ GRUCELL_F16_F16TOF16(TANH, tanh_func, SIGMOID, sigmoid_func)\n\ GRUCELL_F16_F16TOF16(SIGMOID, sigmoid_func, SIGMOID, sigmoid_func)\n\ +GRUCELL_F16_F16TOF16(RELU, relu_func, SIGMOID, sigmoid_func)\n\ +\n\ \n\ _viv_uniform float hstate_in_scale;\n\ _viv_uniform float hstate_in_tail;\n\ @@ -15499,6 +16172,10 @@ GRUCELL_QNT_F16TO_QNT(I16_F16toI16_TANH_SIGMOID, tanh_func, sigmoid_func, GRUCELL_QNT_F16TO_QNT(U8_F16toU8_SIGMOID_SIGMOID, sigmoid_func, sigmoid_func, vxc_uchar8, vxc_uchar8)\n\ GRUCELL_QNT_F16TO_QNT(I8_F16toI8_SIGMOID_SIGMOID, sigmoid_func, sigmoid_func, vxc_char8, vxc_char8)\n\ GRUCELL_QNT_F16TO_QNT(I16_F16toI16_SIGMOID_SIGMOID, sigmoid_func, sigmoid_func, vxc_short8, vxc_short8)\n\ +GRUCELL_QNT_F16TO_QNT(U8_F16toU8_RELU_SIGMOID, relu_func, sigmoid_func, vxc_uchar8, vxc_uchar8)\n\ +GRUCELL_QNT_F16TO_QNT(I8_F16toI8_RELU_SIGMOID, relu_func, sigmoid_func, vxc_char8, vxc_char8)\n\ +GRUCELL_QNT_F16TO_QNT(I16_F16toI16_RELU_SIGMOID, relu_func, sigmoid_func, vxc_short8, vxc_short8)\n\ +\n\ \n\ #define GRUCELL_BF16(act_name, act_func, rec_act_name, rec_act_func) \\\n\ __kernel void grucell_reset_after_activation_BF16_BF16toBF16_##act_name##_##rec_act_name( \\\n\ @@ -15561,6 +16238,7 @@ __kernel void grucell_reset_after_activation_BF16_BF16toBF16_##act_name##_##rec_ }\n\ GRUCELL_BF16(TANH, tanh_func, SIGMOID, sigmoid_func)\n\ GRUCELL_BF16(SIGMOID, sigmoid_func, SIGMOID, sigmoid_func)\n\ +GRUCELL_BF16(RELU, relu_func, SIGMOID, sigmoid_func)\n\ "; /* end of grucell_reset_after_activation_vx*/ static const char hswish_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -18930,6 +19608,1399 @@ __kernel void layer_norm_BF16F32toBF16_2D(\n\ }\n\ }"; /* end of layer_normalization_3_vx*/ +static const char layer_normalization_axis01_0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniDataToFP32_0_4x4;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_1_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform int height;\n\ +_viv_uniform uint group_num;\n\ +_viv_uniform float output_zp;\n\ +_viv_uniform float output_scale;\n\ +_viv_uniform float inv_multiplier;\n\ +\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_U8_F16toF16(\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ + int2 coord_sum = (int2)(0, gidz);\n\ + int4 coord_para = coord;\n\ + vxc_uchar16 src0;\n\ + vxc_short8 src1, outval;\n\ + vxc_half8 scale_h, dst;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + vxc_float4 mean_vari = (vxc_float4)(0);\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += read_imagef(meanVari, coord_sum);\n\ + coord_sum.x += 4;\n\ + }\n\ + mean_vari *= inv_multiplier;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ + coord_para.z = 0;\n\ + coord_para.w = 0;\n\ + int4 coord_bias = coord_para;\n\ +\n\ + int8 input_desc, scale_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + 
_viv_asm(MOV, coord_in.z, baseAddr_a);\n\ +\n\ + _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));\n\ + int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;\n\ + _viv_asm(MOV, coord_para.w, baseAddr_c);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr);\n\ +\n\ + vxc_float4 tmpData0, tmpData1, norm;\n\ + half4 tmpVal0, tmpVal1;\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord_in, 0,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.y ++;\n\ + coord_para.y = coord.y; coord_bias.y = coord.y;\n\ + VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, 0,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + coord_bias.x += 4;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + coord_bias.x = coord.x;\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_0_4x4);\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_1_4x4);\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_0_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_1_4x4);\n\ + tmpData0 = tmpData0 - mean_vari.s0;\n\ + tmpData1 = tmpData1 - mean_vari.s0;\n\ +\n\ + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ +\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniExtract8Data_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_U8_F32toF16(\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ + int2 coord_sum = (int2)(0, gidz);\n\ + int4 coord_para = coord;\n\ + vxc_uchar16 src0;\n\ + vxc_short8 outval;\n\ + vxc_half8 scale_h, dst;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + vxc_float4 mean_vari = (vxc_float4)(0);\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += read_imagef(meanVari, coord_sum);\n\ + coord_sum.x += 4;\n\ + }\n\ + mean_vari *= inv_multiplier;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ + coord_para.z = 0;\n\ + coord_para.w = 0;\n\ + int4 coord_bias = coord_para;\n\ +\n\ + int8 input_desc, scale_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr);\n\ +\n\ + vxc_float4 tmpData0, tmpData1, norm;\n\ + half4 tmpVal0, tmpVal1;\n\ +\n\ + for(coord.y = 0; coord.y < height; 
coord.y++)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord_in, 0,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.y ++;\n\ + coord_bias.y = coord.y;\n\ +\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + scale_f0 = read_imagef(scale, coord_bias);\n\ + coord_bias.x += 4;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + scale_f1 = read_imagef(scale, coord_bias);\n\ + coord_bias.x = coord.x;\n\ +\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_0_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_1_4x4);\n\ + tmpData0 = tmpData0 - mean_vari.s0;\n\ + tmpData1 = tmpData1 - mean_vari.s0;\n\ +\n\ + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ +\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniExtract8Data_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_U8_F16toU8(\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ + int2 coord_sum = (int2)(0, gidz);\n\ + int4 coord_para = coord;\n\ + vxc_uchar16 src0, outval;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + vxc_float4 mean_vari = (vxc_float4)(0);\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += read_imagef(meanVari, coord_sum);\n\ + coord_sum.x += 4;\n\ + }\n\ + mean_vari *= inv_multiplier;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ + coord_para.z = 0;\n\ + coord_para.w = 0;\n\ + int4 coord_bias = coord_para;\n\ +\n\ + int8 input_desc, scale_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ +\n\ + _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));\n\ + int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;\n\ + _viv_asm(MOV, coord_para.w, baseAddr_c);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr);\n\ +\n\ + vxc_float4 tmpData0, tmpData1, norm;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord_in, 0,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.y ++;\n\ + coord_para.y = coord.y; coord_bias.y = coord.y;\n\ + VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, 0,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + coord_bias.x += 4;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + coord_bias.x = coord.x;\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_0_4x4);\n\ + VXC_DP4x4(scale_f1, scale_h, 
scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_1_4x4);\n\ +\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_0_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_1_4x4);\n\ + tmpData0 = tmpData0 - mean_vari.s0;\n\ + tmpData1 = tmpData1 - mean_vari.s0;\n\ +\n\ + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0;\n\ + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp);\n\ + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1;\n\ + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp);\n\ +\n\ + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniExtract8Data_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_U8_F32toU8(\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ + int2 coord_sum = (int2)(0, gidz);\n\ + int4 coord_para = coord;\n\ + vxc_uchar16 src0 , outval;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + vxc_float4 mean_vari = (vxc_float4)(0);\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += read_imagef(meanVari, coord_sum);\n\ + coord_sum.x += 4;\n\ + }\n\ + mean_vari *= inv_multiplier;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ + coord_para.z = 0;\n\ + coord_para.w = 0;\n\ + int4 coord_bias = coord_para;\n\ +\n\ + int8 input_desc, scale_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr);\n\ +\n\ + vxc_float4 tmpData0, tmpData1, norm;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord_in, 0,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.y ++;\n\ + coord_bias.y = coord.y;\n\ +\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + scale_f0 = read_imagef(scale, coord_bias);\n\ + coord_bias.x += 4;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + scale_f1 = read_imagef(scale, coord_bias);\n\ + coord_bias.x = coord.x;\n\ +\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_0_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_1_4x4);\n\ + tmpData0 = tmpData0 - mean_vari.s0;\n\ + tmpData1 = tmpData1 - mean_vari.s0;\n\ +\n\ + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0;\n\ + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp);\n\ + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1;\n\ + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp);\n\ +\n\ + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniExtract8Data_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \\\n\ + VXC_MODIFIER(0, 7, 
0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}"; /* end of layer_normalization_axis01_0_vx*/ + +static const char layer_normalization_axis01_1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniDataToFP32_0_4x4;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_1_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform int height;\n\ +_viv_uniform uint group_num;\n\ +_viv_uniform float output_zp;\n\ +_viv_uniform float output_scale;\n\ +_viv_uniform float inv_multiplier;\n\ +\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_I8_F16toF16(\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ + int2 coord_sum = (int2)(0, gidz);\n\ + int4 coord_para = coord;\n\ + vxc_char16 src0;\n\ + vxc_short8 src1, outval;\n\ + vxc_half8 scale_h, dst;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + vxc_float4 mean_vari = (vxc_float4)(0);\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += read_imagef(meanVari, coord_sum);\n\ + coord_sum.x += 4;\n\ + }\n\ + mean_vari *= inv_multiplier;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ + coord_para.z = 0;\n\ + coord_para.w = 0;\n\ + int4 coord_bias = coord_para;\n\ +\n\ + int8 input_desc, scale_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ +\n\ + _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));\n\ + int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;\n\ + _viv_asm(MOV, coord_para.w, baseAddr_c);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr);\n\ +\n\ + vxc_float4 tmpData0, tmpData1, norm;\n\ + half4 tmpVal0, tmpVal1;\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord_in, 0,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.y ++;\n\ + coord_para.y = coord.y; coord_bias.y = coord.y;\n\ + VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, 0,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + coord_bias.x += 4;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + coord_bias.x = coord.x;\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_0_4x4);\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_1_4x4);\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_0_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_1_4x4);\n\ + tmpData0 = tmpData0 - mean_vari.s0;\n\ + tmpData1 = tmpData1 - mean_vari.s0;\n\ +\n\ + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ +\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + 
uniExtract8Data_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_I8_F32toF16(\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ + int2 coord_sum = (int2)(0, gidz);\n\ + int4 coord_para = coord;\n\ + vxc_char16 src0;\n\ + vxc_short8 outval;\n\ + vxc_half8 scale_h, dst;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + vxc_float4 mean_vari = (vxc_float4)(0);\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += read_imagef(meanVari, coord_sum);\n\ + coord_sum.x += 4;\n\ + }\n\ + mean_vari *= inv_multiplier;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ + coord_para.z = 0;\n\ + coord_para.w = 0;\n\ + int4 coord_bias = coord_para;\n\ +\n\ + int8 input_desc, scale_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr);\n\ +\n\ + vxc_float4 tmpData0, tmpData1, norm;\n\ + half4 tmpVal0, tmpVal1;\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord_in, 0,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.y ++;\n\ + coord_bias.y = coord.y;\n\ +\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + scale_f0 = read_imagef(scale, coord_bias);\n\ + coord_bias.x += 4;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + scale_f1 = read_imagef(scale, coord_bias);\n\ + coord_bias.x = coord.x;\n\ +\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_0_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_1_4x4);\n\ + tmpData0 = tmpData0 - mean_vari.s0;\n\ + tmpData1 = tmpData1 - mean_vari.s0;\n\ +\n\ + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ +\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniExtract8Data_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_I8_F16toU8(\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ + int2 coord_sum = (int2)(0, gidz);\n\ + int4 coord_para = coord;\n\ + vxc_char16 src0;\n\ + vxc_uchar16 outval;\n\ + vxc_short8 src1;\n\ + vxc_half8 scale_h;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + vxc_float4 mean_vari = 
(vxc_float4)(0);\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += read_imagef(meanVari, coord_sum);\n\ + coord_sum.x += 4;\n\ + }\n\ + mean_vari *= inv_multiplier;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ + coord_para.z = 0;\n\ + coord_para.w = 0;\n\ + int4 coord_bias = coord_para;\n\ +\n\ + int8 input_desc, scale_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ +\n\ + _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc));\n\ + int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0;\n\ + _viv_asm(MOV, coord_para.w, baseAddr_c);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr);\n\ +\n\ + vxc_float4 tmpData0, tmpData1, norm;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord_in, 0,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.y ++;\n\ + coord_para.y = coord.y; coord_bias.y = coord.y;\n\ + VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, 0,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + coord_bias.x += 4;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + coord_bias.x = coord.x;\n\ +\n\ + _viv_asm(COPY, scale_h, src1, 16);\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_0_4x4);\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_1_4x4);\n\ +\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_0_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_1_4x4);\n\ + tmpData0 = tmpData0 - mean_vari.s0;\n\ + tmpData1 = tmpData1 - mean_vari.s0;\n\ +\n\ + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0;\n\ + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp);\n\ + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1;\n\ + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp);\n\ +\n\ + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniExtract8Data_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_I8_F32toU8(\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ + int2 coord_sum = (int2)(0, gidz);\n\ + int4 coord_para = coord;\n\ + vxc_char16 src0;\n\ + vxc_uchar16 outval;\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1;\n\ + vxc_float4 mean_vari = (vxc_float4)(0);\n\ +\n\ + for(int i = 0; i < group_num; i++)\n\ + {\n\ + mean_vari += read_imagef(meanVari, coord_sum);\n\ + coord_sum.x += 4;\n\ + }\n\ + mean_vari *= inv_multiplier;\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ + mean_vari.s1 = rsqrt(mean_vari.s1);\n\ + coord_para.z = 0;\n\ + coord_para.w = 0;\n\ + 
int4 coord_bias = coord_para;\n\ +\n\ + int8 input_desc, scale_desc, output_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.z, baseAddr_a);\n\ +\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord.z, baseAddr);\n\ +\n\ + vxc_float4 tmpData0, tmpData1, norm;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord_in, 0,\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.y ++;\n\ + coord_bias.y = coord.y;\n\ +\n\ + bias_f0 = read_imagef(bias, coord_bias);\n\ + scale_f0 = read_imagef(scale, coord_bias);\n\ + coord_bias.x += 4;\n\ + bias_f1 = read_imagef(bias, coord_bias);\n\ + scale_f1 = read_imagef(scale, coord_bias);\n\ + coord_bias.x = coord.x;\n\ +\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_0_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_1_4x4);\n\ + tmpData0 = tmpData0 - mean_vari.s0;\n\ + tmpData1 = tmpData1 - mean_vari.s0;\n\ +\n\ + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0;\n\ + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp);\n\ + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1;\n\ + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp);\n\ +\n\ + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniExtract8Data_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + }\n\ +}"; /* end of layer_normalization_axis01_1_vx*/ + +static const char layer_normalization_axis01_2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniDataToFP32_0_4x4;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_1_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform int height;\n\ +_viv_uniform uint group_num;\n\ +_viv_uniform float output_zp;\n\ +_viv_uniform float output_scale;\n\ +_viv_uniform float inv_multiplier;\n\ +\n\ +#define LAYER_NORM_AXIS01_F16_F16to16Bits(name,temp_type,dst_type,conv_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_F16_F16to##name( \\\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, \\\n\ + image2d_array_t output, float eps) \\\n\ +{ \\\n\ + int gidz = get_global_id(1); \\\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \\\n\ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); \\\n\ + int2 coord_sum = (int2)(0, gidz); \\\n\ + int4 coord_para = coord; \\\n\ + vxc_short8 src0; \\\n\ + vxc_short8 src1; \\\n\ + vxc_half8 scale_h, in_h; \\\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; \\\n\ + vxc_float4 mean_vari = (vxc_float4)(0); \\\n\ + \\\n\ + for(int i = 0; i < group_num; i++) \\\n\ + { \\\n\ + mean_vari += read_imagef(meanVari, coord_sum); \\\n\ + coord_sum.x += 4; \\\n\ + } \\\n\ + mean_vari *= inv_multiplier; \\\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \\\n\ + mean_vari.s1 = rsqrt(mean_vari.s1); \\\n\ + \\\n\ + coord_para.z = 0; \\\n\ + coord_para.w = 0; \\\n\ + int4 coord_bias = coord_para; \\\n\ + \\\n\ + int8 input_desc, scale_desc, output_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr_a = 
(int)coord_in.z * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.z, baseAddr_a); \\\n\ + \\\n\ + _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc)); \\\n\ + int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0; \\\n\ + _viv_asm(MOV, coord_para.w, baseAddr_c); \\\n\ + \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord.z, baseAddr); \\\n\ + \\\n\ + vxc_float4 tmpData0, tmpData1; \\\n\ + vxc_short8 outval; \\\n\ + temp_type tmpVal0, tmpVal1; \\\n\ + dst_type dst; \\\n\ + \\\n\ + for(coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.y ++; \\\n\ + coord_para.y = coord.y; \\\n\ + coord_bias.y = coord.y; \\\n\ + VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + bias_f0 = read_imagef(bias, coord_bias); \\\n\ + coord_bias.x += 4; \\\n\ + bias_f1 = read_imagef(bias, coord_bias); \\\n\ + coord_bias.x = coord.x; \\\n\ + \\\n\ + _viv_asm(COPY, in_h, src0, 16); \\\n\ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_0_4x4); \\\n\ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_1_4x4); \\\n\ + \\\n\ + _viv_asm(COPY, scale_h, src1, 16); \\\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_0_4x4); \\\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_1_4x4); \\\n\ + \\\n\ + vxc_float4 sub, norm; \\\n\ + sub = tmpData0 - mean_vari.s0; \\\n\ + norm = scale_f0 * mean_vari.s1 * sub + bias_f0; \\\n\ + norm = norm * output_scale + output_zp; \\\n\ + _viv_asm(conv_type, tmpVal0, norm); \\\n\ + sub = tmpData1 - mean_vari.s0; \\\n\ + norm = scale_f1 * mean_vari.s1 * sub + bias_f1; \\\n\ + norm = norm * output_scale + output_zp; \\\n\ + _viv_asm(conv_type, tmpVal1, norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, outval, dst, 16); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +LAYER_NORM_AXIS01_F16_F16to16Bits(F16,half4,vxc_half8,CONV)\n\ +LAYER_NORM_AXIS01_F16_F16to16Bits(I16,int4,vxc_short8,CONV_RTE)\n\ +\n\ +\n\ +#define LAYER_NORM_AXIS01_F16_F32to16Bits(name,temp_type,dst_type,conv_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_F16_F32to##name( \\\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, \\\n\ + image2d_array_t output, float eps) \\\n\ +{ \\\n\ + int gidz = get_global_id(1); \\\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \\\n\ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); \\\n\ + int2 coord_sum = (int2)(0, gidz); \\\n\ + int4 coord_para = coord; \\\n\ + vxc_short8 src0; \\\n\ + vxc_half8 in_h; \\\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; \\\n\ + vxc_float4 mean_vari = (vxc_float4)(0); \\\n\ + \\\n\ + for(int i = 0; i < group_num; i++) \\\n\ + { \\\n\ + mean_vari += read_imagef(meanVari, coord_sum); \\\n\ + coord_sum.x += 4; \\\n\ + } \\\n\ + mean_vari *= inv_multiplier; \\\n\ + mean_vari.s1 = mean_vari.s1 
- mean_vari.s0 * mean_vari.s0 + eps; \\\n\ + mean_vari.s1 = rsqrt(mean_vari.s1); \\\n\ + \\\n\ + coord_para.z = 0; \\\n\ + coord_para.w = 0; \\\n\ + int4 coord_bias = coord_para; \\\n\ + \\\n\ + int8 input_desc, scale_desc, output_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.z, baseAddr_a); \\\n\ + \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord.z, baseAddr); \\\n\ + \\\n\ + vxc_float4 tmpData0, tmpData1; \\\n\ + vxc_short8 outval; \\\n\ + temp_type tmpVal0, tmpVal1; \\\n\ + dst_type dst; \\\n\ + \\\n\ + for(coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.y ++; \\\n\ + coord_bias.y = coord.y; \\\n\ + bias_f0 = read_imagef(bias, coord_bias); \\\n\ + scale_f0 = read_imagef(scale, coord_bias); \\\n\ + coord_bias.x += 4; \\\n\ + bias_f1 = read_imagef(bias, coord_bias); \\\n\ + scale_f1 = read_imagef(scale, coord_bias); \\\n\ + coord_bias.x = coord.x; \\\n\ + \\\n\ + _viv_asm(COPY, in_h, src0, 16); \\\n\ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_0_4x4); \\\n\ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_1_4x4); \\\n\ + \\\n\ + vxc_float4 sub, norm; \\\n\ + sub = tmpData0 - mean_vari.s0; \\\n\ + norm = scale_f0 * mean_vari.s1 * sub + bias_f0; \\\n\ + norm = norm * output_scale + output_zp; \\\n\ + _viv_asm(conv_type, tmpVal0, norm); \\\n\ + sub = tmpData1 - mean_vari.s0; \\\n\ + norm = scale_f1 * mean_vari.s1 * sub + bias_f1; \\\n\ + norm = norm * output_scale + output_zp; \\\n\ + _viv_asm(conv_type, tmpVal1, norm); \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, outval, dst, 16); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +LAYER_NORM_AXIS01_F16_F32to16Bits(F16,half4,vxc_half8,CONV)\n\ +LAYER_NORM_AXIS01_F16_F32to16Bits(I16,int4,vxc_short8,CONV_RTE)\n\ +\n\ +#define LAYER_NORM_AXIS01_F16_F16toQUANT(name,dst_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_F16_F16to##name( \\\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, \\\n\ + image2d_array_t output, float eps) \\\n\ +{ \\\n\ + int gidz = get_global_id(1); \\\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \\\n\ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); \\\n\ + int2 coord_sum = (int2)(0, gidz); \\\n\ + int4 coord_para = coord; \\\n\ + vxc_short8 src0; \\\n\ + vxc_short8 src1; \\\n\ + vxc_half8 scale_h, in_h; \\\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; \\\n\ + vxc_float4 mean_vari = (vxc_float4)(0); \\\n\ + \\\n\ + for(int i = 0; i < group_num; i++) \\\n\ + { \\\n\ + mean_vari += read_imagef(meanVari, coord_sum); \\\n\ + coord_sum.x += 4; \\\n\ + } \\\n\ + mean_vari *= inv_multiplier; \\\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \\\n\ + mean_vari.s1 = rsqrt(mean_vari.s1); \\\n\ + \\\n\ + coord_para.z = 0; \\\n\ + coord_para.w = 0; \\\n\ + int4 coord_bias = coord_para; \\\n\ + \\\n\ + int8 input_desc, 
scale_desc, output_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.z, baseAddr_a); \\\n\ + \\\n\ + _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc)); \\\n\ + int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0; \\\n\ + _viv_asm(MOV, coord_para.w, baseAddr_c); \\\n\ + \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord.z, baseAddr); \\\n\ + \\\n\ + vxc_float4 tmpData0, tmpData1; \\\n\ + dst_type outval; \\\n\ + vxc_int4 tmpVal0, tmpVal1; \\\n\ + \\\n\ + for(coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.y ++; \\\n\ + coord_para.y = coord.y; \\\n\ + coord_bias.y = coord.y; \\\n\ + VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + bias_f0 = read_imagef(bias, coord_bias); \\\n\ + coord_bias.x += 4; \\\n\ + bias_f1 = read_imagef(bias, coord_bias); \\\n\ + coord_bias.x = coord.x; \\\n\ + \\\n\ + _viv_asm(COPY, in_h, src0, 16); \\\n\ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_0_4x4); \\\n\ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_1_4x4); \\\n\ + \\\n\ + _viv_asm(COPY, scale_h, src1, 16); \\\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_0_4x4); \\\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_1_4x4); \\\n\ + \\\n\ + vxc_float4 sub, norm; \\\n\ + sub = tmpData0 - mean_vari.s0; \\\n\ + norm = scale_f0 * mean_vari.s1 * sub + bias_f0; \\\n\ + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp); \\\n\ + sub = tmpData1 - mean_vari.s0; \\\n\ + norm = scale_f1 * mean_vari.s1 * sub + bias_f1; \\\n\ + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp); \\\n\ + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniExtract8Data_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +LAYER_NORM_AXIS01_F16_F16toQUANT(U8,vxc_uchar16)\n\ +LAYER_NORM_AXIS01_F16_F16toQUANT(I8,vxc_char16)\n\ +\n\ +#define LAYER_NORM_AXIS01_F16_F32toQUANT(name,dst_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_F16_F32to##name( \\\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, \\\n\ + image2d_array_t output, float eps) \\\n\ +{ \\\n\ + int gidz = get_global_id(1); \\\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \\\n\ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); \\\n\ + int2 coord_sum = (int2)(0, gidz); \\\n\ + int4 coord_para = coord; \\\n\ + vxc_short8 src0; \\\n\ + vxc_half8 in_h; \\\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; \\\n\ + vxc_float4 mean_vari = (vxc_float4)(0); \\\n\ + \\\n\ + for(int i = 0; i < group_num; i++) \\\n\ + { \\\n\ + mean_vari += read_imagef(meanVari, coord_sum); \\\n\ + coord_sum.x += 4; \\\n\ + } \\\n\ + mean_vari *= inv_multiplier; \\\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \\\n\ + 
mean_vari.s1 = rsqrt(mean_vari.s1); \\\n\ + \\\n\ + coord_para.z = 0; \\\n\ + coord_para.w = 0; \\\n\ + int4 coord_bias = coord_para; \\\n\ + \\\n\ + int8 input_desc, scale_desc, output_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.z, baseAddr_a); \\\n\ + \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord.z, baseAddr); \\\n\ + \\\n\ + vxc_float4 tmpData0, tmpData1; \\\n\ + dst_type outval; \\\n\ + vxc_int4 tmpVal0, tmpVal1; \\\n\ + \\\n\ + for(coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.y ++; \\\n\ + coord_bias.y = coord.y; \\\n\ + \\\n\ + bias_f0 = read_imagef(bias, coord_bias); \\\n\ + scale_f0 = read_imagef(scale, coord_bias); \\\n\ + coord_bias.x += 4; \\\n\ + bias_f1 = read_imagef(bias, coord_bias); \\\n\ + scale_f1 = read_imagef(scale, coord_bias); \\\n\ + coord_bias.x = coord.x; \\\n\ + \\\n\ + _viv_asm(COPY, in_h, src0, 16); \\\n\ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_0_4x4); \\\n\ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_1_4x4); \\\n\ + \\\n\ + vxc_float4 sub, norm; \\\n\ + sub = tmpData0 - mean_vari.s0; \\\n\ + norm = scale_f0 * mean_vari.s1 * sub + bias_f0; \\\n\ + tmpVal0 = convert_int4_rte(norm * output_scale + output_zp); \\\n\ + sub = tmpData1 - mean_vari.s0; \\\n\ + norm = scale_f1 * mean_vari.s1 * sub + bias_f1; \\\n\ + tmpVal1 = convert_int4_rte(norm * output_scale + output_zp); \\\n\ + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniExtract8Data_2x8); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +LAYER_NORM_AXIS01_F16_F32toQUANT(U8,vxc_uchar16)\n\ +LAYER_NORM_AXIS01_F16_F32toQUANT(I8,vxc_char16)"; /* end of layer_normalization_axis01_2_vx*/ + +static const char layer_normalization_axis01_3_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniDataToFP32_0_4x4;\n\ +_viv_uniform VXC_512Bits uniDataToFP32_1_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform int height;\n\ +_viv_uniform uint group_num;\n\ +_viv_uniform float output_zp;\n\ +_viv_uniform float output_scale;\n\ +_viv_uniform float inv_multiplier;\n\ +\n\ +#define LAYER_NORM_AXIS01_I16_F16to16Bits(name,temp_type,dst_type,conv_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_I16_F16to##name( \\\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, \\\n\ + image2d_array_t output, float eps) \\\n\ +{ \\\n\ + int gidz = get_global_id(1); \\\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \\\n\ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); \\\n\ + int2 coord_sum = (int2)(0, gidz); \\\n\ + int4 coord_para = coord; \\\n\ + vxc_short8 src0, src1; \\\n\ + vxc_half8 scale_h; \\\n\ + vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; \\\n\ + vxc_float4 mean_vari = (vxc_float4)(0); \\\n\ + \\\n\ + for(int i = 0; i < group_num; i++) \\\n\ + { \\\n\ + mean_vari += read_imagef(meanVari, coord_sum); \\\n\ + coord_sum.x += 4; \\\n\ + } \\\n\ + 
mean_vari *= inv_multiplier; \\\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \\\n\ + mean_vari.s1 = rsqrt(mean_vari.s1); \\\n\ + \\\n\ + coord_para.z = 0; \\\n\ + coord_para.w = 0; \\\n\ + int4 coord_bias = coord_para; \\\n\ + \\\n\ + int8 input_desc, scale_desc, output_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.z, baseAddr_a); \\\n\ + \\\n\ + _viv_asm(COPY, scale_desc, scale, sizeof(scale_desc)); \\\n\ + int baseAddr_c = (int)coord_para.z * scale_desc.s4 + scale_desc.s0; \\\n\ + _viv_asm(MOV, coord_para.w, baseAddr_c); \\\n\ + \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord.z, baseAddr); \\\n\ + \\\n\ + vxc_float4 tmpData0, tmpData1, norm; \\\n\ + temp_type tmpVal0, tmpVal1; \\\n\ + vxc_short8 outval; \\\n\ + dst_type dst; \\\n\ + \\\n\ + for(coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.y ++; \\\n\ + coord_para.y = coord.y; \\\n\ + coord_bias.y = coord.y; \\\n\ + VXC_OP4(img_load_3d, src1, scale, coord_para.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + bias_f0 = read_imagef(bias, coord_bias); \\\n\ + coord_bias.x += 4; \\\n\ + bias_f1 = read_imagef(bias, coord_bias); \\\n\ + coord_bias.x = coord.x; \\\n\ + \\\n\ + _viv_asm(COPY, scale_h, src1, 16); \\\n\ + VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_0_4x4); \\\n\ + VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_1_4x4); \\\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_0_4x4); \\\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_1_4x4); \\\n\ + tmpData0 = tmpData0 - mean_vari.s0; \\\n\ + tmpData1 = tmpData1 - mean_vari.s0; \\\n\ + \\\n\ + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0; \\\n\ + norm = norm * output_scale + output_zp; \\\n\ + _viv_asm(conv_type, tmpVal0, norm); \\\n\ + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1; \\\n\ + norm = norm * output_scale + output_zp; \\\n\ + _viv_asm(conv_type, tmpVal1, norm); \\\n\ + \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, outval, dst, 16); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +LAYER_NORM_AXIS01_I16_F16to16Bits(F16,half4,vxc_half8,CONV)\n\ +LAYER_NORM_AXIS01_I16_F16to16Bits(I16,int4,vxc_short8,CONV_RTE)\n\ +\n\ +\n\ +#define LAYER_NORM_AXIS01_I16_F32to16Bits(name,temp_type,dst_type,conv_type) \\\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_I16_F32to##name( \\\n\ + image2d_array_t input, image2d_array_t bias, image2d_array_t scale, image2d_t meanVari, \\\n\ + image2d_array_t output, float eps) \\\n\ +{ \\\n\ + int gidz = get_global_id(1); \\\n\ + int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \\\n\ + int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); \\\n\ + int2 coord_sum = (int2)(0, gidz); \\\n\ + int4 coord_para = coord; \\\n\ + vxc_short8 src0; \\\n\ + 
vxc_float4 bias_f0, bias_f1, scale_f0, scale_f1; \\\n\ + vxc_float4 mean_vari = (vxc_float4)(0); \\\n\ + \\\n\ + for(int i = 0; i < group_num; i++) \\\n\ + { \\\n\ + mean_vari += read_imagef(meanVari, coord_sum); \\\n\ + coord_sum.x += 4; \\\n\ + } \\\n\ + mean_vari *= inv_multiplier; \\\n\ + mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \\\n\ + mean_vari.s1 = rsqrt(mean_vari.s1); \\\n\ + \\\n\ + coord_para.z = 0; \\\n\ + coord_para.w = 0; \\\n\ + int4 coord_bias = coord_para; \\\n\ + \\\n\ + int8 input_desc, scale_desc, output_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr_a = (int)coord_in.z * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.z, baseAddr_a); \\\n\ + \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + int baseAddr = (int)coord.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord.z, baseAddr); \\\n\ + \\\n\ + vxc_float4 tmpData0, tmpData1, norm; \\\n\ + temp_type tmpVal0, tmpVal1; \\\n\ + vxc_short8 outval; \\\n\ + dst_type dst; \\\n\ + \\\n\ + for(coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.y ++; \\\n\ + coord_bias.y = coord.y; \\\n\ + bias_f0 = read_imagef(bias, coord_bias); \\\n\ + scale_f0 = read_imagef(scale, coord_bias); \\\n\ + coord_bias.x += 4; \\\n\ + bias_f1 = read_imagef(bias, coord_bias); \\\n\ + scale_f1 = read_imagef(scale, coord_bias); \\\n\ + coord_bias.x = coord.x; \\\n\ + \\\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_0_4x4); \\\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniDataToFP32_1_4x4); \\\n\ + tmpData0 = tmpData0 - mean_vari.s0; \\\n\ + tmpData1 = tmpData1 - mean_vari.s0; \\\n\ + \\\n\ + norm = scale_f0 * mean_vari.s1 * tmpData0 + bias_f0; \\\n\ + norm = norm * output_scale + output_zp; \\\n\ + _viv_asm(conv_type, tmpVal0, norm); \\\n\ + norm = scale_f1 * mean_vari.s1 * tmpData1 + bias_f1; \\\n\ + norm = norm * output_scale + output_zp; \\\n\ + _viv_asm(conv_type, tmpVal1, norm); \\\n\ + \\\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, outval, dst, 16); \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord, outval, \\\n\ + VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +LAYER_NORM_AXIS01_I16_F32to16Bits(F16,half4,vxc_half8,CONV)\n\ +LAYER_NORM_AXIS01_I16_F32to16Bits(I16,int4,vxc_short8,CONV_RTE)\n\ +"; /* end of layer_normalization_axis01_3_vx*/ + +static const char layer_normalization_axis01_sum_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniSumX_16x1;\n\ +_viv_uniform VXC_512Bits uniSumX2_16x1;\n\ +_viv_uniform VXC_512Bits uniSum_X_X2_8x2;\n\ +_viv_uniform int width;\n\ +_viv_uniform int height;\n\ +\n\ +\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_sums_F16toF32(\n\ + image2d_array_t input, image2d_t output)\n\ +{\n\ + int gidx = get_global_id(0) << 3;\n\ + int lidx = get_local_id(0);\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(gidx, 0, gidz, 0);\n\ + vxc_short8 src0;\n\ + vxc_half8 in_h;\n\ + vxc_float4 sumsqr;\n\ + vxc_float4 tmpSumSqr = (vxc_float4)(0);\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, 
sizeof(input_desc));\n\ + int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.w, baseAddr_a);\n\ +\n\ + if(gidx < width)\n\ + {\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + VXC_DP8x2(sumsqr, in_h, in_h, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniSum_X_X2_8x2);\n\ + tmpSumSqr += sumsqr;\n\ + }\n\ + }\n\ +\n\ + lcl_sum[lidx] = tmpSumSqr.x;\n\ + lcl_sqr[lidx] = tmpSumSqr.y;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int2 coord_out = (int2)(get_group_id(0) << 2, gidz);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ +\n\ + float sum = 0;\n\ + float sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ +\n\ + float4 data = (float4)(sum, sqr, 0, 0);\n\ + write_imagef(output, coord_out, data);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_sums_I16toF32(\n\ + image2d_array_t input, image2d_t output)\n\ +{\n\ + int gidx = get_global_id(0) << 3;\n\ + int lidx = get_local_id(0);\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(gidx, 0, gidz, 0);\n\ + vxc_short8 src0;\n\ + float4 tmpSumSqr = (float4)(0);\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.w, baseAddr_a);\n\ +\n\ + if(gidx < width)\n\ + {\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + vxc_float4 sumsqr;\n\ + VXC_DP8x2(sumsqr, src0, src0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0),\\\n\ + uniSum_X_X2_8x2);\n\ + tmpSumSqr += sumsqr;\n\ + }\n\ + }\n\ + lcl_sum[lidx] = tmpSumSqr.x;\n\ + lcl_sqr[lidx] = tmpSumSqr.y;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int2 coord_out = (int2)(get_group_id(0) << 2, gidz);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ + float4 data = (float4)(0);\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + data.x += dot(tmp_sum[i], one);\n\ + data.y += dot(tmp_sqr[i], one);\n\ + }\n\ + write_imagef(output, coord_out, data);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_sums_U8toF32(\n\ + image2d_array_t input, image2d_t output)\n\ +{\n\ + int gidx = get_global_id(0) << 4;\n\ + int lidx = get_local_id(0);\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(gidx, 0, gidz, 0);\n\ + vxc_uchar16 src0;\n\ + float sum = 0, sqr = 0;\n\ + int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.w, baseAddr_a);\n\ +\n\ + if(gidx < width)\n\ + {\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xywz, 0,\n\ + 
VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX_16x1);\n\ + tmpSum += (tmpSum1);\n\ + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX2_16x1);\n\ + tmpSqr += (tmpSqr1);\n\ + }\n\ + sqr += convert_float(tmpSqr);\n\ + sum = convert_float(tmpSum);\n\ + }\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int2 coord_out = (int2)(get_group_id(0) << 2, gidz);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ + sum = 0; sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ + float4 data = (float4)(sum, sqr, 0, 0);\n\ + write_imagef(output, coord_out, data);\n\ + }\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void layernorm_axis01_sums_I8toF32(\n\ + image2d_array_t input, image2d_t output)\n\ +{\n\ + int gidx = get_global_id(0) << 4;\n\ + int lidx = get_local_id(0);\n\ + int gidz = get_global_id(1);\n\ + int4 coord = (int4)(gidx, 0, gidz, 0);\n\ + vxc_char16 src0;\n\ + float sum = 0, sqr = 0;\n\ + int tmpSum = 0, tmpSqr = 0, tmpSum1, tmpSqr1;\n\ +\n\ + __local float lcl_sum[16];\n\ + __local float lcl_sqr[16];\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr_a = (int)coord.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord.w, baseAddr_a);\n\ +\n\ + if(gidx < width)\n\ + {\n\ + for(coord.y = 0; coord.y < height;)\n\ + {\n\ + VXC_OP4(img_load_3d, src0, input, coord.xywz, 0,\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_DP16x1(tmpSum1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX_16x1);\n\ + tmpSum += (tmpSum1);\n\ + VXC_DP16x1(tmpSqr1, src0, src0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), uniSumX2_16x1);\n\ + tmpSqr += (tmpSqr1);\n\ + }\n\ + sqr += convert_float(tmpSqr);\n\ + sum = convert_float(tmpSum);\n\ + }\n\ + lcl_sum[lidx] = sum;\n\ + lcl_sqr[lidx] = sqr;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + int2 coord_out = (int2)(get_group_id(0) << 2, gidz);\n\ + if(lidx == 0)\n\ + {\n\ + float4 one = (float4)(1, 1, 1, 1);\n\ + __local float4* tmp_sum = (__local float4*)lcl_sum;\n\ + __local float4* tmp_sqr = (__local float4*)lcl_sqr;\n\ + sum = 0; sqr = 0;\n\ + for(int i = 0; i < 4; i++)\n\ + {\n\ + sum += dot(tmp_sum[i], one);\n\ + sqr += dot(tmp_sqr[i], one);\n\ + }\n\ + float4 data = (float4)(sum, sqr, 0, 0);\n\ + write_imagef(output, coord_out, data);\n\ + }\n\ +}\n\ +"; /* end of layer_normalization_axis01_sum_vx*/ + static const char log_softmax_axis0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ _viv_uniform float rlogE;\n\ _viv_uniform int axisSize;\n\ @@ -20209,6 +22280,742 @@ __kernel void log_softmax_axis2_BF16toF32(\n\ }\n\ "; /* end of log_softmax_axis2_vx*/ +static const char log_softmax_exceed_axis0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +_viv_uniform float rlogE;\n\ +_viv_uniform int axisSize;\n\ +_viv_uniform float betaValue;\n\ +_viv_uniform float scaleLogE;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float output_offset_asymmetric;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform int height;\n\ +_viv_uniform int inputWidth;\n\ +_viv_uniform int inputWidthRemain4;\n\ +_viv_uniform VXC_512Bits uniGetSubData0to3_4x4;\n\ 
+_viv_uniform VXC_512Bits uniGetSubData4to7_4x4;\n\ +_viv_uniform VXC_512Bits uniPackMaxData_2x8;\n\ +\n\ +#define LOGSOFTMAX_PROCESS_EXCEED_AXIS0(read_fun, vert_max_fun, horz_max_fun) \\\n\ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(-16, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, val, val0, 16); \\\n\ + for (coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + for (coord.x = 16;coord.x < (axisSize + 16);coord.x+=32) \\\n\ + { \\\n\ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(-16, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, img_val0, val0, 16); \\\n\ + read_fun(val1, input, coord, VXC_5BITOFFSET_XY(-8, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, img_val1, val1, 16); \\\n\ + read_fun(val2, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, img_val2, val2, 16); \\\n\ + read_fun(val3, input, coord, VXC_5BITOFFSET_XY(8, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, img_val3, val3, 16); \\\n\ + vert_max_fun(val, img_val0, img_val1, val, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + vert_max_fun(val, img_val2, img_val3, val, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ + horz_max_fun(val, val, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(val, val, val, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniPackMaxData_2x8); \\\n\ + horz_max_fun(val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + vxc_float4 prob; \\\n\ + float fProbSum = 0; \\\n\ + const float4 one4 = (float4)(1.0, 1.0, 1.0, 1.0); \\\n\ + for (coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + for (coord.x = 0;coord.x < inputWidth;coord.x+=4) \\\n\ + { \\\n\ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, img_val0, val0, 16); \\\n\ + VXC_DP4x4(prob, img_val0, val,\\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubData0to3_4x4); \\\n\ + prob *= scaleLogE; \\\n\ + prob = exp2(prob); \\\n\ + fProbSum += dot(prob, one4); \\\n\ + } \\\n\ + } \\\n\ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, img_val0, val0, 16); \\\n\ + VXC_DP4x4(prob, img_val0, val, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubData0to3_4x4); \\\n\ + prob *= scaleLogE; \\\n\ + if(inputWidthRemain4 == 1) \\\n\ + { \\\n\ + prob.x = exp2(prob.x); \\\n\ + prob.yzw = 0; \\\n\ + fProbSum += dot(prob, one4); \\\n\ + } \\\n\ + else if(inputWidthRemain4 == 2) \\\n\ + { \\\n\ + prob.x = exp2(prob.x); \\\n\ + prob.y = exp2(prob.y); \\\n\ + prob.zw = 0; \\\n\ + fProbSum += dot(prob, one4); \\\n\ + } \\\n\ + else if(inputWidthRemain4 == 3) \\\n\ + { \\\n\ + prob.x = exp2(prob.x); \\\n\ + prob.y = exp2(prob.y); \\\n\ + prob.z = exp2(prob.z); \\\n\ + prob.w = 0; \\\n\ + fProbSum += dot(prob, one4); \\\n\ + } \\\n\ + vxc_float4 probSum_log; \\\n\ + probSum_log.x = log2(fProbSum) * rlogE;\n\ +\n\ +#define LOGSOFTMAX_PROCESS_EXCEED_AXIS0_SAVE(dst_type, \\\n\ + save_type, conv_mode, OUT_SCALE, OUT_OFFSET, read_fun, write_fun) \\\n\ + for (coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + for (coord.x = 0; coord.x < axisSize; coord.x += 8) \\\n\ + { \\\n\ + dst_type vec0, vec1; \\\n\ + save_type dst; \\\n\ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, img_val0, val0, 16); \\\n\ + VXC_DP4x4(prob, img_val0, val, VXC_MODIFIER(0, 3, 
0, VXC_RM_TowardZero, 0), uniGetSubData0to3_4x4); \\\n\ + prob = prob * betaValue - probSum_log.xxxx; \\\n\ + prob = prob * OUT_SCALE + OUT_OFFSET; \\\n\ + _viv_asm(conv_mode, vec0, prob); \\\n\ + VXC_DP4x4(prob, img_val0, val, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubData4to7_4x4); \\\n\ + prob = prob * betaValue - probSum_log.xxxx; \\\n\ + prob = prob * OUT_SCALE + OUT_OFFSET; \\\n\ + _viv_asm(conv_mode, vec1, prob); \\\n\ + VXC_DP2x8(dst, vec0, vec1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); \\\n\ + write_fun(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + }\n\ +\n\ +#define LOGSOFTMAX_EXCEED_AXIS0(src_name, dst_name, src_type, copy_type, dst_type,\\\n\ + save_type, conv_mode, OUT_SCALE, OUT_OFFSET, vert_max_fun, horz_max_fun) \\\n\ +__kernel void log_softmax_exceed_axis0_##src_name##to##dst_name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + float input_Scale, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(16, 0, get_global_id(1), 0); \\\n\ + src_type img_val0, img_val1, img_val2, img_val3; \\\n\ + copy_type val0, val1, val2, val3; \\\n\ + src_type val; \\\n\ + LOGSOFTMAX_PROCESS_EXCEED_AXIS0(VXC_ReadImage2DArray, vert_max_fun, horz_max_fun) \\\n\ + LOGSOFTMAX_PROCESS_EXCEED_AXIS0_SAVE(dst_type, save_type, conv_mode,\\\n\ + OUT_SCALE, OUT_OFFSET, VXC_ReadImage2DArray, VXC_WriteImage2DArray); \\\n\ +}\n\ +\n\ +LOGSOFTMAX_EXCEED_AXIS0(F16, F16, vxc_half8, vxc_short8, half4, vxc_short8,\\\n\ +CONV, 1, 0, VXC_VertMax3_Half, VXC_HorzMax3_Half)\n\ +LOGSOFTMAX_EXCEED_AXIS0(F16, I16, vxc_half8, vxc_short8, short4, vxc_short8,\\\n\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Half, VXC_HorzMax3_Half)\n\ +LOGSOFTMAX_EXCEED_AXIS0(F16, I8, vxc_half8, vxc_short8, char4, vxc_char8,\\\n\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Half, VXC_HorzMax3_Half)\n\ +LOGSOFTMAX_EXCEED_AXIS0(F16, U8, vxc_half8, vxc_short8, uchar4, vxc_uchar8,\\\n\ + CONV_SAT_RTE, outputScale, output_offset_asymmetric, VXC_VertMax3_Half, VXC_HorzMax3_Half)\n\ +LOGSOFTMAX_EXCEED_AXIS0(I16, I16, vxc_short8, vxc_short8, short4, vxc_short8,\\\n\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer)\n\ +LOGSOFTMAX_EXCEED_AXIS0(I16, F16, vxc_short8, vxc_short8, half4, vxc_short8,\\\n\ +CONV, 1, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer)\n\ +LOGSOFTMAX_EXCEED_AXIS0(I8, I8, vxc_char16, vxc_char16, char4, vxc_char8,\\\n\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer)\n\ +LOGSOFTMAX_EXCEED_AXIS0(I8, F16, vxc_char16, vxc_char16, half4, vxc_short8,\\\n\ +CONV, 1, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer)\n\ +LOGSOFTMAX_EXCEED_AXIS0(U8, U8, vxc_uchar16, vxc_uchar16, uchar4, vxc_uchar8,\\\n\ +CONV_SAT_RTE, outputScale, output_offset_asymmetric, VXC_VertMax3_Integer, VXC_HorzMax3_Integer)\n\ +LOGSOFTMAX_EXCEED_AXIS0(U8, F16, vxc_uchar16, vxc_uchar16, half4, vxc_short8,\\\n\ +CONV, 1, 0, VXC_VertMax3_Integer, VXC_HorzMax3_Integer)\n\ +\n\ +\n\ +#define LOGSOFTMAX_PROCESS_EXCEED_AXIS0_TOF32_SAVE(read_fun) \\\n\ + for (coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + for (coord.x = 0; coord.x < axisSize; ) \\\n\ + { \\\n\ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, img_val0, val0, 16); \\\n\ + VXC_DP4x4(prob, img_val0, val, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubData0to3_4x4); \\\n\ + prob = prob * betaValue - probSum_log.xxxx; \\\n\ + 
write_imagef(output, coord, prob); \\\n\ + coord.x += 4; \\\n\ + VXC_DP4x4(prob, img_val0, val, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniGetSubData4to7_4x4); \\\n\ + prob = prob * betaValue - probSum_log.xxxx; \\\n\ + write_imagef(output, coord, prob); \\\n\ + coord.x += 4; \\\n\ + } \\\n\ + }\n\ +\n\ +#define LOGSOFTMAX_EXCEED_AXIS0_TOF32(src_name, src_type, copy_type, vert_max_fun, horz_max_fun) \\\n\ +__kernel void log_softmax_exceed_axis0_##src_name##toF32 \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + float input_Scale, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(16, 0, get_global_id(1), 0); \\\n\ + src_type img_val0, img_val1, img_val2, img_val3; \\\n\ + copy_type val0, val1, val2, val3; \\\n\ + src_type val; \\\n\ + LOGSOFTMAX_PROCESS_EXCEED_AXIS0(VXC_ReadImage2DArray, vert_max_fun, horz_max_fun) \\\n\ + LOGSOFTMAX_PROCESS_EXCEED_AXIS0_TOF32_SAVE(VXC_ReadImage2DArray) \\\n\ +}\n\ +\n\ +LOGSOFTMAX_EXCEED_AXIS0_TOF32(F16, vxc_half8, vxc_short8, VXC_VertMax3_Half, VXC_HorzMax3_Half)\n\ +LOGSOFTMAX_EXCEED_AXIS0_TOF32(I16, vxc_short8, vxc_short8, VXC_VertMax3_Integer, VXC_HorzMax3_Integer)\n\ +LOGSOFTMAX_EXCEED_AXIS0_TOF32(I8, vxc_char16, vxc_char16, VXC_VertMax3_Integer, VXC_HorzMax3_Integer)\n\ +LOGSOFTMAX_EXCEED_AXIS0_TOF32(U8, vxc_uchar16, vxc_uchar16, VXC_VertMax3_Integer, VXC_HorzMax3_Integer)\n\ +\n\ +"; /* end of log_softmax_exceed_axis0_vx*/ + +static const char log_softmax_exceed_axis0_BF16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +_viv_uniform float rlogE;\n\ +_viv_uniform int axisSize;\n\ +_viv_uniform float betaValue;\n\ +_viv_uniform float scaleLogE;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +\n\ +_viv_uniform int height;\n\ +_viv_uniform int inputWidth;\n\ +_viv_uniform int inputWidthRemain4;\n\ +_viv_uniform VXC_512Bits uniPackMaxData_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractHalf4_4x4;\n\ +\n\ +\n\ +#define LOGSOFTMAX_PROCESS_EXCEED_AXIS0_BF16(read_fun) \\\n\ + vxc_half8 img_val0, img_val1, img_val2, img_val3; \\\n\ + vxc_short8 val0, val1, val2, val3; \\\n\ + vxc_half8 val; \\\n\ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(-16, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, val, val0, 16); \\\n\ + for (coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + for (coord.x = 16; coord.x < (axisSize + 16);) \\\n\ + { \\\n\ + read_fun(val0, input, coord, VXC_5BITOFFSET_XY(-16, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, img_val0, val0, 16); \\\n\ + read_fun(val1, input, coord, VXC_5BITOFFSET_XY(-8, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, img_val1, val1, 16); \\\n\ + read_fun(val2, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, img_val2, val2, 16); \\\n\ + read_fun(val3, input, coord, VXC_5BITOFFSET_XY(8, 0), \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, img_val3, val3, 16); \\\n\ + coord.x += 32; \\\n\ + VXC_VertMax3_Half(val, img_val0, img_val1, val, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_VertMax3_Half(val, img_val2, img_val3, val, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ + VXC_HorzMax3_Half(val, val, VXC_MODIFIER(0, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(val, val, val, VXC_MODIFIER(0, 2, 0, VXC_RM_TowardZero, 0), uniPackMaxData_2x8); \\\n\ + VXC_HorzMax3_Half(val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + 
vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ + vxc_ushort8 bf_val_tmp; \\\n\ + vxc_float4 vecA; \\\n\ + _viv_asm(COPY, bf_val_tmp, val, 16); \\\n\ + VXC_DP2x8(bf_val_tmp, bf_val_tmp, zero,\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, vecA, bf_val_tmp, 16); \\\n\ + vxc_float4 prob; \\\n\ + float fProbSum = 0; \\\n\ + const float4 one4 = (float4)(1.0, 1.0, 1.0, 1.0); \\\n\ + float max_value = vecA.x * scaleLogE; \\\n\ + float max_value_orig = vecA.x; \\\n\ + for (coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + for (coord.x = 0; coord.x < inputWidth; ) \\\n\ + { \\\n\ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(bf_val_tmp, val0, zero,\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, prob, bf_val_tmp, 16); \\\n\ + prob = prob * scaleLogE - max_value; \\\n\ + prob = exp2(prob); \\\n\ + fProbSum += dot(prob, one4); \\\n\ + coord.x += 4; \\\n\ + } \\\n\ + } \\\n\ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(bf_val_tmp, val0, zero,\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, prob, bf_val_tmp, 16); \\\n\ + prob = prob * scaleLogE - max_value; \\\n\ + if(inputWidthRemain4 == 1) \\\n\ + { \\\n\ + prob.x = exp2(prob.x); \\\n\ + prob.yzw = 0; \\\n\ + fProbSum += dot(prob, one4); \\\n\ + } \\\n\ + else if(inputWidthRemain4 == 2) \\\n\ + { \\\n\ + prob.x = exp2(prob.x); \\\n\ + prob.y = exp2(prob.y); \\\n\ + prob.zw = 0; \\\n\ + fProbSum += dot(prob, one4); \\\n\ + } \\\n\ + else if(inputWidthRemain4 == 3) \\\n\ + { \\\n\ + prob.x = exp2(prob.x); \\\n\ + prob.y = exp2(prob.y); \\\n\ + prob.z = exp2(prob.z); \\\n\ + prob.w = 0; \\\n\ + fProbSum += dot(prob, one4); \\\n\ + } \\\n\ + vxc_float4 probSum_log; \\\n\ + probSum_log.x = log2(fProbSum) * rlogE;\n\ +\n\ +#define LOGSOFTMAX_PROCESS_EXCEED_AXIS0_BF16TOBF16_SAVE(read_fun, write_fun) \\\n\ + for (coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + for (coord.x = 0; coord.x < axisSize; ) \\\n\ + { \\\n\ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(bf_val_tmp, val0, zero,\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, prob, bf_val_tmp, 16); \\\n\ + prob = prob - max_value_orig; \\\n\ + prob = prob * betaValue - probSum_log.xxxx; \\\n\ + vxc_ushort8 tmp, dst; \\\n\ + _viv_asm(COPY, tmp, prob, 16); \\\n\ + dst.s0123 = tmp.s1357; \\\n\ + write_fun(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x += 4; \\\n\ + } \\\n\ + }\n\ +\n\ +#define LOGSOFTMAX_PROCESS_EXCEED_AXIS0_BF16TOF16_SAVE(read_fun, write_fun) \\\n\ + for (coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + for (coord.x = 0; coord.x < axisSize; ) \\\n\ + { \\\n\ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(bf_val_tmp, val0, zero,\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, prob, bf_val_tmp, 16); \\\n\ + prob = prob - max_value_orig; \\\n\ + prob = prob * betaValue - probSum_log.xxxx; \\\n\ + half4 vec; \\\n\ + vxc_half4 tmp; \\\n\ + vxc_short4 dst; \\\n\ + _viv_asm(CONV, vec, prob); \\\n\ + VXC_DP4x4(tmp, vec, vec, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf4_4x4); \\\n\ + _viv_asm(COPY, dst, tmp, 8); \\\n\ + 
write_fun(output, coord, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x += 4; \\\n\ + } \\\n\ + }\n\ +\n\ +#define LOGSOFTMAX_PROCESS_EXCEED_AXIS0_BF16TOF32_SAVE(read_fun) \\\n\ + for (coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + for (coord.x = 0; coord.x < axisSize; ) \\\n\ + { \\\n\ + read_fun(val0, input, coord, 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(bf_val_tmp, val0, zero,\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, prob, bf_val_tmp, 16); \\\n\ + prob = prob - max_value_orig; \\\n\ + prob = prob * betaValue - probSum_log.xxxx; \\\n\ + write_imagef(output, coord, prob); \\\n\ + coord.x += 4; \\\n\ + } \\\n\ + }\n\ +\n\ +__kernel void log_softmax_exceed_axis0_BF16toBF16(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float input_Scale,\n\ + int axisVal )\n\ +{\n\ + int4 coord = (int4)(16, 0, get_global_id(1), 0);\n\ + LOGSOFTMAX_PROCESS_EXCEED_AXIS0_BF16(VXC_ReadImage2DArray)\n\ + LOGSOFTMAX_PROCESS_EXCEED_AXIS0_BF16TOBF16_SAVE(VXC_ReadImage2DArray, VXC_WriteImage2DArray)\n\ +}\n\ +__kernel void log_softmax_exceed_axis0_BF16toF16(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float input_Scale,\n\ + int axisVal )\n\ +{\n\ + int4 coord = (int4)(16, 0, get_global_id(1), 0);\n\ + LOGSOFTMAX_PROCESS_EXCEED_AXIS0_BF16(VXC_ReadImage2DArray)\n\ + LOGSOFTMAX_PROCESS_EXCEED_AXIS0_BF16TOF16_SAVE(VXC_ReadImage2DArray, VXC_WriteImage2DArray)\n\ +}\n\ +__kernel void log_softmax_exceed_axis0_BF16toF32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float input_Scale,\n\ + int axisVal )\n\ +{\n\ + int4 coord = (int4)(16, 0, get_global_id(1), 0);\n\ + LOGSOFTMAX_PROCESS_EXCEED_AXIS0_BF16(VXC_ReadImage2DArray)\n\ + LOGSOFTMAX_PROCESS_EXCEED_AXIS0_BF16TOF32_SAVE(VXC_ReadImage2DArray)\n\ +}\n\ +\n\ +"; /* end of log_softmax_exceed_axis0_BF16_vx*/ + +static const char log_softmax_exceed_axis1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float rlogE;\n\ +_viv_uniform int depth;\n\ +_viv_uniform int axisSize;\n\ +_viv_uniform float betaValue;\n\ +_viv_uniform float scaleLogE;\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform float output_offset_asymmetric;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniGetSubLoData_4x4;\n\ +_viv_uniform VXC_512Bits uniGetSubHiData_4x4;\n\ +\n\ +#define LOGSOFTMAX_PROCESS_EXCEED_AXIS1(read_fun, vert_max_fun) \\\n\ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, max, in0, 16); \\\n\ + for (coord.z = 0; coord.z < depth; coord.z ++) \\\n\ + { \\\n\ + for (coord.y = 0; coord.y < axisSize;) \\\n\ + { \\\n\ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, vec0, in0, 16); \\\n\ + vert_max_fun(max, max, max, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y++; \\\n\ + } \\\n\ + } \\\n\ + coord.y = 0; \\\n\ + sum0 = 0; \\\n\ + sum1 = 0; \\\n\ + for (coord.z = 0; coord.z < depth; coord.z ++) \\\n\ + { \\\n\ + for (coord.y = 0; coord.y < axisSize;coord.y++) \\\n\ + { \\\n\ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, vec0, in0, 16); \\\n\ + VXC_DP4x4(data0, vec0, max, VXC_MODIFIER(0, 3, 0, \\\n\ + VXC_RM_TowardZero, 0), uniGetSubLoData_4x4); \\\n\ + data0 *= scaleLogE; \\\n\ + data0 = exp2(data0); \\\n\ + 
sum0 += data0; \\\n\ + VXC_DP4x4(data0, vec0, max, VXC_MODIFIER(0, 3, 0, \\\n\ + VXC_RM_TowardZero, 0), uniGetSubHiData_4x4); \\\n\ + data0 *= scaleLogE; \\\n\ + data0 = exp2(data0); \\\n\ + sum1 += data0; \\\n\ + } \\\n\ + } \\\n\ + sum0 = log2(sum0) * rlogE; \\\n\ + sum1 = log2(sum1) * rlogE;\n\ +\n\ +#define LOGSOFTMAX_PROCESS_EXCEED_AXIS1_SAVE(dst_type, save_type, conv_mode,\\\n\ + OUT_SCALE, OUT_OFFSET, read_fun, write_fun) \\\n\ + coord.y = 0; \\\n\ + dst_type dst0, dst1; \\\n\ + save_type vect; \\\n\ + for (coord.z = 0; coord.z < depth; coord.z ++) \\\n\ + { \\\n\ + for (coord.y = 0; coord.y < axisSize;) \\\n\ + { \\\n\ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, vec0, in0, 16); \\\n\ + VXC_DP4x4(data0, vec0, max, VXC_MODIFIER(0, 3, 0, \\\n\ + VXC_RM_TowardZero, 0), uniGetSubLoData_4x4); \\\n\ + data0 = data0 * betaValue - sum0; \\\n\ + data0 = data0 * OUT_SCALE + OUT_OFFSET; \\\n\ + _viv_asm(conv_mode, dst0, data0); \\\n\ + VXC_DP4x4(data0, vec0, max, VXC_MODIFIER(0, 3, 0, \\\n\ + VXC_RM_TowardZero, 0), uniGetSubHiData_4x4); \\\n\ + data0 = data0 * betaValue - sum1; \\\n\ + data0 = data0 * OUT_SCALE + OUT_OFFSET; \\\n\ + _viv_asm(conv_mode, dst1, data0); \\\n\ + VXC_DP2x8(vect, dst0, dst1, VXC_MODIFIER(0, 7, 0, \\\n\ + VXC_RM_TowardZero, 0), uniConvertInt32toUint8_2x8); \\\n\ + write_fun(output, coord, vect, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y++; \\\n\ + } \\\n\ + } \\\n\ +\n\ +#define LOGSOFTMAX_EXCEED_AXIS1(src_name, dst_name, src_type, copy_type, dst_type,\\\n\ +save_type, conv_mode, OUT_SCALE, OUT_OFFSET, vert_max_fun) \\\n\ +__kernel void log_softmax_exceed_axis1_##src_name##to##dst_name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + float input_Scale, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), 0, 0, 0); \\\n\ + src_type vec0, max; \\\n\ + copy_type in0; \\\n\ + vxc_float4 data0; \\\n\ + vxc_float4 sum0, sum1; \\\n\ + LOGSOFTMAX_PROCESS_EXCEED_AXIS1(VXC_ReadImage2DArray, vert_max_fun) \\\n\ + LOGSOFTMAX_PROCESS_EXCEED_AXIS1_SAVE(dst_type, save_type, conv_mode,\\\n\ + OUT_SCALE, OUT_OFFSET, VXC_ReadImage2DArray, VXC_WriteImage2DArray); \\\n\ +}\n\ +\n\ +\n\ +\n\ +LOGSOFTMAX_EXCEED_AXIS1(F16, F16, vxc_half8, vxc_short8, half4, vxc_short8,\\\n\ +CONV, 1, 0, VXC_VertMax3_Half)\n\ +LOGSOFTMAX_EXCEED_AXIS1(F16, I16, vxc_half8, vxc_short8, short4, vxc_short8,\\\n\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Half)\n\ +LOGSOFTMAX_EXCEED_AXIS1(F16, I8, vxc_half8, vxc_short8, char4, vxc_char8,\\\n\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Half)\n\ +LOGSOFTMAX_EXCEED_AXIS1(F16, U8, vxc_half8, vxc_short8, uchar4,\\\n\ +vxc_uchar8, CONV_SAT_RTE, outputScale, output_offset_asymmetric, VXC_VertMax3_Half)\n\ +\n\ +LOGSOFTMAX_EXCEED_AXIS1(I16, I16, vxc_short8, vxc_short8, short4, vxc_short8,\\\n\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Integer)\n\ +LOGSOFTMAX_EXCEED_AXIS1(I16, F16, vxc_short8, vxc_short8, half4, vxc_short8,\\\n\ +CONV, 1, 0, VXC_VertMax3_Integer)\n\ +\n\ +LOGSOFTMAX_EXCEED_AXIS1(I8, I8, vxc_char16, vxc_char16, char4, vxc_char8,\\\n\ +CONV_SAT_RTE, outputScale, 0, VXC_VertMax3_Integer)\n\ +LOGSOFTMAX_EXCEED_AXIS1(I8, F16, vxc_char16, vxc_char16, half4, vxc_short8,\\\n\ +CONV, 1, 0, VXC_VertMax3_Integer)\n\ +\n\ +LOGSOFTMAX_EXCEED_AXIS1(U8, U8, vxc_uchar16, vxc_uchar16, uchar4,\\\n\ +vxc_uchar8, CONV_SAT_RTE, outputScale, output_offset_asymmetric, VXC_VertMax3_Integer)\n\ 
+LOGSOFTMAX_EXCEED_AXIS1(U8, F16, vxc_uchar16, vxc_uchar16, half4, \\\n\ +vxc_short8, CONV, 1, 0, VXC_VertMax3_Integer)\n\ +\n\ +\n\ +\n\ +#define LOGSOFTMAX_EXCEED_AXIS1_TOF32(src_name, src_type, copy_type, vert_max_fun) \\\n\ +__kernel void log_softmax_exceed_axis1_##src_name##toF32 \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + float input_Scale, \\\n\ + int axisVal \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), 0, 0, 0); \\\n\ + src_type vec0, max; \\\n\ + copy_type in0; \\\n\ + vxc_float4 data0; \\\n\ + vxc_float4 sum0, sum1; \\\n\ + LOGSOFTMAX_PROCESS_EXCEED_AXIS1(VXC_ReadImage2DArray, vert_max_fun) \\\n\ + coord.y = 0; \\\n\ + for (coord.z = 0; coord.z < depth; coord.z ++) \\\n\ + { \\\n\ + for (coord.y = 0; coord.y < axisSize;) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, vec0, in0, 16); \\\n\ + VXC_DP4x4(data0, vec0, max, VXC_MODIFIER(0, 3, 0, \\\n\ + VXC_RM_TowardZero, 0), uniGetSubLoData_4x4); \\\n\ + data0 = data0 * betaValue - sum0; \\\n\ + write_imagef(output, coord, data0); \\\n\ + coord.x += 4; \\\n\ + VXC_DP4x4(data0, vec0, max, VXC_MODIFIER(0, 3, 0, \\\n\ + VXC_RM_TowardZero, 0), uniGetSubHiData_4x4); \\\n\ + data0 = data0 * betaValue - sum1; \\\n\ + write_imagef(output, coord, data0); \\\n\ + coord.x -= 4; \\\n\ + coord.y++; \\\n\ + } \\\n\ + } \\\n\ +}\n\ +\n\ +LOGSOFTMAX_EXCEED_AXIS1_TOF32(F16, vxc_half8, \\\n\ +vxc_short8, VXC_VertMax3_Half)\n\ +LOGSOFTMAX_EXCEED_AXIS1_TOF32(I16, vxc_short8, \\\n\ +vxc_short8, VXC_VertMax3_Integer)\n\ +LOGSOFTMAX_EXCEED_AXIS1_TOF32(I8, vxc_char16, \\\n\ +vxc_char16, VXC_VertMax3_Integer)\n\ +LOGSOFTMAX_EXCEED_AXIS1_TOF32(U8, vxc_uchar16, \\\n\ +vxc_uchar16, VXC_VertMax3_Integer)"; /* end of log_softmax_exceed_axis1_vx*/ + +static const char log_softmax_exceed_axis1_BF16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float rlogE;\n\ +_viv_uniform int depth;\n\ +_viv_uniform int axisSize;\n\ +_viv_uniform float betaValue;\n\ +_viv_uniform float scaleLogE;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniExtractHalf8_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +\n\ +#define LOGSOFTMAX_PROCESS_EXCEED_AXIS1_BF16(read_fun) \\\n\ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, max, in0, 16); \\\n\ + for (coord.z = 0; coord.z < depth; coord.z ++) \\\n\ + { \\\n\ + for (coord.y = 0; coord.y < axisSize;) \\\n\ + { \\\n\ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, vec0, in0, 16); \\\n\ + VXC_VertMax3_Half(max, max, max, vec0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y++; \\\n\ + } \\\n\ + } \\\n\ + _viv_asm(COPY, tmp0, max, 16); \\\n\ + VXC_DP2x8(tmp1, tmp0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, max_lo, tmp1, 16); \\\n\ + VXC_DP2x8(tmp1, tmp0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); \\\n\ + _viv_asm(COPY, max_hi, tmp1, 16); \\\n\ + coord.y = 0; \\\n\ + sum0 = 0; \\\n\ + sum1 = 0; \\\n\ + for (coord.z = 0; coord.z < depth; coord.z ++) \\\n\ + { \\\n\ + for (coord.y = 0; coord.y < axisSize;) \\\n\ + { \\\n\ + read_fun(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(tmp1, in0, zero, 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, data0, tmp1, 16); \\\n\ + data0 = data0 - max_lo; \\\n\ + data0 *= scaleLogE; \\\n\ + sum0 += exp2(data0); \\\n\ + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part1_2x8); \\\n\ + _viv_asm(COPY, data0, tmp1, 16); \\\n\ + data0 = data0 - max_hi; \\\n\ + data0 *= scaleLogE; \\\n\ + sum1 += exp2(data0); \\\n\ + coord.y++; \\\n\ + } \\\n\ + } \\\n\ + sum0 = log2(sum0) * rlogE; \\\n\ + sum1 = log2(sum1) * rlogE;\n\ +\n\ +__kernel void log_softmax_exceed_axis1_BF16toBF16(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float input_Scale,\n\ + int axisVal )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), 0, 0, 0);\n\ + vxc_short8 in0;\n\ + vxc_half8 vec0, max;\n\ + vxc_float4 data0;\n\ + vxc_float4 sum0, sum1;\n\ + vxc_float4 max_lo, max_hi;\n\ + vxc_ushort8 tmp0, tmp1;\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ +\n\ + LOGSOFTMAX_PROCESS_EXCEED_AXIS1_BF16(VXC_ReadImage2DArray)\n\ +\n\ + coord.y = 0;\n\ + vxc_ushort8 dst0, dst1, dst;\n\ + for (coord.z = 0; coord.z < depth; coord.z ++)\n\ + {\n\ + for (coord.y = 0; coord.y < axisSize;)\n\ + {\n\ + VXC_ReadImage2DArray(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, data0, tmp1, 16);\n\ + data0 = data0 - max_lo;\n\ + data0 = data0 * betaValue - sum0;\n\ + _viv_asm(COPY, dst0, data0, 16);\n\ + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, data0, tmp1, 16);\n\ + data0 = data0 - max_hi;\n\ + data0 = data0 * betaValue - sum1;\n\ + _viv_asm(COPY, dst1, data0, 16);\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + }\n\ + }\n\ +}\n\ +\n\ +__kernel void log_softmax_exceed_axis1_BF16toF16(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float input_Scale,\n\ + int axisVal )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), 0, 0, 0);\n\ + vxc_short8 in0;\n\ + vxc_half8 vec0, max;\n\ + vxc_float4 data0;\n\ + vxc_float4 sum0, sum1;\n\ + vxc_float4 max_lo, max_hi;\n\ + vxc_ushort8 tmp0, tmp1;\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ +\n\ + LOGSOFTMAX_PROCESS_EXCEED_AXIS1_BF16(VXC_ReadImage2DArray)\n\ +\n\ + coord.y = 0;\n\ + half4 dst0, dst1;\n\ + for (coord.z = 0; coord.z < depth; coord.z ++)\n\ + {\n\ + for (coord.y = 0; coord.y < axisSize;)\n\ + {\n\ + VXC_ReadImage2DArray(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, data0, tmp1, 16);\n\ + data0 = data0 - max_lo;\n\ + data0 = data0 * betaValue - sum0;\n\ + _viv_asm(CONV, dst0, data0);\n\ + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, data0, tmp1, 16);\n\ + data0 = data0 - max_hi;\n\ + data0 = data0 * betaValue - sum1;\n\ + _viv_asm(CONV, dst1, data0);\n\ + VXC_DP2x8(vec0, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8);\n\ + vxc_short8 vect;\n\ + _viv_asm(COPY, vect, vec0, 16);\n\ + 
VXC_WriteImage2DArray(output, coord, vect, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + }\n\ + }\n\ +}\n\ +\n\ +__kernel void log_softmax_exceed_axis1_BF16toF32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float input_Scale,\n\ + int axisVal )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), 0, 0, 0);\n\ + vxc_short8 in0;\n\ + vxc_half8 vec0, max;\n\ + vxc_float4 data0;\n\ + vxc_float4 sum0, sum1;\n\ + vxc_float4 max_lo, max_hi;\n\ + vxc_ushort8 tmp0, tmp1;\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ +\n\ + LOGSOFTMAX_PROCESS_EXCEED_AXIS1_BF16(VXC_ReadImage2DArray)\n\ +\n\ + coord.y = 0;\n\ + for (coord.z = 0; coord.z < depth; coord.z ++)\n\ + {\n\ + for (coord.y = 0; coord.y < axisSize;)\n\ + {\n\ + VXC_ReadImage2DArray(in0, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + _viv_asm(COPY, data0, tmp1, 16);\n\ + data0 = data0 - max_lo;\n\ + data0 = data0 * betaValue - sum0;\n\ + write_imagef(output, coord, data0);\n\ + coord.x += 4;\n\ + VXC_DP2x8(tmp1, in0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, data0, tmp1, 16);\n\ + data0 = data0 - max_hi;\n\ + data0 = data0 * betaValue - sum1;\n\ + write_imagef(output, coord, data0);\n\ + coord.x -= 4;\n\ + coord.y++;\n\ + }\n\ + }\n\ +}\n\ +"; /* end of log_softmax_exceed_axis1_BF16_vx*/ + static const char logical_not_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ __kernel void logical_not_I8toI8(\n\ @@ -36020,6 +38827,368 @@ NV12_COPY_SH_IMPL(U8toI16, vxc_short8, int4, vxc_short8, 16)\n\ NV12_COPY_SH_IMPL(U8toF16, vxc_half8, half4, vxc_short8, 16)\n\ "; /* end of pre_process_nv12_copy_vx*/ +static const char pre_process_nv12_rggb_copy_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int bOrder;\n\ +_viv_uniform int rOrder;\n\ +\n\ +_viv_uniform float outputScaleVar_b;\n\ +_viv_uniform float outputScaleVar_g;\n\ +_viv_uniform float outputScaleVar_r;\n\ +\n\ +_viv_uniform float bMeanScaleVarZp;\n\ +_viv_uniform float gMeanScaleVarZp;\n\ +_viv_uniform float rMeanScaleVarZp;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertNV12toB_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertNV12toG_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertNV12toR_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractUVtoCharSub128_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractYtoShortSub16_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertUchartoFp32_4x4;\n\ +\n\ +#define NV12_RGGB_COPY_SH_IMPL(name, dst_type, conv_type, save_type, copy_bytes) \\\n\ +__kernel void pre_process_nv12_rggb_copy_##name \\\n\ + ( \\\n\ + __read_only image2d_array_t y_img, \\\n\ + __read_only image2d_array_t uv_img, \\\n\ + __write_only image2d_array_t output, \\\n\ + global int* xRatio, \\\n\ + global int* yRatio, \\\n\ + global int* xOffset, \\\n\ + global int* yOffset, \\\n\ + float rMean, \\\n\ + float gMean, \\\n\ + float bMean, \\\n\ + float r_scale, \\\n\ + int reverse_channel, \\\n\ + int trans, \\\n\ + int nv_type, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + \\\n\ + int sy = gidy + (*yOffset); \\\n\ + int sx = gidx + (*xOffset); \\\n\ + int uvX = sx & 0xfffffffe; \\\n\ + int uvY = sy >> 1; \\\n\ + \\\n\ + vxc_uchar16 Y, UV; \\\n\ + \\\n\ + VXC_ReadImage(Y, y_img, (int2)(sx,sy), 0, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(UV, uv_img,(int2)(uvX,uvY), 0,VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + if (nv_type == 3) \\\n\ + { \\\n\ + UV.s0123 = UV.s1032; \\\n\ + } \\\n\ + \\\n\ + vxc_short8 tmpY; \\\n\ + vxc_char16 tmpUV; \\\n\ + short tmpVal = 16; \\\n\ + VXC_DP2x8(tmpY, Y, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractYtoShortSub16_2x8); \\\n\ + tmpVal = 128; \\\n\ + VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractUVtoCharSub128_2x8); \\\n\ + \\\n\ + float4 tmpDstB, tmpDstG, tmpDstR; \\\n\ + vxc_uchar4 DstB_uchar, DstG_uchar, DstR_uchar; \\\n\ + VXC_DP4x4(DstB_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toB_4x4); \\\n\ + VXC_DP4x4(DstG_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toG_4x4); \\\n\ + VXC_DP4x4(DstR_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toR_4x4); \\\n\ + VXC_DP4x4(tmpDstB, DstB_uchar, DstB_uchar, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \\\n\ + VXC_DP4x4(tmpDstG, DstG_uchar, DstG_uchar, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \\\n\ + VXC_DP4x4(tmpDstR, DstR_uchar, DstR_uchar, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \\\n\ + \\\n\ + conv_type result; \\\n\ + dst_type dst0; \\\n\ + save_type dst; \\\n\ + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \\\n\ + tmpDstB = tmpDstB * outputScaleVar_b + bMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, tmpDstB); \\\n\ + dstPos.z = bOrder; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + tmpDstG = tmpDstG * outputScaleVar_g + gMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, tmpDstG); \\\n\ + dstPos.z = 1; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + dstPos.z = 2; \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + tmpDstR = tmpDstR * outputScaleVar_r + rMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, tmpDstR); \\\n\ + dstPos.z = rOrder; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +NV12_RGGB_COPY_SH_IMPL(U8toU8, vxc_uchar8, int4, vxc_uchar8, 8)\n\ +NV12_RGGB_COPY_SH_IMPL(U8toI8, vxc_char8, int4, vxc_char8, 8)\n\ +NV12_RGGB_COPY_SH_IMPL(U8toI16, vxc_short8, int4, vxc_short8, 16)\n\ +NV12_RGGB_COPY_SH_IMPL(U8toF16, vxc_half8, half4, vxc_short8, 16)\n\ +"; /* end of pre_process_nv12_rggb_copy_vx*/ + +static const char pre_process_nv12_rggb_scale_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int bOrder;\n\ +_viv_uniform int rOrder;\n\ +\n\ +_viv_uniform float outputScaleVar_b;\n\ +_viv_uniform float outputScaleVar_g;\n\ +_viv_uniform float outputScaleVar_r;\n\ +\n\ +_viv_uniform float bMeanScaleVarZp;\n\ +_viv_uniform float gMeanScaleVarZp;\n\ +_viv_uniform 
float rMeanScaleVarZp;\n\ +\n\ +_viv_uniform uint xrIntFloat_16;\n\ +_viv_uniform uint yrIntFloat_16;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertNV12toB_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertNV12toG_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertNV12toR_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertUVtoCharSub128_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertYtoShortSub16_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniCalculateYShift_2x8;\n\ +_viv_uniform VXC_512Bits uniCalculateUVShift_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertUchartoFp32_4x4;\n\ +\n\ +#define NV12_RGGB_OPT_SH_IMPL(name, dst_type, conv_type, save_type, copy_bytes) \\\n\ +__kernel void pre_process_nv12_rggb_scale_##name##_gq \\\n\ + ( \\\n\ + __read_only image2d_array_t y_img, \\\n\ + __read_only image2d_array_t uv_img, \\\n\ + __write_only image2d_array_t output, \\\n\ + global int* xRatio, \\\n\ + global int* yRatio, \\\n\ + global int* xOffset, \\\n\ + global int* yOffset, \\\n\ + float rMean, \\\n\ + float gMean, \\\n\ + float bMean, \\\n\ + float r_scale, \\\n\ + int reverse_channel, \\\n\ + int trans, \\\n\ + int nv_type, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ + ) \\\n\ +{ \\\n\ + uint4 gidx = get_global_id(0); \\\n\ + uint gidy = get_global_id(1); \\\n\ + gidx += (uint4)(0, 1, 2, 3); \\\n\ + \\\n\ + uint dy = (gidy * yrIntFloat_16) >> 16; \\\n\ + uint4 dx = (gidx * xrIntFloat_16) >> 16; \\\n\ + int sy = convert_int(dy) + (*yOffset); \\\n\ + int4 sx = convert_int4(dx) + (*xOffset); \\\n\ + int4 uvX = sx & 0xfffffffe; \\\n\ + int uvY = sy >> 1; \\\n\ + \\\n\ + vxc_uchar16 Y, UV; \\\n\ + int2 coord = (int2)(sx.x, sy); \\\n\ + int2 coord_uv = (int2)(uvX.x, uvY); \\\n\ + VXC_ReadImage(Y, y_img, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + if (nv_type == 3) \\\n\ + { \\\n\ + UV.s0123456789abcdef = UV.s1032547698badcfe; \\\n\ + } \\\n\ + \\\n\ + vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; \\\n\ + vxc_uchar16 maskShiftUv = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; \\\n\ + int4 offsetUV = uvX - uvX.x; \\\n\ + \\\n\ + vxc_ushort8 diffY, diffUV; \\\n\ + _viv_asm(COPY, diffY, sx, 16); \\\n\ + _viv_asm(COPY, diffUV, offsetUV, 16); \\\n\ + \\\n\ + vxc_ushort8 constData = 8; \\\n\ + VXC_DP2x8(maskShift, diffY, constData, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniCalculateYShift_2x8); \\\n\ + VXC_DP2x8(maskShiftUv, diffUV, constData, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniCalculateUVShift_2x8); \\\n\ + VXC_BitExtract(Y, Y, Y, maskShift, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_BitExtract(UV, UV, UV, maskShiftUv, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + vxc_short8 tmpY; \\\n\ + vxc_char16 tmpUV; \\\n\ + short tmpVal = 16; \\\n\ + VXC_DP2x8(tmpY, Y, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertYtoShortSub16_2x8); \\\n\ + tmpVal = 128; \\\n\ + VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); \\\n\ + \\\n\ + float4 tmpDstB, tmpDstG, tmpDstR; \\\n\ + vxc_uchar4 DstB_uchar, DstG_uchar, DstR_uchar; \\\n\ + VXC_DP4x4(DstB_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toB_4x4); \\\n\ + VXC_DP4x4(DstG_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), 
uniConvertNV12toG_4x4); \\\n\ + VXC_DP4x4(DstR_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toR_4x4); \\\n\ + VXC_DP4x4(tmpDstB, DstB_uchar, DstB_uchar, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \\\n\ + VXC_DP4x4(tmpDstG, DstG_uchar, DstG_uchar, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \\\n\ + VXC_DP4x4(tmpDstR, DstR_uchar, DstR_uchar, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \\\n\ + \\\n\ + conv_type result; \\\n\ + dst_type dst0; \\\n\ + save_type dst; \\\n\ + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \\\n\ + tmpDstB = tmpDstB * outputScaleVar_b + bMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, tmpDstB); \\\n\ + dstPos.z = bOrder; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + tmpDstG = tmpDstG * outputScaleVar_g + gMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, tmpDstG); \\\n\ + dstPos.z = 1; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + dstPos.z = 2; \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + tmpDstR = tmpDstR * outputScaleVar_r + rMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, tmpDstR); \\\n\ + dstPos.z = rOrder; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +NV12_RGGB_OPT_SH_IMPL(U8toU8, vxc_uchar8, int4, vxc_uchar8, 8)\n\ +NV12_RGGB_OPT_SH_IMPL(U8toI8, vxc_char8, int4, vxc_char8, 8)\n\ +NV12_RGGB_OPT_SH_IMPL(U8toI16, vxc_short8, int4, vxc_short8, 16)\n\ +NV12_RGGB_OPT_SH_IMPL(U8toF16, vxc_half8, half4, vxc_short8, 16)\n\ +\n\ +#define NV12_RGGB_SH_IMPL(name, dst_type, conv_type, save_type, copy_bytes) \\\n\ +__kernel void pre_process_nv12_rggb_scale_##name \\\n\ + ( \\\n\ + __read_only image2d_array_t y_img, \\\n\ + __read_only image2d_array_t uv_img, \\\n\ + __write_only image2d_array_t output, \\\n\ + global int* xRatio, \\\n\ + global int* yRatio, \\\n\ + global int* xOffset, \\\n\ + global int* yOffset, \\\n\ + float rMean, \\\n\ + float gMean, \\\n\ + float bMean, \\\n\ + float r_scale, \\\n\ + int reverse_channel, \\\n\ + int trans, \\\n\ + int nv_type, \\\n\ + float g_scale, \\\n\ + float b_scale \\\n\ + ) \\\n\ +{ \\\n\ + uint4 gidx = get_global_id(0); \\\n\ + uint gidy = get_global_id(1); \\\n\ + gidx += (uint4)(0, 1, 2, 3); \\\n\ + \\\n\ + uint dy = (gidy * yrIntFloat_16) >> 16; \\\n\ + uint4 dx = (gidx * xrIntFloat_16) >> 16; \\\n\ + int sy = convert_int(dy) + (*yOffset); \\\n\ + int4 sx = convert_int4(dx) + (*xOffset); \\\n\ + int4 uvX = sx & 0xfffffffe; \\\n\ + int uvY = sy >> 1; \\\n\ + \\\n\ + vxc_uchar16 Y, UV; \\\n\ + int2 coord = (int2)(sx.x, sy); \\\n\ + int2 coord_uv = (int2)(uvX.x, uvY); \\\n\ + \\\n\ + VXC_ReadImage(Y, y_img, coord, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x = sx.y; \\\n\ + VXC_ReadImage(Y, y_img, coord, 0, VXC_MODIFIER(1, 1, 0, 
VXC_RM_TowardZero, 0)); \\\n\ + coord.x = sx.z; \\\n\ + VXC_ReadImage(Y, y_img, coord, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x = sx.w; \\\n\ + VXC_ReadImage(Y, y_img, coord, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_uv.x = uvX.y; \\\n\ + VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_uv.x = uvX.z; \\\n\ + VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_uv.x = uvX.w; \\\n\ + VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + if (nv_type == 3) \\\n\ + { \\\n\ + UV.s01234567 = UV.s10325476; \\\n\ + } \\\n\ + \\\n\ + vxc_short8 tmpY; \\\n\ + vxc_char16 tmpUV; \\\n\ + short tmpVal = 16; \\\n\ + VXC_DP2x8(tmpY, Y, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertYtoShortSub16_2x8); \\\n\ + tmpVal = 128; \\\n\ + VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); \\\n\ + \\\n\ + float4 tmpDstB, tmpDstG, tmpDstR; \\\n\ + vxc_uchar4 DstB_uchar, DstG_uchar, DstR_uchar; \\\n\ + VXC_DP4x4(DstB_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toB_4x4); \\\n\ + VXC_DP4x4(DstG_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toG_4x4); \\\n\ + VXC_DP4x4(DstR_uchar, tmpY, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniConvertNV12toR_4x4); \\\n\ + VXC_DP4x4(tmpDstB, DstB_uchar, DstB_uchar, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \\\n\ + VXC_DP4x4(tmpDstG, DstG_uchar, DstG_uchar, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \\\n\ + VXC_DP4x4(tmpDstR, DstR_uchar, DstR_uchar, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertUchartoFp32_4x4); \\\n\ + \\\n\ + conv_type result; \\\n\ + dst_type dst0; \\\n\ + save_type dst; \\\n\ + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \\\n\ + tmpDstB = tmpDstB * outputScaleVar_b + bMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, tmpDstB); \\\n\ + dstPos.z = bOrder; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + tmpDstG = tmpDstG * outputScaleVar_g + gMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, tmpDstG); \\\n\ + dstPos.z = 1; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + dstPos.z = 2; \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + tmpDstR = tmpDstR * outputScaleVar_r + rMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, tmpDstR); \\\n\ + dstPos.z = rOrder; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +NV12_RGGB_SH_IMPL(U8toU8, vxc_uchar8, int4, vxc_uchar8, 8)\n\ +NV12_RGGB_SH_IMPL(U8toI8, vxc_char8, int4, vxc_char8, 8)\n\ 
+NV12_RGGB_SH_IMPL(U8toI16, vxc_short8, int4, vxc_short8, 16)\n\ +NV12_RGGB_SH_IMPL(U8toF16, vxc_half8, half4, vxc_short8, 16)\n\ +"; /* end of pre_process_nv12_rggb_scale_vx*/ + static const char pre_process_nv12_scale_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform int bOrder;\n\ @@ -46004,80 +49173,79 @@ __kernel void resize_bilinear_F16toF16_DOWN\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ -__kernel void resize_bilinear_F16toU8_DOWN\n\ - (\n\ - __read_only image2d_array_t input,\n\ - __write_only image2d_array_t output,\n\ - int align_corners,\n\ - int half_pixel_centers\n\ - )\n\ -{\n\ - int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ - int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3);\n\ - float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx - half_pixel_value;\n\ - float4 left_x_f = floor(in_x);\n\ - float4 x_lerp = in_x - left_x_f;\n\ - int4 left_x_idx = convert_int4(left_x_f);\n\ - float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value;\n\ - float top_y_f = floor(in_y);\n\ - float y_lerp = in_y - top_y_f;\n\ - int top_y_idx = convert_int(top_y_f);\n\ -\n\ - vxc_short8 top_short, bottom_short;\n\ - vxc_half8 top, bottom;\n\ - int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);\n\ -\n\ - int8 input_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord_in.w, baseAddr);\n\ - VXC_OP4(img_load_3d, top_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, bottom_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ - VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = left_x_idx.y;\n\ - VXC_OP4(img_load_3d, top_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, bottom_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ - VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = left_x_idx.z;\n\ - VXC_OP4(img_load_3d, top_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, bottom_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ - VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = left_x_idx.w;\n\ - VXC_OP4(img_load_3d, top_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, bottom_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ - VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, top, top_short, 16);\n\ - _viv_asm(COPY, bottom, bottom_short, 16);\n\ -\n\ - float4 left4;\n\ - float4 right4;\n\ - float4 top4;\n\ - float4 bottom4;\n\ - VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4);\n\ - VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ - top4 = right4 * x_lerp + left4;\n\ - VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4);\n\ - VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ - bottom4 = right4 * x_lerp + left4;\n\ - bottom4 -= top4;\n\ - float4 dst4 = bottom4 * y_lerp + top4;\n\ - dst4 = dst4 * uint8Scale + output_ZP;\n\ - int4 dst = convert_int4_rte(dst4);\n\ - 
vxc_uchar8 dst_uchar;\n\ - VXC_DP2x8(dst_uchar, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ -\n\ - int8 output_desc;\n\ - _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ - baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ - _viv_asm(MOV, coord_out.w, baseAddr);\n\ -\n\ - VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst_uchar,\n\ - VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +#define RESIZE_BILINEAR_F16TOQINT_DOWN(out_name, dst_type) \\\n\ +__kernel void resize_bilinear_F16to##out_name##_DOWN( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int align_corners, \\\n\ + int half_pixel_centers \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3); \\\n\ + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx - half_pixel_value; \\\n\ + float4 left_x_f = floor(in_x); \\\n\ + float4 x_lerp = in_x - left_x_f; \\\n\ + int4 left_x_idx = convert_int4(left_x_f); \\\n\ + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value; \\\n\ + float top_y_f = floor(in_y); \\\n\ + float y_lerp = in_y - top_y_f; \\\n\ + int top_y_idx = convert_int(top_y_f); \\\n\ + \\\n\ + vxc_short8 top_short, bottom_short; \\\n\ + vxc_half8 top, bottom; \\\n\ + int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0); \\\n\ + \\\n\ + int8 input_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.w, baseAddr); \\\n\ + VXC_OP4(img_load_3d, top_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = left_x_idx.y; \\\n\ + VXC_OP4(img_load_3d, top_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = left_x_idx.z; \\\n\ + VXC_OP4(img_load_3d, top_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_in.x = left_x_idx.w; \\\n\ + VXC_OP4(img_load_3d, top_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, bottom_short, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, top, top_short, 16); \\\n\ + _viv_asm(COPY, bottom, bottom_short, 16); \\\n\ + \\\n\ + float4 left4, right4, top4, bottom4; \\\n\ + VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4); \\\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \\\n\ + top4 = right4 * x_lerp + left4; \\\n\ + VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_left_4x4); \\\n\ + VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniRightSubLeft_4x4); \\\n\ + bottom4 = right4 * x_lerp + left4; \\\n\ + bottom4 -= top4; \\\n\ + float4 dst4 = bottom4 * y_lerp + top4; \\\n\ + dst4 = dst4 * uint8Scale + output_ZP; \\\n\ + int4 dst = convert_int4_rte(dst4); \\\n\ + dst_type dst_uchar; \\\n\ + VXC_DP2x8(dst_uchar, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \\\n\ + \\\n\ + int8 output_desc; \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr); \\\n\ + \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst_uchar, \\\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ }\n\ +RESIZE_BILINEAR_F16TOQINT_DOWN(U8, vxc_uchar8)\n\ +RESIZE_BILINEAR_F16TOQINT_DOWN(U16, vxc_ushort8)\n\ \n\ __kernel void resize_bilinear_F16toF16_UP\n\ (\n\ @@ -46198,14 +49366,16 @@ __kernel void resize_bilinear_F16toF16_UP\n\ static const char resize_bilinear_I16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_left_4x4;\n\ +_viv_uniform VXC_512Bits uniU8RightSubLeft_4x4;\n\ _viv_uniform VXC_512Bits uniExtact8Bit_2x8;\n\ _viv_uniform float2 scale_xy;\n\ _viv_uniform int depth;\n\ +_viv_uniform int input_ZP;\n\ +_viv_uniform float uint8Scale;\n\ +_viv_uniform float output_ZP;\n\ _viv_uniform VXC_512Bits uniConvertI32toI16_2x8;\n\ _viv_uniform VXC_512Bits uniGetMaskShift_2x8;\n\ -_viv_uniform VXC_512Bits uniDFPtoFp32_left_4x4;\n\ -_viv_uniform VXC_512Bits uniRightSubLeft_4x4;\n\ -_viv_uniform float dfpScale;\n\ _viv_uniform float half_pixel_value;\n\ \n\ __kernel void resize_bilinear_I16toI16_UP\n\ @@ -46253,24 +49423,24 @@ __kernel void resize_bilinear_I16toI16_UP\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ VXC_OP4(img_load_3d, src3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 1),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + short inputZP;\n\ + _viv_asm(COPY, inputZP, input_ZP, 4);\n\ \n\ vxc_ushort8 bitextract_p0;\n\ vxc_uchar16 maskShift = {16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};\n\ VXC_DP2x8(bitextract_p0, left_x_idx, right_x_idx, \\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertI32toI16_2x8);\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertI32toI16_2x8);\n\ vxc_ushort8 constData = 16;\n\ VXC_DP2x8(maskShift, bitextract_p0, constData, \\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8);\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8);\n\ \n\ int8 output_desc;\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ _viv_asm(MOV, coord_out.w, baseAddr);\n\ \n\ - float4 left4;\n\ - float4 right4;\n\ - float4 top4;\n\ - float4 bottom4;\n\ + float4 left4, right4, top4, bottom4;\n\ \n\ int loop = depth - 1;\n\ while (coord_in.z < loop)\n\ @@ -46289,18 +49459,18 @@ __kernel void resize_bilinear_I16toI16_UP\n\ VXC_OP4(img_load_3d, src3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 1),\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ - VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);\n\ - VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ top4 = right4 * 
x_lerp + left4;\n\ \n\ - VXC_DP4x4(left4, bottom, bottom, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);\n\ + VXC_DP4x4(left4, bottom, inputZP, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ VXC_DP4x4(right4, bottom, bottom, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ bottom4 = right4 * x_lerp + left4;\n\ bottom4 -= top4;\n\ float4 dst4 = bottom4 * y_lerp + top4;\n\ - dst4 = dst4 * dfpScale;\n\ + dst4 = dst4 * uint8Scale + output_ZP;\n\ int4 dst = convert_int4_rte(dst4);\n\ \n\ VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ @@ -46313,17 +49483,17 @@ __kernel void resize_bilinear_I16toI16_UP\n\ VXC_BitExtract(dst1, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ _viv_asm(COPY, top, dst0, 16);\n\ _viv_asm(COPY, bottom, dst1, 16);\n\ - VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);\n\ - VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ top4 = right4 * x_lerp + left4;\n\ - VXC_DP4x4(left4, bottom, bottom, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);\n\ + VXC_DP4x4(left4, bottom, inputZP, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ VXC_DP4x4(right4, bottom, bottom, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ bottom4 = right4 * x_lerp + left4;\n\ bottom4 -= top4;\n\ float4 dst4 = bottom4 * y_lerp + top4;\n\ - dst4 = dst4 * dfpScale;\n\ + dst4 = dst4 * uint8Scale + output_ZP;\n\ int4 dst = convert_int4_rte(dst4);\n\ VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ @@ -46378,25 +49548,25 @@ __kernel void resize_bilinear_I16toI16_DOWN\n\ VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ - float4 left4;\n\ - float4 right4;\n\ - float4 top4;\n\ - float4 bottom4;\n\ + short inputZP;\n\ + _viv_asm(COPY, inputZP, input_ZP, 4);\n\ \n\ - VXC_DP4x4(left4, top, top, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);\n\ - VXC_DP4x4(right4, top, top, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ + float4 left4, right4, top4, bottom4;\n\ +\n\ + VXC_DP4x4(left4, top, inputZP, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, top, inputZP, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ top4 = right4 * x_lerp + left4;\n\ \n\ - VXC_DP4x4(left4, bottom, bottom, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);\n\ - VXC_DP4x4(right4, bottom, bottom, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ + VXC_DP4x4(left4, bottom, inputZP, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, bottom, inputZP, \\\n\ + 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ bottom4 = right4 * x_lerp + left4;\n\ bottom4 -= top4;\n\ float4 dst4 = bottom4 * y_lerp + top4;\n\ - dst4 = dst4 * dfpScale;\n\ + dst4 = dst4 * uint8Scale + output_ZP;\n\ int4 dst = convert_int4_rte(dst4);\n\ \n\ VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ @@ -46407,21 +49577,23 @@ __kernel void resize_bilinear_I16toI16_DOWN\n\ _viv_asm(MOV, coord_out.w, baseAddr);\n\ \n\ VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result,\n\ - VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ }\n\ \n\ "; /* end of resize_bilinear_I16_vx*/ static const char resize_bilinear_I8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_left_4x4;\n\ +_viv_uniform VXC_512Bits uniU8RightSubLeft_4x4;\n\ _viv_uniform VXC_512Bits uniExtact8Bit_2x8;\n\ _viv_uniform float2 scale_xy;\n\ _viv_uniform int depth;\n\ +_viv_uniform int input_ZP;\n\ +_viv_uniform float uint8Scale;\n\ +_viv_uniform float output_ZP;\n\ _viv_uniform VXC_512Bits uniConvertI32toI16_2x8;\n\ _viv_uniform VXC_512Bits uniGetMaskShift_2x8;\n\ -_viv_uniform VXC_512Bits uniDFPtoFp32_left_4x4;\n\ -_viv_uniform VXC_512Bits uniRightSubLeft_4x4;\n\ -_viv_uniform float dfpScale;\n\ _viv_uniform float half_pixel_value;\n\ \n\ __kernel void resize_bilinear_I8toI8_UP\n\ @@ -46465,14 +49637,17 @@ __kernel void resize_bilinear_I8toI8_UP\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + short inputZP;\n\ + _viv_asm(COPY, inputZP, input_ZP, 4);\n\ \n\ vxc_ushort8 bitextract_p0;\n\ vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};\n\ VXC_DP2x8(bitextract_p0, left_x_idx, right_x_idx, \\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertI32toI16_2x8);\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertI32toI16_2x8);\n\ vxc_ushort8 constData = 8;\n\ VXC_DP2x8(maskShift, bitextract_p0, constData, \\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8);\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8);\n\ \n\ int8 output_desc;\n\ _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ @@ -46498,22 +49673,22 @@ __kernel void resize_bilinear_I8toI8_UP\n\ VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ \n\ - VXC_DP4x4(left4, top, top, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);\n\ + VXC_DP4x4(left4, top, inputZP, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ VXC_DP4x4(right4, top, top, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ \n\ top4 = right4 * x_lerp + left4;\n\ \n\ - VXC_DP4x4(left4, bottom, bottom, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);\n\ + VXC_DP4x4(left4, bottom, inputZP, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ VXC_DP4x4(right4, bottom, bottom, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ \n\ bottom4 = right4 * x_lerp + left4;\n\ bottom4 -= top4;\n\ float4 dst4 = bottom4 * y_lerp + top4;\n\ - 
dst4 = dst4 * dfpScale;\n\ + dst4 = dst4 * uint8Scale + output_ZP;\n\ int4 dst = convert_int4_rte(dst4);\n\ VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ \n\ @@ -46525,19 +49700,19 @@ __kernel void resize_bilinear_I8toI8_UP\n\ VXC_BitExtract(dst1, src1, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ _viv_asm(COPY, top, dst0, 16);\n\ _viv_asm(COPY, bottom, dst1, 16);\n\ - VXC_DP4x4(left4, top, top, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);\n\ + VXC_DP4x4(left4, top, inputZP, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ VXC_DP4x4(right4, top, top, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ top4 = right4 * x_lerp + left4;\n\ - VXC_DP4x4(left4, bottom, bottom, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);\n\ + VXC_DP4x4(left4, bottom, inputZP, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ VXC_DP4x4(right4, bottom, bottom, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ bottom4 = right4 * x_lerp + left4;\n\ bottom4 -= top4;\n\ float4 dst4 = bottom4 * y_lerp + top4;\n\ - dst4 = dst4 * dfpScale;\n\ + dst4 = dst4 * uint8Scale + output_ZP;\n\ int4 dst = convert_int4_rte(dst4);\n\ VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ @@ -46587,26 +49762,29 @@ __kernel void resize_bilinear_I8toI8_DOWN\n\ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + short inputZP;\n\ + _viv_asm(COPY, inputZP, input_ZP, 4);\n\ \n\ float4 left4;\n\ float4 right4;\n\ float4 top4;\n\ float4 bottom4;\n\ \n\ - VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);\n\ - VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ top4 = right4 * x_lerp + left4;\n\ \n\ - VXC_DP4x4(left4, bottom, bottom, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4);\n\ - VXC_DP4x4(right4, bottom, bottom, \\\n\ - VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4);\n\ + VXC_DP4x4(left4, bottom, inputZP, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, bottom, inputZP, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ bottom4 = right4 * x_lerp + left4;\n\ \n\ bottom4 -= top4;\n\ float4 dst4 = bottom4 * y_lerp + top4;\n\ \n\ - dst4 = dst4 * dfpScale;\n\ + dst4 = dst4 * uint8Scale + output_ZP;\n\ \n\ int4 dst = convert_int4_rte(dst4);\n\ \n\ @@ -46620,6 +49798,286 @@ __kernel void resize_bilinear_I8toI8_DOWN\n\ }\n\ "; /* end of resize_bilinear_I8_vx*/ +static const char resize_bilinear_U16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniU8SubZPtoFp32_left_4x4;\n\ +_viv_uniform VXC_512Bits 
uniU8RightSubLeft_4x4;\n\ +_viv_uniform VXC_512Bits uniExtact8Bit_2x8;\n\ +_viv_uniform float2 scale_xy;\n\ +_viv_uniform int depth;\n\ +_viv_uniform int input_ZP;\n\ +_viv_uniform float uint8Scale;\n\ +_viv_uniform float output_ZP;\n\ +_viv_uniform VXC_512Bits uniConvertI32toI16_2x8;\n\ +_viv_uniform VXC_512Bits uniGetMaskShift_2x8;\n\ +_viv_uniform float half_pixel_value;\n\ +\n\ +__kernel void resize_bilinear_U16toF16_DOWN\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3);\n\ + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx - half_pixel_value;\n\ + float4 left_x_f = floor(in_x);\n\ + float4 x_lerp = in_x - left_x_f;\n\ + int4 left_x_idx = convert_int4(left_x_f);\n\ + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value;\n\ + float top_y_f = floor(in_y);\n\ + float y_lerp = in_y - top_y_f;\n\ + int top_y_idx = convert_int(top_y_f);\n\ + vxc_ushort8 top, bottom;\n\ + int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.y;\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.z;\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.w;\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float4 left4, right4, top4, bottom4;\n\ +\n\ + short inputZP;\n\ + _viv_asm(COPY, inputZP, input_ZP, 4);\n\ + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ + top4 = right4 * x_lerp + left4;\n\ +\n\ + VXC_DP4x4(left4, bottom, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, bottom, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ + bottom4 = right4 * x_lerp + left4;\n\ +\n\ + bottom4 -= top4;\n\ + float4 dst4 = bottom4 * y_lerp + top4;\n\ +\n\ + dst4 *= uint8Scale;\n\ +\n\ + half4 dst;\n\ + _viv_asm(CONV, dst, dst4);\n\ +\n\ + vxc_short8 dst_short;\n\ + _viv_asm(COPY, dst_short, dst, 16);\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + 
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst_short.s0246,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void resize_bilinear_U16toU16_UP\n\ + (\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3);\n\ + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx - half_pixel_value;\n\ +\n\ + float4 left_x_f = floor(in_x);\n\ + float4 x_lerp = in_x - left_x_f;\n\ + int4 left_x_idx = convert_int4(left_x_f);\n\ + float4 right_x_f = ceil(in_x);\n\ + int4 right_x_idx = convert_int4(right_x_f);\n\ +\n\ + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value;\n\ +\n\ + float top_y_f = floor(in_y);\n\ + float y_lerp = in_y - top_y_f;\n\ + int top_y_idx = convert_int(top_y_f);\n\ +\n\ + vxc_ushort8 src0, src1, src2, src3;\n\ +\n\ + vxc_ushort8 top;\n\ + vxc_ushort8 bottom;\n\ +\n\ + int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ +\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src2, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 1),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + short inputZP;\n\ + _viv_asm(COPY, inputZP, input_ZP, 4);\n\ +\n\ + vxc_ushort8 bitextract_p0;\n\ + vxc_uchar16 maskShift = {16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16};\n\ + VXC_DP2x8(bitextract_p0, left_x_idx, right_x_idx, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertI32toI16_2x8);\n\ + vxc_ushort8 constData = 16;\n\ + VXC_DP2x8(maskShift, bitextract_p0, constData, \\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniGetMaskShift_2x8);\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ + float4 left4, right4, top4, bottom4;\n\ +\n\ + int loop = depth - 1;\n\ + while (coord_in.z < loop)\n\ + {\n\ + VXC_BitExtract(top, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_BitExtract(bottom, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.zw += (int2)(1, input_desc.s4);\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src2, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src3, input, coord_in.xywz, VXC_5BITOFFSET_XY(8, 1),\n\ + VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ + top4 = right4 * x_lerp + left4;\n\ +\n\ + VXC_DP4x4(left4, bottom, inputZP, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ + bottom4 = right4 * x_lerp + left4;\n\ + bottom4 -= top4;\n\ + float4 dst4 = bottom4 * y_lerp + top4;\n\ + dst4 = dst4 * uint8Scale + output_ZP;\n\ + int4 dst = convert_int4_rte(dst4);\n\ + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.zw += (int2)(1, output_desc.s4);\n\ + }\n\ +\n\ + VXC_BitExtract(top, src0, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_BitExtract(bottom, src2, src3, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ + top4 = right4 * x_lerp + left4;\n\ +\n\ + VXC_DP4x4(left4, bottom, inputZP, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, bottom, bottom, \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ + bottom4 = right4 * x_lerp + left4;\n\ + bottom4 -= top4;\n\ + float4 dst4 = bottom4 * y_lerp + top4;\n\ + dst4 = dst4 * uint8Scale + output_ZP;\n\ + int4 dst = convert_int4_rte(dst4);\n\ + VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void resize_bilinear_U16toU16_DOWN\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_x = coord_out.xxxx + (int4)(0, 1, 2, 3);\n\ + float4 in_x = (convert_float4(coord_x) + half_pixel_value) * scale_xy.xxxx - half_pixel_value;\n\ + float4 left_x_f = floor(in_x);\n\ + float4 x_lerp = in_x - left_x_f;\n\ + int4 left_x_idx = convert_int4(left_x_f);\n\ + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_xy.y - half_pixel_value;\n\ + float top_y_f = floor(in_y);\n\ + float y_lerp = in_y - top_y_f;\n\ + int top_y_idx = convert_int(top_y_f);\n\ + vxc_ushort8 top, bottom, result;\n\ + int4 coord_in = (int4)(left_x_idx.x, top_y_idx, coord_out.z, 0);\n\ +\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_in.w, baseAddr);\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.y;\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(2, 3, 0, 
VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.z;\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = left_x_idx.w;\n\ + VXC_OP4(img_load_3d, top, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, bottom, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + float4 left4, right4, top4, bottom4;\n\ +\n\ + short inputZP;\n\ + _viv_asm(COPY, inputZP, input_ZP, 4);\n\ + VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ + top4 = right4 * x_lerp + left4;\n\ +\n\ + VXC_DP4x4(left4, bottom, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ + VXC_DP4x4(right4, bottom, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ + bottom4 = right4 * x_lerp + left4;\n\ +\n\ + bottom4 -= top4;\n\ + float4 dst4 = bottom4 * y_lerp + top4;\n\ +\n\ + dst4 = dst4 * uint8Scale + output_ZP;\n\ + int4 dst = convert_int4_rte(dst4);\n\ +\n\ + VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8);\n\ +\n\ + int8 output_desc;\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc));\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0;\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result,\n\ + VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of resize_bilinear_U16_vx*/ + static const char resize_bilinear_U8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniU8SubZPtoFp32_left_4x4;\n\ @@ -46754,6 +50212,9 @@ __kernel void resize_bilinear_U8toU8_UP\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + short inputZP;\n\ + _viv_asm(COPY, inputZP, input_ZP, 4);\n\ \n\ vxc_ushort8 bitextract_p0;\n\ vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};\n\ @@ -46785,8 +50246,6 @@ __kernel void resize_bilinear_U8toU8_UP\n\ VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ \n\ - unsigned char inputZP;\n\ - _viv_asm(COPY, inputZP, input_ZP, 4);\n\ VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ top4 = right4 * x_lerp + left4;\n\ @@ -46807,8 +50266,7 @@ __kernel void resize_bilinear_U8toU8_UP\n\ \n\ VXC_BitExtract(top, src0, src0, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ VXC_BitExtract(bottom, src1, src1, maskShift, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - unsigned char inputZP;\n\ - _viv_asm(COPY, inputZP, input_ZP, 4);\n\ +\n\ VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4);\n\ VXC_DP4x4(right4, top, top, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4);\n\ top4 = right4 * x_lerp + left4;\n\ @@ -47825,6 +51283,277 @@ __kernel void resize_bilinear_nhwc_bound_U8toU8_4x\n\ }\n\ "; /* end of resize_bilinear_nhwc_bound_vx*/ +static const char resize_cubic_vx[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ +\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform float input_tail;\n\ +_viv_uniform float output_scale;\n\ +_viv_uniform float output_tail;\n\ +_viv_uniform VXC_512Bits uniFp16ToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniExtractHalf8_2x8;\n\ +_viv_uniform VXC_512Bits uniExtract8Bit_2x8;\n\ +\n\ +#define RESIZE_CUBIC_PART0 \\\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + int4 coord_index = coord_out; \\\n\ + int2 coord_scalew = (int2)(4 * get_global_id(0), 0); \\\n\ + int2 coord_scaleh = (int2)(4 * get_global_id(1), 0); \\\n\ + float4 cubic_coeffs_y; \\\n\ + float4 cubic_coeffs_x; \\\n\ + int4 coord_in = (int4)(0, 0, coord_out.z, 0); \\\n\ + float4 src0_f,src1_f,src2_f,src3_f; \\\n\ + float4 dst = (float4)(0,0,0,0); \\\n\ + float sum[4]; \\\n\ + int i = 0; \\\n\ + \\\n\ + Image scalew = create_image_from_image2d(scale_w, 4); \\\n\ + Image scaleh = create_image_from_image2d(scale_h, 4); \\\n\ + \\\n\ + uchar* scale_w_ptr = get_image_ptr_from_coord(scalew, coord_scalew); \\\n\ + __global float* scale_x = (__global float*)scale_w_ptr; \\\n\ + \\\n\ + uchar* scale_h_ptr = get_image_ptr_from_coord(scaleh, coord_scaleh); \\\n\ + __global float* scale_y = (__global float*)scale_h_ptr; \\\n\ + cubic_coeffs_y = vload4(0, scale_y); \\\n\ + \\\n\ + int index_y = read_imagei(index_h, coord_index.yw).x; \\\n\ + coord_in.y = index_y; \\\n\ + int8 input_desc, output_desc; \\\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ + int baseAddr = (int)coord_in.z * input_desc.s4 + input_desc.s0; \\\n\ + _viv_asm(MOV, coord_in.w, baseAddr); \\\n\ + _viv_asm(COPY, output_desc, output, sizeof(output_desc)); \\\n\ + baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \\\n\ + _viv_asm(MOV, coord_out.w, baseAddr);\n\ +\n\ +#define RESIZE_CUBIC_16Bitsto16Bits(name,src_type,dst_type,temp_type) \\\n\ +__kernel void resize_cubic_##name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + __read_only image2d_t scale_w, \\\n\ + __read_only image2d_t scale_h, \\\n\ + __read_only image2d_t index_w, \\\n\ + __read_only image2d_t index_h \\\n\ + ) \\\n\ +{ \\\n\ + RESIZE_CUBIC_PART0; \\\n\ + src_type src0_h,src1_h,src2_h,src3_h; \\\n\ + vxc_short4 src0,src1,src2,src3; \\\n\ + for (i = 0; i < 4; i++) \\\n\ + { \\\n\ + coord_in.x = read_imagei(index_w, coord_index.xw).x; \\\n\ + cubic_coeffs_x = vload4(i, scale_x); \\\n\ + coord_index.x = coord_index.x + 1; \\\n\ + \\\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, src2, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, src3, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + _viv_asm(COPY, src0_h, src0, 8); \\\n\ + _viv_asm(COPY, src1_h, src1, 8); \\\n\ + _viv_asm(COPY, src2_h, src2, 8); \\\n\ + 
_viv_asm(COPY, src3_h, src3, 8); \\\n\ + \\\n\ + VXC_DP4x4(src0_f, src0_h, src0_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \\\n\ + VXC_DP4x4(src1_f, src1_h, src1_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \\\n\ + VXC_DP4x4(src2_f, src2_h, src2_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \\\n\ + VXC_DP4x4(src3_f, src3_h, src3_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \\\n\ + \\\n\ + dst = src0_f * cubic_coeffs_y.x \\\n\ + + src1_f * cubic_coeffs_y.y \\\n\ + + src2_f * cubic_coeffs_y.z \\\n\ + + src3_f * cubic_coeffs_y.w; \\\n\ + sum[i] = dot(dst, cubic_coeffs_x); \\\n\ + } \\\n\ + float4 sum_f = (float4)(sum[0],sum[1],sum[2],sum[3]); \\\n\ + temp_type tmpout; \\\n\ + _viv_asm(CONV,tmpout,sum_f); \\\n\ + dst_type out_h; \\\n\ + vxc_short4 out; \\\n\ + VXC_DP2x8(out_h, tmpout, tmpout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8); \\\n\ + _viv_asm(COPY, out, out_h, 8); \\\n\ + \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, out, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +RESIZE_CUBIC_16Bitsto16Bits(F16toF16,vxc_half4, vxc_half4, half4)\n\ +RESIZE_CUBIC_16Bitsto16Bits(I16toI16,vxc_short4,vxc_short4,short4)\n\ +RESIZE_CUBIC_16Bitsto16Bits(F16toI16,vxc_half4, vxc_short4,short4)\n\ +RESIZE_CUBIC_16Bitsto16Bits(I16toF16,vxc_short4,vxc_half4, half4)\n\ +\n\ +\n\ +#define RESIZE_CUBIC_Quant8toQuant8(name,data_type) \\\n\ +__kernel void resize_cubic_##name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + __read_only image2d_t scale_w, \\\n\ + __read_only image2d_t scale_h, \\\n\ + __read_only image2d_t index_w, \\\n\ + __read_only image2d_t index_h \\\n\ + ) \\\n\ +{ \\\n\ + RESIZE_CUBIC_PART0; \\\n\ + data_type src0,src1,src2,src3; \\\n\ + for (i = 0; i < 4; i++) \\\n\ + { \\\n\ + coord_in.x = read_imagei(index_w, coord_index.xw).x; \\\n\ + cubic_coeffs_x = vload4(i, scale_x); \\\n\ + coord_index.x = coord_index.x + 1; \\\n\ + \\\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, src2, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, src3, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_DP4x4(src0_f, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \\\n\ + VXC_DP4x4(src1_f, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \\\n\ + VXC_DP4x4(src2_f, src2, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \\\n\ + VXC_DP4x4(src3_f, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \\\n\ + src0_f = src0_f * input_scale + input_tail; \\\n\ + src1_f = src1_f * input_scale + input_tail; \\\n\ + src2_f = src2_f * input_scale + input_tail; \\\n\ + src3_f = src3_f * input_scale + input_tail; \\\n\ + \\\n\ + dst = src0_f * cubic_coeffs_y.x \\\n\ + + src1_f * cubic_coeffs_y.y \\\n\ + + src2_f * cubic_coeffs_y.z \\\n\ + + src3_f * cubic_coeffs_y.w; \\\n\ + sum[i] = dot(dst, cubic_coeffs_x); \\\n\ + sum[i] = sum[i] * output_scale + output_tail; \\\n\ + } \\\n\ + float4 sum_f = (float4)(sum[0],sum[1],sum[2],sum[3]); \\\n\ + 
int4 tmpout; \\\n\ + _viv_asm(CONV,tmpout,sum_f); \\\n\ + data_type out; \\\n\ + VXC_DP2x8(out, tmpout, tmpout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtract8Bit_2x8); \\\n\ + \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, out, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +RESIZE_CUBIC_Quant8toQuant8(U8toU8,vxc_uchar4)\n\ +RESIZE_CUBIC_Quant8toQuant8(I8toI8,vxc_char4 )\n\ +\n\ +#define RESIZE_CUBIC_F16toQuant8(name,dst_type) \\\n\ +__kernel void resize_cubic_##name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + __read_only image2d_t scale_w, \\\n\ + __read_only image2d_t scale_h, \\\n\ + __read_only image2d_t index_w, \\\n\ + __read_only image2d_t index_h \\\n\ + ) \\\n\ +{ \\\n\ + RESIZE_CUBIC_PART0; \\\n\ + vxc_half4 src0_h,src1_h,src2_h,src3_h; \\\n\ + vxc_short4 src0,src1,src2,src3; \\\n\ + for (i = 0; i < 4; i++) \\\n\ + { \\\n\ + coord_in.x = read_imagei(index_w, coord_index.xw).x; \\\n\ + cubic_coeffs_x = vload4(i, scale_x); \\\n\ + coord_index.x = coord_index.x + 1; \\\n\ + \\\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, src2, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, src3, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + _viv_asm(COPY, src0_h, src0, 8); \\\n\ + _viv_asm(COPY, src1_h, src1, 8); \\\n\ + _viv_asm(COPY, src2_h, src2, 8); \\\n\ + _viv_asm(COPY, src3_h, src3, 8); \\\n\ + \\\n\ + VXC_DP4x4(src0_f, src0_h, src0_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \\\n\ + VXC_DP4x4(src1_f, src1_h, src1_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \\\n\ + VXC_DP4x4(src2_f, src2_h, src2_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \\\n\ + VXC_DP4x4(src3_f, src3_h, src3_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \\\n\ + \\\n\ + dst = src0_f * cubic_coeffs_y.x \\\n\ + + src1_f * cubic_coeffs_y.y \\\n\ + + src2_f * cubic_coeffs_y.z \\\n\ + + src3_f * cubic_coeffs_y.w; \\\n\ + sum[i] = dot(dst, cubic_coeffs_x); \\\n\ + sum[i] = sum[i] * output_scale + output_tail; \\\n\ + } \\\n\ + float4 sum_f = (float4)(sum[0],sum[1],sum[2],sum[3]); \\\n\ + int4 tmpout; \\\n\ + _viv_asm(CONV,tmpout,sum_f); \\\n\ + dst_type out; \\\n\ + VXC_DP2x8(out, tmpout, tmpout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtract8Bit_2x8); \\\n\ + \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, out, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +RESIZE_CUBIC_F16toQuant8(F16toU8,vxc_uchar4)\n\ +RESIZE_CUBIC_F16toQuant8(F16toI8,vxc_char4)\n\ +\n\ +#define RESIZE_CUBIC_Quant8toF16(name,src_type) \\\n\ +__kernel void resize_cubic_##name( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + __read_only image2d_t scale_w, \\\n\ + __read_only image2d_t scale_h, \\\n\ + __read_only image2d_t index_w, \\\n\ + __read_only image2d_t index_h \\\n\ + ) \\\n\ +{ \\\n\ + RESIZE_CUBIC_PART0; \\\n\ + src_type src0,src1,src2,src3; \\\n\ + for (i = 0; i < 4; i++) \\\n\ + { \\\n\ + coord_in.x = read_imagei(index_w, coord_index.xw).x; \\\n\ + cubic_coeffs_x = vload4(i, scale_x); \\\n\ 
+ coord_index.x = coord_index.x + 1; \\\n\ + \\\n\ + VXC_OP4(img_load_3d, src0, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 0), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, src1, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 1), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, src2, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 2), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_OP4(img_load_3d, src3, input, coord_in.xywz, VXC_5BITOFFSET_XY(0, 3), \\\n\ + VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_DP4x4(src0_f, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \\\n\ + VXC_DP4x4(src1_f, src1, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \\\n\ + VXC_DP4x4(src2_f, src2, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \\\n\ + VXC_DP4x4(src3_f, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16ToFp32_4x4); \\\n\ + \\\n\ + src0_f = src0_f * input_scale + input_tail; \\\n\ + src1_f = src1_f * input_scale + input_tail; \\\n\ + src2_f = src2_f * input_scale + input_tail; \\\n\ + src3_f = src3_f * input_scale + input_tail; \\\n\ + \\\n\ + dst = src0_f * cubic_coeffs_y.x \\\n\ + + src1_f * cubic_coeffs_y.y \\\n\ + + src2_f * cubic_coeffs_y.z \\\n\ + + src3_f * cubic_coeffs_y.w; \\\n\ + sum[i] = dot(dst, cubic_coeffs_x); \\\n\ + } \\\n\ + float4 sum_f = (float4)(sum[0],sum[1],sum[2],sum[3]); \\\n\ + half4 tmpout; \\\n\ + _viv_asm(CONV,tmpout,sum_f); \\\n\ + vxc_half4 out_h; \\\n\ + vxc_short4 out; \\\n\ + VXC_DP2x8(out_h, tmpout, tmpout, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtractHalf8_2x8); \\\n\ + _viv_asm(COPY, out, out_h, 8); \\\n\ + \\\n\ + VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, out, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +RESIZE_CUBIC_Quant8toF16(U8toF16,vxc_uchar4)\n\ +RESIZE_CUBIC_Quant8toF16(I8toF16,vxc_char4)"; /* end of resize_cubic_vx*/ + static const char resize_nearest_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniMultiplyAndPostShift_2x8;\n\ @@ -49593,6 +53322,379 @@ SCATTER_ND_UPDATE_COPY(F16, vxc_short8, 2, short)\n\ SCATTER_ND_UPDATE_COPY(BF16, vxc_short8, 2, short)\n\ "; /* end of scatter_nd_update_qint_vx*/ +static const char scatter_nd_update_reduction_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int update_width;\n\ +_viv_uniform int output_width;\n\ +\n\ +_viv_uniform int4 coord_stride;\n\ +_viv_uniform int4 coord_stride1;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +\n\ +_viv_uniform int input_zp;\n\ +_viv_uniform float input_scale;\n\ +_viv_uniform int update_zp;\n\ +_viv_uniform float update_scale;\n\ +_viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvert2ndU8SubZpToFp32_4x4;\n\ +\n\ +inline void AtomicAdd_float(volatile __global float *source, const float operand)\n\ +{\n\ + union\n\ + {\n\ + unsigned int intVal;\n\ + float floatVal;\n\ + } newVal;\n\ + union\n\ + {\n\ + unsigned int intVal;\n\ + float floatVal;\n\ + } prevVal;\n\ + do\n\ + {\n\ + prevVal.floatVal = *source;\n\ + newVal.floatVal = prevVal.floatVal + operand;\n\ + } while(atomic_cmpxchg((volatile __global unsigned int *)source,\n\ + prevVal.intVal, newVal.intVal) != prevVal.intVal);\n\ +}\n\ +\n\ +inline void AtomicMul_float(volatile __global float *source, const float operand)\n\ +{\n\ + union\n\ + {\n\ + unsigned int intVal;\n\ + float floatVal;\n\ + } newVal;\n\ + 
union\n\ + {\n\ + unsigned int intVal;\n\ + float floatVal;\n\ + } prevVal;\n\ + do\n\ + {\n\ + prevVal.floatVal = *source;\n\ + newVal.floatVal = prevVal.floatVal * operand;\n\ + } while(atomic_cmpxchg((volatile __global unsigned int *)source,\n\ + prevVal.intVal, newVal.intVal) != prevVal.intVal);\n\ +}\n\ +\n\ +inline void AtomicMax_float(volatile __global float *source, const float operand)\n\ +{\n\ + union\n\ + {\n\ + unsigned int intVal;\n\ + float floatVal;\n\ + } newVal;\n\ + union\n\ + {\n\ + unsigned int intVal;\n\ + float floatVal;\n\ + } prevVal;\n\ + do\n\ + {\n\ + prevVal.floatVal = *source;\n\ + newVal.floatVal = fmax(prevVal.floatVal, operand);\n\ + } while(atomic_cmpxchg((volatile __global unsigned int *)source,\n\ + prevVal.intVal, newVal.intVal) != prevVal.intVal);\n\ +}\n\ +\n\ +inline void AtomicMin_float(volatile __global float *source, const float operand)\n\ +{\n\ + union\n\ + {\n\ + unsigned int intVal;\n\ + float floatVal;\n\ + } newVal;\n\ + union\n\ + {\n\ + unsigned int intVal;\n\ + float floatVal;\n\ + } prevVal;\n\ + do\n\ + {\n\ + prevVal.floatVal = *source;\n\ + newVal.floatVal = fmin(prevVal.floatVal, operand);\n\ + } while(atomic_cmpxchg((volatile __global unsigned int *)source,\n\ + prevVal.intVal, newVal.intVal) != prevVal.intVal);\n\ +}\n\ +\n\ +#define SCATTER_REDUCTION_PREPROCESS(name0, ptr0, type0, len0, size0, ptr2) \\\n\ +__kernel void scatter_nd_update_reduction_preprocess_##name0( \\\n\ + __read_only image2d_t input_ref, \\\n\ + image2d_t temp_buf_float, \\\n\ + int length, int res) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + Image img1 = create_image_from_image2d(input_ref, size0); \\\n\ + Image img2 = create_image_from_image2d(temp_buf_float, 4); \\\n\ + __global float* tmp_ref_ptr = (__global float*)img2.ptr; \\\n\ + type0 src; \\\n\ + float4 tmpDst0, tmpDst1; \\\n\ + short zp = input_zp; \\\n\ + if(length > 0) \\\n\ + { \\\n\ + __global ptr0* input_ptr = (__global ptr0*)img1.ptr; \\\n\ + ptr0 tmpData = input_ptr[gidx]; \\\n\ + int loc2 = gidx * 8; \\\n\ + _viv_asm(COPY, src, tmpData, len0); \\\n\ + VXC_DP4x4(tmpDst0, src, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvert1stUint8SubZpToFp32_4x4); \\\n\ + VXC_DP4x4(tmpDst1, src, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvert2ndU8SubZpToFp32_4x4); \\\n\ + tmpDst0 *= input_scale; \\\n\ + tmpDst1 *= input_scale; \\\n\ + vstore4(tmpDst0, 0, tmp_ref_ptr + loc2); \\\n\ + vstore4(tmpDst1, 1, tmp_ref_ptr + loc2); \\\n\ + } \\\n\ + __global ptr2* input_ptr1 = (__global ptr2*)img1.ptr; \\\n\ + for(int i = gidx; i < res; i += get_global_size(0)) \\\n\ + { \\\n\ + ptr2 tmpData1 = input_ptr1[length + i]; \\\n\ + _viv_asm(COPY, src, tmpData1, 4); \\\n\ + VXC_DP4x4(tmpDst0, src, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvert1stUint8SubZpToFp32_4x4); \\\n\ + tmp_ref_ptr[length + i] = tmpDst0.x; \\\n\ + } \\\n\ +}\n\ +SCATTER_REDUCTION_PREPROCESS(U8, vxc_uchar8, vxc_uchar8, 8, 1, uchar)\n\ +SCATTER_REDUCTION_PREPROCESS(I8, vxc_char8, vxc_char8, 8, 1, char)\n\ +SCATTER_REDUCTION_PREPROCESS(I16, vxc_short8, vxc_short8, 16, 2, short)\n\ +SCATTER_REDUCTION_PREPROCESS(F16, vxc_short8, vxc_half8, 16, 2, short)\n\ +\n\ +#define SCATTER_ND_REDUCTION_PROCESS_F16(name0, func) \\\n\ +__kernel void scatter_nd_update_reduction_##name0##_F16( \\\n\ + __read_only image2d_t index, \\\n\ + __read_only image2d_t update, \\\n\ + image2d_t temp_buf_float, \\\n\ + image2d_t link_buffer0, \\\n\ + int width, int area, int vol, int val4, \\\n\ + int val5, int val6, 
int val7, int coord_dim) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + Image img1 = create_image_from_image2d(index, 4); \\\n\ + Image img2 = create_image_from_image2d(update, 2); \\\n\ + Image img3 = create_image_from_image2d(temp_buf_float, 4); \\\n\ + __global int* index_ptr = (__global int*)img1.ptr; \\\n\ + __global short* update_ptr = (__global short*)img2.ptr; \\\n\ + __global float* output_ptr = (__global float*)img3.ptr; \\\n\ + half src; \\\n\ + \\\n\ + int4 indice = vload4(0, index_ptr + gidy * coord_dim); \\\n\ + int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \\\n\ + short tmpData = update_ptr[gidy * update_width + gidx]; \\\n\ + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; \\\n\ + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; \\\n\ + int loc = idx * output_width + gidx; \\\n\ + _viv_asm(COPY, src, tmpData, 4); \\\n\ + float data; \\\n\ + _viv_asm(CONV, data, src); \\\n\ + func(output_ptr + loc, data); \\\n\ +}\n\ +SCATTER_ND_REDUCTION_PROCESS_F16(Add, AtomicAdd_float)\n\ +SCATTER_ND_REDUCTION_PROCESS_F16(Mul, AtomicMul_float)\n\ +SCATTER_ND_REDUCTION_PROCESS_F16(Max, AtomicMax_float)\n\ +SCATTER_ND_REDUCTION_PROCESS_F16(Min, AtomicMin_float)\n\ +\n\ +#define SCATTER_ND_REDUCTION_PROCESS_BF16(name0, func) \\\n\ +__kernel void scatter_nd_update_reduction_##name0##_BF16( \\\n\ + __read_only image2d_t index, \\\n\ + __read_only image2d_t update, \\\n\ + image2d_t temp_buf_float, \\\n\ + image2d_t link_buffer0, \\\n\ + int width, int area, int vol, int val4, \\\n\ + int val5, int val6, int val7, int coord_dim) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + Image img1 = create_image_from_image2d(index, 4); \\\n\ + Image img2 = create_image_from_image2d(update, 2); \\\n\ + Image img3 = create_image_from_image2d(temp_buf_float, 4); \\\n\ + __global int* index_ptr = (__global int*)img1.ptr; \\\n\ + __global short* update_ptr = (__global short*)img2.ptr; \\\n\ + __global float* output_ptr = (__global float*)img3.ptr; \\\n\ + half src; \\\n\ + \\\n\ + int4 indice = vload4(0, index_ptr + gidy * coord_dim); \\\n\ + int4 indice1 = coord_dim < 5 ? 
(int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \\\n\ + short tmpData = update_ptr[gidy * update_width + gidx]; \\\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); \\\n\ + vxc_short8 src0, src1; \\\n\ + float data; \\\n\ + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; \\\n\ + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; \\\n\ + int loc = idx * output_width + gidx; \\\n\ + _viv_asm(COPY, src0, tmpData, 4); \\\n\ + VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), \\\n\ + uniConvBF16toF32_Part0_2x8); \\\n\ + _viv_asm(COPY, data, src1, 4); \\\n\ + func(output_ptr + loc, data); \\\n\ +}\n\ +SCATTER_ND_REDUCTION_PROCESS_BF16(Add, AtomicAdd_float)\n\ +SCATTER_ND_REDUCTION_PROCESS_BF16(Mul, AtomicMul_float)\n\ +SCATTER_ND_REDUCTION_PROCESS_BF16(Max, AtomicMax_float)\n\ +SCATTER_ND_REDUCTION_PROCESS_BF16(Min, AtomicMin_float)\n\ +\n\ +#define SCATTER_ND_UPDATE_PROCESS_QINT(name0, src0_type, data_type, ptr_type, element_size, func) \\\n\ +__kernel void scatter_nd_update_reduction_##name0##_##src0_type( \\\n\ + __read_only image2d_t index, \\\n\ + __read_only image2d_t update, \\\n\ + image2d_t temp_buf_float, \\\n\ + image2d_t link_buffer0, \\\n\ + int width, int area, int vol, int val4, \\\n\ + int val5, int val6, int val7, int coord_dim) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + Image img1 = create_image_from_image2d(index, 4); \\\n\ + Image img2 = create_image_from_image2d(update, element_size); \\\n\ + Image img3 = create_image_from_image2d(temp_buf_float, 4); \\\n\ + __global int* index_ptr = (__global int*)img1.ptr; \\\n\ + __global ptr_type* update_ptr = (__global ptr_type*)img2.ptr; \\\n\ + __global float* output_ptr = (__global float*)img3.ptr; \\\n\ + data_type src; \\\n\ + \\\n\ + int4 indice = vload4(0, index_ptr + gidy * coord_dim); \\\n\ + int4 indice1 = coord_dim < 5 ? 
(int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \\\n\ + ptr_type tmpData = update_ptr[gidy * update_width + gidx]; \\\n\ + short zp = update_zp; \\\n\ + int4 tmpOffset = indice * coord_stride + indice1 * coord_stride1; \\\n\ + int idx = tmpOffset.x + tmpOffset.y + tmpOffset.z + tmpOffset.w; \\\n\ + int loc = idx * output_width + gidx; \\\n\ + _viv_asm(COPY, src, tmpData, 4); \\\n\ + vxc_float4 data; \\\n\ + VXC_DP4x4(data, src, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvert1stUint8SubZpToFp32_4x4); \\\n\ + data.x *= update_scale; \\\n\ + func(output_ptr + loc, data.x); \\\n\ +}\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Add, U8, vxc_uchar8, uchar, 1, AtomicAdd_float)\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Mul, U8, vxc_uchar8, uchar, 1, AtomicMul_float)\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Max, U8, vxc_uchar8, uchar, 1, AtomicMax_float)\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Min, U8, vxc_uchar8, uchar, 1, AtomicMin_float)\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Add, I8, vxc_char8, char, 1, AtomicAdd_float)\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Mul, I8, vxc_char8, char, 1, AtomicMul_float)\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Max, I8, vxc_char8, char, 1, AtomicMax_float)\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Min, I8, vxc_char8, char, 1, AtomicMin_float)\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Add, I16, vxc_short8, short, 2, AtomicAdd_float)\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Mul, I16, vxc_short8, short, 2, AtomicMul_float)\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Max, I16, vxc_short8, short, 2, AtomicMax_float)\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Min, I16, vxc_short8, short, 2, AtomicMin_float)\n\ +"; /* end of scatter_nd_update_reduction_vx*/ + +static const char scatter_nd_update_reduction_conv_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float output_scale;\n\ +_viv_uniform float output_zp;\n\ +\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractHalf8_2x8;\n\ +\n\ +#define SCATTER_ND_UPDATE_CONV(src0_type, ptr_type, element_size, ptr_type1, conv_func) \\\n\ +__kernel void scatter_nd_update_reduction_conv_##src0_type( \\\n\ + __read_only image2d_t temp_buf_float, \\\n\ + __read_only image2d_t link_buf, \\\n\ + image2d_t output, \\\n\ + int length, int res) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + Image img1 = create_image_from_image2d(temp_buf_float, 4); \\\n\ + Image img2 = create_image_from_image2d(output, element_size); \\\n\ + __global float* input_ptr = (__global float*)img1.ptr; \\\n\ + if(length > 0) \\\n\ + { \\\n\ + __global ptr_type* output_ptr = (__global ptr_type*)img2.ptr; \\\n\ + float4 src0 = vload4(0, input_ptr + gidx * 8); \\\n\ + float4 src1 = vload4(1, input_ptr + gidx * 8); \\\n\ + int4 data0 = convert_int4_rte(src0 * output_scale + output_zp); \\\n\ + int4 data1 = convert_int4_rte(src1 * output_scale + output_zp); \\\n\ + ptr_type dst; \\\n\ + VXC_DP2x8(dst, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + output_ptr[gidx] = dst; \\\n\ + } \\\n\ + __global ptr_type1* output_ptr1 = (__global ptr_type1*)img2.ptr; \\\n\ + for(int i = gidx; i < res; i += get_global_size(0)) \\\n\ + { \\\n\ + float src = input_ptr[length + i]; \\\n\ + int data = convert_int_rte(src * output_scale + output_zp); \\\n\ + output_ptr1[length + i] = conv_func(data); \\\n\ + } \\\n\ +}\n\ +SCATTER_ND_UPDATE_CONV(U8, vxc_uchar8, 1, uchar, convert_uchar)\n\ +SCATTER_ND_UPDATE_CONV(I8, vxc_char8, 1, char, convert_char)\n\ 
+SCATTER_ND_UPDATE_CONV(I16, vxc_short8, 2, short, convert_short)\n\ +\n\ +__kernel void scatter_nd_update_reduction_conv_F16(\n\ + __read_only image2d_t temp_buf_float,\n\ + __read_only image2d_t link_buf,\n\ + image2d_t output,\n\ + int length, int res)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + Image img1 = create_image_from_image2d(temp_buf_float, 4);\n\ + Image img2 = create_image_from_image2d(output, 2);\n\ + __global float* input_ptr = (__global float*)img1.ptr;\n\ + if(length > 0)\n\ + {\n\ + __global vxc_short8* output_ptr = (__global vxc_short8*)img2.ptr;\n\ + float4 src0 = vload4(0, input_ptr + gidx * 8);\n\ + float4 src1 = vload4(1, input_ptr + gidx * 8);\n\ + half4 data0, data1;\n\ + _viv_asm(CONV, data0, src0);\n\ + _viv_asm(CONV, data1, src1);\n\ + vxc_half8 tmp;\n\ + vxc_short8 dst;\n\ + VXC_DP2x8(tmp, data0, data1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtractHalf8_2x8);\n\ + _viv_asm(COPY, dst, tmp, 16);\n\ + output_ptr[gidx] = dst;\n\ + }\n\ + __global short* output_ptr1 = (__global short*)img2.ptr;\n\ + for(int i = gidx; i < res; i += get_global_size(0))\n\ + {\n\ + float src = input_ptr[length + i];\n\ + half data;\n\ + _viv_asm(CONV, data, src);\n\ + short dst;\n\ + _viv_asm(COPY, dst, data, 4);\n\ + output_ptr1[length + i] = dst;\n\ + }\n\ +}\n\ +\n\ +__kernel void scatter_nd_update_reduction_conv_BF16(\n\ + __read_only image2d_t temp_buf_float,\n\ + __read_only image2d_t link_buf,\n\ + image2d_t output,\n\ + int length, int res)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + Image img1 = create_image_from_image2d(temp_buf_float, 4);\n\ + Image img2 = create_image_from_image2d(output, 2);\n\ + __global float* input_ptr = (__global float*)img1.ptr;\n\ + if(length > 0)\n\ + {\n\ + __global vxc_short8* output_ptr = (__global vxc_short8*)img2.ptr;\n\ + float4 src0 = vload4(0, input_ptr + gidx * 8);\n\ + float4 src1 = vload4(1, input_ptr + gidx * 8);\n\ + vxc_short8 dst0, dst1, dst;\n\ + _viv_asm(COPY, dst0, src0, 16);\n\ + _viv_asm(COPY, dst1, src1, 16);\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + output_ptr[gidx] = dst;\n\ + }\n\ + __global short* output_ptr1 = (__global short*)img2.ptr;\n\ + for(int i = gidx; i < res; i += get_global_size(0))\n\ + {\n\ + float src = input_ptr[length + i];\n\ + vxc_short8 data;\n\ + _viv_asm(COPY, data, src, 4);\n\ + output_ptr1[length + i] = data.x;\n\ + }\n\ +}\n\ +"; /* end of scatter_nd_update_reduction_conv_vx*/ + static const char scatter_nd_update_special_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniU8MulAndPostShift0_Lo_2x8;\n\ @@ -52516,10 +56618,10 @@ do\\\n\ #define VXC_VertMax3_Integer(dst, src0, src1, src2, info)\\\n\ do\\\n\ {\\\n\ - int startBin = (info & VXC_START_BIN_BITMASK) >> 12;\\\n\ - int endBin = (info & VXC_END_BIN_BITMASK) >> 8;\\\n\ - int sourceBin = (info & VXC_SOURCE_BIN_BITMASK) >> 4;\\\n\ - int mod1 = VXC_MODIFIER_CLAMP(startBin, endBin, sourceBin, 0);\\\n\ + constant int startBin = (info & VXC_START_BIN_BITMASK) >> 12;\\\n\ + constant int endBin = (info & VXC_END_BIN_BITMASK) >> 8;\\\n\ + constant int sourceBin = (info & VXC_SOURCE_BIN_BITMASK) >> 4;\\\n\ + constant int mod1 = VXC_MODIFIER_CLAMP(startBin, endBin, sourceBin, 0);\\\n\ typeof (dst) tmp;\\\n\ tmp = max(src0, src1);\\\n\ tmp = max(src2, tmp);\\\n\ @@ -52544,10 +56646,10 @@ do\\\n\ #define VXC_HorzMax3_Integer(dst, src0, info)\\\n\ do\\\n\ {\\\n\ - int startBin = (info & VXC_START_BIN_BITMASK) >> 12;\\\n\ - int endBin = (info & 
VXC_END_BIN_BITMASK) >> 8;\\\n\ - int sourceBin = (info & VXC_SOURCE_BIN_BITMASK) >> 4;\\\n\ - int clamp = (info & VXC_CLAMP_BITMASK) >> 22;\\\n\ + constant int startBin = (info & VXC_START_BIN_BITMASK) >> 12;\\\n\ + constant int endBin = (info & VXC_END_BIN_BITMASK) >> 8;\\\n\ + constant int sourceBin = (info & VXC_SOURCE_BIN_BITMASK) >> 4;\\\n\ + constant int clamp = (info & VXC_CLAMP_BITMASK) >> 22;\\\n\ int mod1 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Max, clamp);\\\n\ VXC_OP4(filter, dst, src0, src0, src0, mod1);\\\n\ } while (0)\n\ @@ -52555,12 +56657,12 @@ do\\\n\ #define VXC_HorzMax3_Half(dst, src0, info)\\\n\ do\\\n\ {\\\n\ - int startBin = (info & VXC_START_BIN_BITMASK) >> 12;\\\n\ - int endBin = (info & VXC_END_BIN_BITMASK) >> 8;\\\n\ - int sourceBin = (info & VXC_SOURCE_BIN_BITMASK) >> 4;\\\n\ - int clamp = (info & VXC_CLAMP_BITMASK) >> 22;\\\n\ - int mod1 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Max, clamp);\\\n\ - int mod2 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Min, clamp);\\\n\ + constant int startBin = (info & VXC_START_BIN_BITMASK) >> 12;\\\n\ + constant int endBin = (info & VXC_END_BIN_BITMASK) >> 8;\\\n\ + constant int sourceBin = (info & VXC_SOURCE_BIN_BITMASK) >> 4;\\\n\ + constant int clamp = (info & VXC_CLAMP_BITMASK) >> 22;\\\n\ + constant int mod1 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Max, clamp);\\\n\ + constant int mod2 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Min, clamp);\\\n\ vxc_short8 val0, minVal, maxVal;\\\n\ _viv_asm(COPY, val0, src0, 16);\\\n\ VXC_OP4(filter, maxVal, val0, val0, val0, mod1);\\\n\ @@ -52572,24 +56674,24 @@ do\\\n\ #define VXC_HorzMin3_Integer(dst, src0, info)\\\n\ do\\\n\ {\\\n\ - int startBin = (info & VXC_START_BIN_BITMASK) >> 12;\\\n\ - int endBin = (info & VXC_END_BIN_BITMASK) >> 8;\\\n\ - int sourceBin = (info & VXC_SOURCE_BIN_BITMASK) >> 4;\\\n\ - int clamp = (info & VXC_CLAMP_BITMASK) >> 22;\\\n\ - int mod1 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Min, clamp);\\\n\ + constant int startBin = (info & VXC_START_BIN_BITMASK) >> 12;\\\n\ + constant int endBin = (info & VXC_END_BIN_BITMASK) >> 8;\\\n\ + constant int sourceBin = (info & VXC_SOURCE_BIN_BITMASK) >> 4;\\\n\ + constant int clamp = (info & VXC_CLAMP_BITMASK) >> 22;\\\n\ + constant int mod1 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Min, clamp);\\\n\ VXC_OP4(filter, dst, src0, src0, src0, mod1);\\\n\ } while (0)\n\ \n\ #define VXC_HorzMin3_Half(dst, src0, info)\\\n\ do\\\n\ {\\\n\ - int startBin = (info & VXC_START_BIN_BITMASK) >> 12;\\\n\ - int endBin = (info & VXC_END_BIN_BITMASK) >> 8;\\\n\ - int sourceBin = (info & VXC_SOURCE_BIN_BITMASK) >> 4;\\\n\ - int clamp = (info & VXC_CLAMP_BITMASK) >> 22;\\\n\ - int mod1 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Max, clamp);\\\n\ - int mod2 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Min, clamp);\\\n\ - int mod3 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Median, clamp);\\\n\ + constant int startBin = (info & VXC_START_BIN_BITMASK) >> 12;\\\n\ + constant int endBin = (info & VXC_END_BIN_BITMASK) >> 8;\\\n\ + constant int sourceBin = (info & VXC_SOURCE_BIN_BITMASK) >> 4;\\\n\ + constant int clamp = (info & VXC_CLAMP_BITMASK) >> 22;\\\n\ + constant int mod1 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Max, clamp);\\\n\ + constant int mod2 = VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Min, clamp);\\\n\ + constant int mod3 = 
VXC_MODIFIER_FILTER(startBin, endBin, sourceBin, VXC_FM_Median, clamp);\\\n\ vxc_short8 val0, minVal, maxVal, midVal;\\\n\ _viv_asm(COPY, val0, src0, 16);\\\n\ VXC_OP4(filter, maxVal, val0, val0, val0, mod1);\\\n\ @@ -54928,6 +59030,192 @@ __kernel void clip_U8toF32_2D(\n\ }\n\ "; /* end of clip_U8_cl*/ +static const char crop_and_resize_bilinear_cl[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +\n\ +_viv_uniform float width_scale;\n\ +_viv_uniform float height_scale;\n\ +_viv_uniform int image_width;\n\ +_viv_uniform int image_height;\n\ +\n\ +#define CROP_AND_RESIZE_BILINEAR(name, read_type, dst_type, conv_type, write_type) \\\n\ +__kernel void crop_and_resize_bilinear_##name \\\n\ +( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t boxes, \\\n\ + __read_only image2d_t box_ind, \\\n\ + __write_only image2d_array_t output, \\\n\ + uint ori_depth, \\\n\ + uint ori_batchout, \\\n\ + float inOutScale, \\\n\ + float inOutTile, \\\n\ + float extrapolation_value \\\n\ +) \\\n\ +{ \\\n\ + int bb = get_global_id(2); \\\n\ + int y = get_global_id(1); \\\n\ + int x = get_global_id(0); \\\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + int2 coord_box_ind = (int2)(bb, 0); \\\n\ + int b = read_imagei(box_ind, coord_box_ind).x; \\\n\ + float4 xy; \\\n\ + float in_x, in_y; \\\n\ + int d = 0; \\\n\ + \\\n\ + Image img_boxes = create_image_from_image2d(boxes, 2); \\\n\ + __global half* boxes_ptr = (__global half*)img_boxes.ptr; \\\n\ + xy = vload_half4(bb, boxes_ptr); \\\n\ + float _width_scale = convert_float(xy.w - xy.y) * width_scale; \\\n\ + float _height_scale = convert_float(xy.z - xy.x) * height_scale; \\\n\ + if (_width_scale == 0) xy.y = 0.5 * (xy.y + xy.w); \\\n\ + if (_height_scale == 0) xy.x = 0.5 * (xy.x + xy.z); \\\n\ + in_y = xy.x * convert_float(image_height - 1) + convert_float(y) * _height_scale; \\\n\ + in_x = xy.y * convert_float(image_width - 1) + convert_float(x) * _width_scale; \\\n\ + float y_lerp = in_y - floor(in_y); \\\n\ + float x_lerp = in_x - floor(in_x); \\\n\ + float4 src0, src1, src2, src3; \\\n\ + for (d = 0; d < ori_depth; d++) \\\n\ + { \\\n\ + int4 coord = (int4)(floor(in_x), floor(in_y), d + b * ori_depth, 0); \\\n\ + if (coord.x < 0 || coord.x > image_width - 1 || coord.y < 0 || coord.y > image_height - 1) \\\n\ + { \\\n\ + src0 = (float4)(extrapolation_value,0,0,0); \\\n\ + } \\\n\ + else \\\n\ + { \\\n\ + src0 = convert_float4(read_type(input, coord)); \\\n\ + } \\\n\ + coord.x = coord.x + 1; \\\n\ + if (coord.x < 0 || coord.x > image_width - 1 || coord.y < 0 || coord.y > image_height - 1) \\\n\ + { \\\n\ + src1 = (float4)(extrapolation_value,0,0,0); \\\n\ + } \\\n\ + else \\\n\ + { \\\n\ + src1 = convert_float4(read_type(input, coord)); \\\n\ + } \\\n\ + coord.y = coord.y + 1; \\\n\ + if (coord.x < 0 || coord.x > image_width - 1 || coord.y < 0 || coord.y > image_height - 1) \\\n\ + { \\\n\ + src3 = (float4)(extrapolation_value,0,0,0); \\\n\ + } \\\n\ + else \\\n\ + { \\\n\ + src3 = convert_float4(read_type(input, coord)); \\\n\ + } \\\n\ + coord.x = coord.x - 1; \\\n\ + if (coord.x < 0 || coord.x > image_width - 1 || coord.y < 0 || coord.y > image_height - 1) \\\n\ + { \\\n\ + src2 = (float4)(extrapolation_value,0,0,0); \\\n\ + } \\\n\ + else \\\n\ + { \\\n\ + src2 = convert_float4(read_type(input, coord)); \\\n\ + } \\\n\ + float4 top = src0 + (src1 - src0) * x_lerp; \\\n\ + float4 bottom = src2 + (src3 - src2) * x_lerp; \\\n\ + float4 
value = top + (bottom - top) * y_lerp; \\\n\ + value = value * inOutScale + inOutTile; \\\n\ + dst_type dst = conv_type(value); \\\n\ + coord_out.z = d + coord_out.z * ori_depth; \\\n\ + write_type(output, coord_out, dst); \\\n\ + } \\\n\ +}\n\ +\n\ +CROP_AND_RESIZE_BILINEAR(U32toU32,read_imageui, \\\n\ +uint4, convert_uint4, write_imageui)\n\ +CROP_AND_RESIZE_BILINEAR(U32toF32,read_imageui, \\\n\ +float4,convert_float4,write_imagef)\n\ +CROP_AND_RESIZE_BILINEAR(F32toF32,read_imagef, \\\n\ +float4, convert_float4,write_imagef)\n\ +CROP_AND_RESIZE_BILINEAR(F32toU32,read_imagef, \\\n\ +uint4, convert_uint4, write_imageui)\n\ +CROP_AND_RESIZE_BILINEAR(F32toI32,read_imagef, \\\n\ +int4, convert_int4, write_imagei)\n\ +CROP_AND_RESIZE_BILINEAR(I32toI32,read_imagei, \\\n\ +int4, convert_int4, write_imagei)\n\ +CROP_AND_RESIZE_BILINEAR(I32toF32,read_imagei, \\\n\ +float4,convert_float4,write_imagef)"; /* end of crop_and_resize_bilinear_cl*/ + +static const char crop_and_resize_nearest_neighbor_cl[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +\n\ +_viv_uniform float width_scale;\n\ +_viv_uniform float height_scale;\n\ +_viv_uniform int image_width;\n\ +_viv_uniform int image_height;\n\ +\n\ +#define CROP_AND_RESIZE_NEAREST_NEIGHTBOR(name,src_type, read_type, dst_type, conv_type, write_type) \\\n\ +__kernel void crop_and_resize_nearest_neighbor_##name \\\n\ +( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t boxes, \\\n\ + __read_only image2d_t box_ind, \\\n\ + __write_only image2d_array_t output, \\\n\ + uint ori_depth, \\\n\ + uint ori_batchout, \\\n\ + float inOutScale, \\\n\ + float inOutTile, \\\n\ + float extrapolation_value \\\n\ +) \\\n\ +{ \\\n\ + int bb = get_global_id(2); \\\n\ + int y = get_global_id(1); \\\n\ + int x = get_global_id(0); \\\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + int2 coord_box_ind = (int2)(bb, 0); \\\n\ + int b = read_imagei(box_ind, coord_box_ind).x; \\\n\ + float4 xy; \\\n\ + int in_x, in_y, d = 0; \\\n\ + \\\n\ + Image img_boxes = create_image_from_image2d(boxes, 2); \\\n\ + __global half* boxes_ptr = (__global half*)img_boxes.ptr; \\\n\ + xy = vload_half4(bb, boxes_ptr); \\\n\ + float _width_scale = convert_float(xy.w - xy.y) * width_scale; \\\n\ + float _height_scale = convert_float(xy.z - xy.x) * height_scale; \\\n\ + if (_width_scale == 0) xy.y = 0.5 * (xy.y + xy.w); \\\n\ + if (_height_scale == 0) xy.x = 0.5 * (xy.x + xy.z); \\\n\ + in_y = convert_int(round(xy.x * convert_float(image_height - 1) \\\n\ + + convert_float(y) * _height_scale)); \\\n\ + in_x = convert_int(round(xy.y * convert_float(image_width - 1) \\\n\ + + convert_float(x) * _width_scale)); \\\n\ + for (d = 0; d < ori_depth; d++) \\\n\ + { \\\n\ + int4 coord = (int4)(in_x, in_y, d + b * ori_depth, 0); \\\n\ + float4 src_f; \\\n\ + if (coord.x < 0 || coord.x > image_width - 1 || coord.y < 0 || coord.y > image_height - 1) \\\n\ + { \\\n\ + src_f = (float4)(extrapolation_value, 0, 0, 0); \\\n\ + } \\\n\ + else \\\n\ + { \\\n\ + src_type src = read_type(input, coord); \\\n\ + src_f = convert_float4(src); \\\n\ + } \\\n\ + src_f = src_f * inOutScale + inOutTile; \\\n\ + dst_type dst = conv_type(src_f); \\\n\ + coord_out.z = d + coord_out.z * ori_depth; \\\n\ + write_type(output, coord_out, dst); \\\n\ + } \\\n\ +}\n\ +\n\ +CROP_AND_RESIZE_NEAREST_NEIGHTBOR(U32toU32,uint4, \\\n\ +read_imageui, uint4, convert_uint4, write_imageui)\n\ 
+CROP_AND_RESIZE_NEAREST_NEIGHTBOR(U32toF32,uint4, \\\n\ +read_imageui, float4,convert_float4,write_imagef)\n\ +CROP_AND_RESIZE_NEAREST_NEIGHTBOR(F32toF32,float4, \\\n\ +read_imagef, float4,convert_float4,write_imagef)\n\ +CROP_AND_RESIZE_NEAREST_NEIGHTBOR(F32toU32,float4, \\\n\ +read_imagef, uint4, convert_uint4, write_imageui)\n\ +CROP_AND_RESIZE_NEAREST_NEIGHTBOR(F32toI32,float4, \\\n\ +read_imagef, int4, convert_int4, write_imagei)\n\ +CROP_AND_RESIZE_NEAREST_NEIGHTBOR(I32toI32,int4, \\\n\ +read_imagei, int4, convert_int4, write_imagei)\n\ +CROP_AND_RESIZE_NEAREST_NEIGHTBOR(I32toF32,int4, \\\n\ +read_imagei, float4,convert_float4,write_imagef)"; /* end of crop_and_resize_nearest_neighbor_cl*/ + static const char cumsum_cl[] = "__kernel void cumsum_F32toF32_axis2(\n\ __read_only image2d_array_t input,\n\ __write_only image2d_array_t output,\n\ @@ -56334,6 +60622,11 @@ float eltwise_unary_inverse_sigmoid(float x, float alpha, float beta)\n\ return log(x1 / x2);\n\ }\n\ \n\ +float eltwise_unary_tan(float x, float alpha, float beta)\n\ +{\n\ + return native_tan(x);\n\ +}\n\ +\n\ \n\ #define ELTWISE_UNARY_F32_2D(func_name) \\\n\ __kernel void func_name##_F32toF32_2D \\\n\ @@ -56376,6 +60669,7 @@ ELTWISE_UNARY_F32_2D(atan)\n\ ELTWISE_UNARY_F32_2D(atanh)\n\ ELTWISE_UNARY_F32_2D(acosh)\n\ ELTWISE_UNARY_F32_2D(inverse_sigmoid)\n\ +ELTWISE_UNARY_F32_2D(tan)\n\ \n\ #define ELTWISE_UNARY_U8_2D(func_name) \\\n\ __kernel void func_name##_U8toU8_2D \\\n\ @@ -56419,6 +60713,7 @@ ELTWISE_UNARY_U8_2D(atan)\n\ ELTWISE_UNARY_U8_2D(atanh)\n\ ELTWISE_UNARY_U8_2D(acosh)\n\ ELTWISE_UNARY_U8_2D(inverse_sigmoid)\n\ +ELTWISE_UNARY_U8_2D(tan)\n\ \n\ #define ELTWISE_UNARY_U8toF32_2D(func_name) \\\n\ __kernel void func_name##_U8toF32_2D \\\n\ @@ -56461,6 +60756,7 @@ ELTWISE_UNARY_U8toF32_2D(atan)\n\ ELTWISE_UNARY_U8toF32_2D(atanh)\n\ ELTWISE_UNARY_U8toF32_2D(acosh)\n\ ELTWISE_UNARY_U8toF32_2D(inverse_sigmoid)\n\ +ELTWISE_UNARY_U8toF32_2D(tan)\n\ \n\ __kernel void neg_I32toI32_2D\n\ (\n\ @@ -56660,6 +60956,11 @@ float eltwise_unary_inverse_sigmoid(float x, float alpha, float beta)\n\ return log(x1 / x2);\n\ }\n\ \n\ +float eltwise_unary_tan(float x, float alpha, float beta)\n\ +{\n\ + return native_tan(x);\n\ +}\n\ +\n\ #define ELTWISE_UNARY_F32(func_name) \\\n\ __kernel void func_name##_F32toF32 \\\n\ ( \\\n\ @@ -56701,6 +61002,7 @@ ELTWISE_UNARY_F32(atan)\n\ ELTWISE_UNARY_F32(atanh)\n\ ELTWISE_UNARY_F32(acosh)\n\ ELTWISE_UNARY_F32(inverse_sigmoid)\n\ +ELTWISE_UNARY_F32(tan)\n\ \n\ #define ELTWISE_UNARY_U8(func_name) \\\n\ __kernel void func_name##_U8toU8 \\\n\ @@ -56744,6 +61046,7 @@ ELTWISE_UNARY_U8(atan)\n\ ELTWISE_UNARY_U8(atanh)\n\ ELTWISE_UNARY_U8(acosh)\n\ ELTWISE_UNARY_U8(inverse_sigmoid)\n\ +ELTWISE_UNARY_U8(tan)\n\ \n\ #define ELTWISE_UNARY_U8toF32(func_name) \\\n\ __kernel void func_name##_U8toF32 \\\n\ @@ -56786,6 +61089,7 @@ ELTWISE_UNARY_U8toF32(atan)\n\ ELTWISE_UNARY_U8toF32(atanh)\n\ ELTWISE_UNARY_U8toF32(acosh)\n\ ELTWISE_UNARY_U8toF32(inverse_sigmoid)\n\ +ELTWISE_UNARY_U8toF32(tan)\n\ \n\ __kernel void neg_I32toI32\n\ (\n\ @@ -59180,7 +63484,8 @@ __kernel void grucell_activation_z_h_U8_F32toU8_##act_name( \\\n\ __read_only image2d_t hstate_h_conv, \\\n\ __write_only image2d_t output, \\\n\ __write_only image2d_t hstate_out, \\\n\ - float input_scale, float input_tail, float output_scale, float output_zp) \\\n\ + float input_scale, float input_tail, float output_scale, float output_zp, \\\n\ + float output_scale1, float output_zp1) \\\n\ { \\\n\ int2 coord_in = (int2)(get_global_id(0), 
get_global_id(1)); \\\n\ float4 src0, src1, src2, src3; \\\n\ @@ -59197,10 +63502,12 @@ __kernel void grucell_activation_z_h_U8_F32toU8_##act_name( \\\n\ z.x = act_func(z.x); \\\n\ h = tanh_func(h.x); \\\n\ float4 dst = (1 - z ) * h + z * h_tm; \\\n\ - dst = dst * output_scale + output_zp; \\\n\ - uint4 result = convert_uint4_sat_rte(dst); \\\n\ + float4 out0 = dst * output_scale + output_zp; \\\n\ + float4 out1 = dst * output_scale1 + output_zp1; \\\n\ + uint4 result = convert_uint4_sat_rte(out0); \\\n\ + uint4 result1 = convert_uint4_sat_rte(out1); \\\n\ write_imageui(output, coord_in.xy, result); \\\n\ - write_imageui(hstate_out, coord_in.xy, result); \\\n\ + write_imageui(hstate_out, coord_in.xy, result1); \\\n\ }\n\ GRUCELL_ACTIVATION_U8_F32_U8(SIGMOID, sigmoid)\n\ //GRUCELL_ACTIVATION_U8_F32_U8(HARD_SIGMOID, hard_sigmoid)\n\ @@ -59214,7 +63521,8 @@ __kernel void grucell_activation_z_h_F32_F32toF32_##act_name( \\\n\ __read_only image2d_t hstate_h_conv, \\\n\ __write_only image2d_t output, \\\n\ __write_only image2d_t hstate_out, \\\n\ - float input_scale, float input_tail, float output_scale, float output_zp) \\\n\ + float input_scale, float input_tail, float output_scale, float output_zp, \\\n\ + float output_scale1, float output_zp1) \\\n\ { \\\n\ int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \\\n\ float4 src0, src1, src2, src3; \\\n\ @@ -59246,7 +63554,8 @@ __kernel void grucell_activation_z_h_I32_F32toI32_##act_name( \\\n\ __read_only image2d_t hstate_h_conv, \\\n\ __write_only image2d_t output, \\\n\ __write_only image2d_t hstate_out, \\\n\ - float input_scale, float input_tail, float output_scale, float output_zp) \\\n\ + float input_scale, float input_tail, float output_scale, float output_zp, \\\n\ + float output_scale1, float output_zp1) \\\n\ { \\\n\ int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \\\n\ float4 src0, src1, src2, src3; \\\n\ @@ -59263,13 +63572,16 @@ __kernel void grucell_activation_z_h_I32_F32toI32_##act_name( \\\n\ z.x = act_func(z.x); \\\n\ h = tanh_func(h.x); \\\n\ float4 dst = (1 - z ) * h + z * h_tm; \\\n\ - dst = dst * output_scale + output_zp; \\\n\ - int4 result = convert_int4_sat_rte(dst); \\\n\ + float4 out0 = dst * output_scale + output_zp; \\\n\ + float4 out1 = dst * output_scale1 + output_zp1; \\\n\ + int4 result = convert_int4_sat_rte(out0); \\\n\ + int4 result1 = convert_int4_sat_rte(out1); \\\n\ write_imagei(output, coord_in.xy, result); \\\n\ - write_imagei(hstate_out, coord_in.xy, result); \\\n\ + write_imagei(hstate_out, coord_in.xy, result1); \\\n\ }\n\ GRUCELL_ACTIVATION_I32_F32_I32(SIGMOID, sigmoid)\n\ -//GRUCELL_ACTIVATION_U8_F32_U8(HARD_SIGMOID, hard_sigmoid)"; /* end of grucell_activation_z_h_cl*/ +//GRUCELL_ACTIVATION_U8_F32_U8(HARD_SIGMOID, hard_sigmoid)\n\ +"; /* end of grucell_activation_z_h_cl*/ static const char grucell_h_times_activation_r_cl[] = "#define logE (1.44269502f)\n\ #define twoLogE (logE * 2.0f)\n\ @@ -59382,6 +63694,12 @@ float tanh_func(float x)\n\ return 2 * x - 1;\n\ }\n\ \n\ +float relu_func(float x)\n\ +{\n\ + x = x > 0 ? 
x : 0;\n\ + return x;\n\ +}\n\ +\n\ \n\ #define GRUCELL_ACTIVATION_U8_F32_U8(act_name, act_func) \\\n\ __kernel void grucell_reset_after_activation_U8_F32toU8_##act_name( \\\n\ @@ -59423,6 +63741,7 @@ __kernel void grucell_reset_after_activation_U8_F32toU8_##act_name( \\\n\ }\n\ GRUCELL_ACTIVATION_U8_F32_U8(SIGMOID, sigmoid)\n\ //GRUCELL_ACTIVATION_U8_F32_U8(HARD_SIGMOID, hard_sigmoid)\n\ +GRUCELL_ACTIVATION_U8_F32_U8(RELU, relu_func)\n\ \n\ #define GRUCELL_ACTIVATION_F32_F32_F32(act_name, act_func) \\\n\ __kernel void grucell_reset_after_activation_F32_F32toF32_##act_name( \\\n\ @@ -59462,6 +63781,7 @@ __kernel void grucell_reset_after_activation_F32_F32toF32_##act_name( \\\n\ \n\ GRUCELL_ACTIVATION_F32_F32_F32(SIGMOID, sigmoid)\n\ //GRUCELL_ACTIVATION_U8_F32_U8(HARD_SIGMOID, hard_sigmoid)\n\ +GRUCELL_ACTIVATION_F32_F32_F32(RELU, relu_func)\n\ \n\ #define GRUCELL_ACTIVATION_I32_F32_I32(act_name, act_func) \\\n\ __kernel void grucell_reset_after_activation_I32_F32toI32_##act_name( \\\n\ @@ -59502,7 +63822,8 @@ __kernel void grucell_reset_after_activation_I32_F32toI32_##act_name( \\\n\ write_imagei(hstate_out, coord_in.xy, result); \\\n\ }\n\ GRUCELL_ACTIVATION_I32_F32_I32(SIGMOID, sigmoid)\n\ -//GRUCELL_ACTIVATION_U8_F32_U8(HARD_SIGMOID, hard_sigmoid)"; /* end of grucell_reset_after_activation_cl*/ +//GRUCELL_ACTIVATION_U8_F32_U8(HARD_SIGMOID, hard_sigmoid)\n\ +GRUCELL_ACTIVATION_I32_F32_I32(RELU, relu_func)"; /* end of grucell_reset_after_activation_cl*/ static const char hswish_cl[] = "#define HSWISH_F32_F32_PROCESS() \\\n\ float4 src, tmp, dst; \\\n\ @@ -61968,6 +66289,349 @@ __kernel void log_softmax_axis2_BF16toBF16\n\ #undef rlogE\n\ "; /* end of log_softmax_axis2_cl*/ +static const char log_softmax_exceed_axis0_cl[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int width;\n\ +_viv_uniform int height;\n\ +\n\ +\n\ +#define rlogE (0.693147182f)\n\ +float LOG(float x)\n\ +{\n\ + x = log2(x);\n\ + return x * rlogE;\n\ +}\n\ +\n\ +__kernel void log_softmax_exceed_axis0_F32toF32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, float beta,\n\ + float scale, float scaleOut, float zpOut)\n\ +{\n\ + int x = get_global_id(0);\n\ + int z = get_global_id(1);\n\ + int4 coord_in = (int4)(0, 0, z, 0);\n\ + float4 maxValue;\n\ + float4 src, dst = {0.0};\n\ +\n\ + maxValue = read_imagef(input, coord_in);\n\ + for (coord_in.x = 0; coord_in.x < width; coord_in.x++)\n\ + {\n\ + for (coord_in.y = 0; coord_in.y < height; coord_in.y++)\n\ + {\n\ + src = read_imagef(input, coord_in);\n\ + maxValue = maxValue > src ? 
maxValue : src;\n\ + }\n\ + }\n\ +\n\ + // Compute sum.\n\ + float sum = 0.f;\n\ + for (coord_in.x = 0; coord_in.x < width; coord_in.x++)\n\ + {\n\ + for (coord_in.y = 0; coord_in.y < height; coord_in.y++)\n\ + {\n\ + src = read_imagef(input, coord_in);\n\ + sum += exp2((src.x - maxValue.x) * scale);\n\ + }\n\ + }\n\ +\n\ + // Compute result.\n\ + float logSum = LOG(sum);\n\ + for (coord_in.x = 0; coord_in.x < width; coord_in.x++)\n\ + {\n\ + for (coord_in.y = 0; coord_in.y < height; coord_in.y++)\n\ + {\n\ + src = read_imagef(input, coord_in);\n\ + dst.x = (src.x - maxValue.x) * beta - logSum;\n\ + write_imagef(output, coord_in, dst);\n\ + }\n\ + }\n\ +}\n\ +\n\ +__kernel void log_softmax_exceed_axis0_U8toU8(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, float beta,\n\ + float scale, float scaleOut, float zpOut)\n\ +{\n\ + int x = get_global_id(0);\n\ + int z = get_global_id(1);\n\ + int4 coord_in = (int4)(0, 0, z, 0);\n\ + float4 maxValue;\n\ + float4 src;\n\ + uint4 dst = {0};\n\ +\n\ + maxValue = convert_float4(read_imageui(input, coord_in));\n\ + for (coord_in.x = 0; coord_in.x < width; coord_in.x++)\n\ + {\n\ + for (coord_in.y = 0; coord_in.y < height; coord_in.y++)\n\ + {\n\ + src = convert_float4(read_imageui(input, coord_in));\n\ + maxValue = maxValue > src ? maxValue : src;\n\ + }\n\ + }\n\ +\n\ + // Compute sum.\n\ + float sum = 0.f;\n\ + for (coord_in.x = 0; coord_in.x < width; coord_in.x++)\n\ + {\n\ + for (coord_in.y = 0; coord_in.y < height; coord_in.y++)\n\ + {\n\ + src = convert_float4(read_imageui(input, coord_in));\n\ + sum += exp2((src.x - maxValue.x) * scale);\n\ + }\n\ + }\n\ +\n\ + // Compute result.\n\ + float logSum = LOG(sum);\n\ + for (coord_in.x = 0; coord_in.x < width; coord_in.x++)\n\ + {\n\ + for (coord_in.y = 0; coord_in.y < height; coord_in.y++)\n\ + {\n\ + src = convert_float4(read_imageui(input, coord_in));\n\ +\n\ + dst.x = convert_uint(((src.x - maxValue.x) * beta - logSum) * scaleOut + zpOut);\n\ +\n\ + write_imageui(output, coord_in, dst);\n\ + }\n\ + }\n\ +}\n\ +\n\ +\n\ +__kernel void log_softmax_exceed_axis0_BF16toBF16(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, float beta,\n\ + float scale, float scaleOut, float zpOut)\n\ +{\n\ + int x = get_global_id(0);\n\ + int z = get_global_id(1);\n\ + int4 coord_in = (int4)(0, 0, z, 0);\n\ + float4 maxValue, src, dst = {0.0};\n\ + uint4 data, val, out;\n\ +\n\ + data = read_imageui(input, coord_in);\n\ + data = data << 16;\n\ + _viv_asm(COPY, maxValue, data, 16);\n\ + for (coord_in.x = 0; coord_in.x < width; coord_in.x++)\n\ + {\n\ + for (coord_in.y = 0; coord_in.y < height; coord_in.y++)\n\ + {\n\ + data = read_imageui(input, coord_in);\n\ + data = data << 16;\n\ + _viv_asm(COPY, src, data, 16);\n\ +\n\ + maxValue = maxValue > src ? 
maxValue : src;\n\ + }\n\ + }\n\ +\n\ + float sum = 0.f;\n\ + for (coord_in.x = 0; coord_in.x < width; coord_in.x++)\n\ + {\n\ + for (coord_in.y = 0; coord_in.y < height; coord_in.y++)\n\ + {\n\ + data = read_imageui(input, coord_in);\n\ + data = data << 16;\n\ + _viv_asm(COPY, src, data, 16);\n\ +\n\ + sum += exp2((src.x - maxValue.x) * scale);\n\ + }\n\ + }\n\ +\n\ + float logSum = LOG(sum);\n\ + for (coord_in.x = 0; coord_in.x < width; coord_in.x++)\n\ + {\n\ + for (coord_in.y = 0; coord_in.y < height; coord_in.y++)\n\ + {\n\ + data = read_imageui(input, coord_in);\n\ + data = data << 16;\n\ + _viv_asm(COPY, src, data, 16);\n\ +\n\ + dst.x = (src.x - maxValue.x) * beta - logSum;\n\ + _viv_asm(COPY, val, dst, 16);\n\ + out = val >> 16;\n\ + write_imageui(output, coord_in, out);\n\ + }\n\ + }\n\ +}\n\ +#undef rlogE\n\ +"; /* end of log_softmax_exceed_axis0_cl*/ + +static const char log_softmax_exceed_axis1_cl[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int height;\n\ +_viv_uniform int depth;\n\ +\n\ +#define rlogE (0.693147182f)\n\ +\n\ +float LOG(float x)\n\ +{\n\ + x = log2(x);\n\ + return x * rlogE;\n\ +}\n\ +\n\ +__kernel void log_softmax_exceed_axis1_F32toF32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, float beta,\n\ + float scale, float scaleOut, float zpOut)\n\ +{\n\ + int x = get_global_id(0);\n\ + int y = get_global_id(1);\n\ + int4 coord_in = (int4)(x, 0, 0, 0);\n\ + float4 maxValue;\n\ + float4 src, dst = {0.0};\n\ +\n\ + maxValue = read_imagef(input, coord_in);\n\ + for (coord_in.y = 0; coord_in.y < height; coord_in.y++)\n\ + {\n\ + for (coord_in.z = 0; coord_in.z < depth; coord_in.z++)\n\ + {\n\ + src = read_imagef(input, coord_in);\n\ + maxValue = maxValue > src ? maxValue : src;\n\ + }\n\ + }\n\ +\n\ + // Compute sum.\n\ + float sum = 0.f;\n\ + for (coord_in.y = 0; coord_in.y < height; coord_in.y++)\n\ + {\n\ + for (coord_in.z = 0; coord_in.z < depth; coord_in.z++)\n\ + {\n\ + src = read_imagef(input, coord_in);\n\ + sum += exp2((src.x - maxValue.x) * scale);\n\ + }\n\ + }\n\ +\n\ + // Compute result.\n\ + float logSum = LOG(sum);\n\ + for (coord_in.y = 0; coord_in.y < height; coord_in.y++)\n\ + {\n\ + for (coord_in.z = 0; coord_in.z < depth; coord_in.z++)\n\ + {\n\ + src = read_imagef(input, coord_in);\n\ +\n\ + dst.x = (src.x - maxValue.x) * beta - logSum;\n\ + write_imagef(output, coord_in, dst);\n\ + }\n\ + }\n\ +}\n\ +\n\ +__kernel void log_softmax_exceed_axis1_U8toU8(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, float beta,\n\ + float scale, float scaleOut, float zpOut)\n\ +{\n\ + int x = get_global_id(0);\n\ + int y = get_global_id(1);\n\ + int4 coord_in = (int4)(x, 0, 0, 0);\n\ + float4 maxValue;\n\ + float4 src;\n\ + uint4 dst = {0};\n\ +\n\ + maxValue = convert_float4(read_imageui(input, coord_in));\n\ + for (coord_in.y = 0; coord_in.y < height; coord_in.y++)\n\ + {\n\ + for (coord_in.z = 0; coord_in.z < depth; coord_in.z++)\n\ + {\n\ + src = convert_float4(read_imageui(input, coord_in));\n\ +\n\ + maxValue = maxValue > src ? 
maxValue : src;\n\ + }\n\ + }\n\ +\n\ + // Compute sum.\n\ + float sum = 0.f;\n\ + for (coord_in.y = 0; coord_in.y < height; coord_in.y++)\n\ + {\n\ + for (coord_in.z = 0; coord_in.z < depth; coord_in.z++)\n\ + {\n\ + src = convert_float4(read_imageui(input, coord_in));\n\ +\n\ + sum += exp2((src.x - maxValue.x) * scale);\n\ + }\n\ + }\n\ +\n\ + // Compute result.\n\ + float logSum = LOG(sum);\n\ + for (coord_in.y = 0; coord_in.y < height; coord_in.y++)\n\ + {\n\ + for (coord_in.z = 0; coord_in.z < depth; coord_in.z++)\n\ + {\n\ + src = convert_float4(read_imageui(input, coord_in));\n\ +\n\ + dst.x = convert_uint(((src.x - maxValue.x) * beta - logSum) * scaleOut + zpOut);\n\ +\n\ + write_imageui(output, coord_in, dst);\n\ + }\n\ + }\n\ +}\n\ +\n\ +__kernel void log_softmax_exceed_axis1_BF16oBF16(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, float beta,\n\ + float scale, float scaleOut, float zpOut)\n\ +{\n\ + int x = get_global_id(0);\n\ + int y = get_global_id(1);\n\ + int4 coord_in = (int4)(x, 0, 0, 0);\n\ + float4 maxValue, src, dst = {0.0};\n\ + uint4 data, val, out;\n\ +\n\ + data = read_imageui(input, coord_in);\n\ + data = data << 16;\n\ + _viv_asm(COPY, maxValue, data, 16);\n\ + for (coord_in.y = 0; coord_in.y < height; coord_in.y++)\n\ + {\n\ + for (coord_in.z = 0; coord_in.z < depth; coord_in.z++)\n\ + {\n\ + data = read_imageui(input, coord_in);\n\ + data = data << 16;\n\ + _viv_asm(COPY, src, data, 16);\n\ +\n\ + maxValue = maxValue > src ? maxValue : src;\n\ + }\n\ + }\n\ +\n\ + float sum = 0.f;\n\ + for (coord_in.y = 0; coord_in.y < height; coord_in.y++)\n\ + {\n\ + for (coord_in.z = 0; coord_in.z < depth; coord_in.z++)\n\ + {\n\ + data = read_imageui(input, coord_in);\n\ + data = data << 16;\n\ + _viv_asm(COPY, src, data, 16);\n\ +\n\ + sum += exp2((src.x - maxValue.x) * scale);\n\ + }\n\ + }\n\ +\n\ + float logSum = LOG(sum);\n\ + for (coord_in.y = 0; coord_in.y < height; coord_in.y++)\n\ + {\n\ + for (coord_in.z = 0; coord_in.z < depth; coord_in.z++)\n\ + {\n\ + data = read_imageui(input, coord_in);\n\ + data = data << 16;\n\ + _viv_asm(COPY, src, data, 16);\n\ +\n\ + dst.x = (src.x - maxValue.x) * beta - logSum;\n\ +\n\ + _viv_asm(COPY, val, dst, 16);\n\ + out = val >> 16;\n\ +\n\ + write_imageui(output, coord_in, out);\n\ + }\n\ + }\n\ +}\n\ +\n\ +#undef rlogE\n\ +"; /* end of log_softmax_exceed_axis1_cl*/ + static const char logical_not_cl[] = "__kernel void logical_not_I8toI8(\n\ __read_only image2d_array_t input,\n\ __write_only image2d_array_t output)\n\ @@ -65593,7 +70257,135 @@ __kernel void gemm_4x_transa_F32F32toF32_2D(\n\ \n\ }\n\ \n\ +__kernel __attribute__((reqd_work_group_size(1, 64, 1)))\n\ + void gemm_4x_transa_local_F32F32toF32_2D(\n\ + __read_only image2d_t inputA,\n\ + __read_only image2d_t inputB,\n\ + __write_only image2d_t output,\n\ + int M,\n\ + int K,\n\ + int N,\n\ + int ac2zero,\n\ + int bc2zero,\n\ + float scale_a,\n\ + float zp_a,\n\ + float scale_b,\n\ + float zp_b,\n\ + float scale_out,\n\ + float zp_out\n\ + )\n\ +{\n\ + int offset0 = get_global_id(0);\n\ + int lid = get_local_id(1);\n\ \n\ + int stride = 0;\n\ +\n\ + int z = 0;\n\ + int offset1 = M << 2;\n\ + int step = K >> 8;\n\ + int lid2 = lid * 4 * step;\n\ +\n\ + Image in0_tensor = create_image_from_image2d(inputA, 4);\n\ + __global float* in0_ptr0 = (__global float*)in0_tensor.ptr + offset0 + lid2 * M;\n\ + __global float* in0_ptr1 = in0_ptr0 + M;\n\ + __global float* in0_ptr2 = in0_ptr1 + M;\n\ + __global float* in0_ptr3 = in0_ptr2 
+ M;\n\ +\n\ + Image in1_tensor = create_image_from_image2d(inputB, 4);\n\ + __global float* in1_ptr = (__global float*)in1_tensor.ptr + lid2;\n\ +\n\ + Image o_tensor = create_image_from_image2d(output, 4);\n\ + __global float* output_ptr = (__global float*)o_tensor.ptr + offset0;\n\ +\n\ + __local float4 sum_vec4_0[64];\n\ + __local float4 sum_vec4_1[64];\n\ + __local float4 sum_vec4_2[64];\n\ + __local float4 sum_vec4_3[64];\n\ +\n\ + float4 sum0 = (float4)(0.0, 0.0, 0.0, 0.0);\n\ + float4 sum1 = (float4)(0.0, 0.0, 0.0, 0.0);\n\ + float4 sum2 = (float4)(0.0, 0.0, 0.0, 0.0);\n\ + float4 sum3 = (float4)(0.0, 0.0, 0.0, 0.0);\n\ +\n\ + float4 tempA0, tempA1, tempA2, tempA3;\n\ + float4 tempA4, tempA5, tempA6, tempA7;\n\ + float4 tempB0;\n\ +\n\ + for(z = 0; z < step; z++)\n\ + {\n\ + tempB0 = vload4(z, in1_ptr);\n\ + tempA0 = vload4(0, in0_ptr0);\n\ + tempA1 = vload4(0, in0_ptr1);\n\ + tempA2 = vload4(0, in0_ptr2);\n\ + tempA3 = vload4(0, in0_ptr3);\n\ + tempA4 = vload4(1, in0_ptr0);\n\ + tempA5 = vload4(1, in0_ptr1);\n\ + tempA6 = vload4(1, in0_ptr2);\n\ + tempA7 = vload4(1, in0_ptr3);\n\ +\n\ + sum0 = sum0 + tempA0 * tempB0.x;\n\ + sum0 = sum0 + tempA1 * tempB0.y;\n\ + sum0 = sum0 + tempA2 * tempB0.z;\n\ + sum0 = sum0 + tempA3 * tempB0.w;\n\ + sum1 = sum1 + tempA4 * tempB0.x;\n\ + sum1 = sum1 + tempA5 * tempB0.y;\n\ + sum1 = sum1 + tempA6 * tempB0.z;\n\ + sum1 = sum1 + tempA7 * tempB0.w;\n\ +\n\ + tempA0 = vload4(2, in0_ptr0);\n\ + tempA1 = vload4(2, in0_ptr1);\n\ + tempA2 = vload4(2, in0_ptr2);\n\ + tempA3 = vload4(2, in0_ptr3);\n\ + tempA4 = vload4(3, in0_ptr0);\n\ + tempA5 = vload4(3, in0_ptr1);\n\ + tempA6 = vload4(3, in0_ptr2);\n\ + tempA7 = vload4(3, in0_ptr3);\n\ +\n\ + in0_ptr0 = in0_ptr0 + offset1;\n\ + in0_ptr1 = in0_ptr1 + offset1;\n\ + in0_ptr2 = in0_ptr2 + offset1;\n\ + in0_ptr3 = in0_ptr3 + offset1;\n\ +\n\ + sum2 = sum2 + tempA0 * tempB0.x;\n\ + sum2 = sum2 + tempA1 * tempB0.y;\n\ + sum2 = sum2 + tempA2 * tempB0.z;\n\ + sum2 = sum2 + tempA3 * tempB0.w;\n\ + sum3 = sum3 + tempA4 * tempB0.x;\n\ + sum3 = sum3 + tempA5 * tempB0.y;\n\ + sum3 = sum3 + tempA6 * tempB0.z;\n\ + sum3 = sum3 + tempA7 * tempB0.w;\n\ + }\n\ + sum_vec4_0[lid] = sum0;\n\ + sum_vec4_1[lid] = sum1;\n\ + sum_vec4_2[lid] = sum2;\n\ + sum_vec4_3[lid] = sum3;\n\ +\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + for (stride = 32; stride > 0; stride >>= 1)\n\ + {\n\ + if (lid < stride)\n\ + {\n\ + sum_vec4_0[lid] += sum_vec4_0[lid + stride];\n\ + sum_vec4_1[lid] += sum_vec4_1[lid + stride];\n\ + sum_vec4_2[lid] += sum_vec4_2[lid + stride];\n\ + sum_vec4_3[lid] += sum_vec4_3[lid + stride];\n\ + }\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ + }\n\ +\n\ + if (lid == 0)\n\ + {\n\ + sum0 = sum_vec4_0[0];\n\ + sum1 = sum_vec4_1[0];\n\ + sum2 = sum_vec4_2[0];\n\ + sum3 = sum_vec4_3[0];\n\ + vstore4(sum0, 0, output_ptr);\n\ + vstore4(sum1, 1, output_ptr);\n\ + vstore4(sum2, 2, output_ptr);\n\ + vstore4(sum3, 3, output_ptr);\n\ + }\n\ +}\n\ \n\ "; /* end of matrixmul_4x_cl*/ @@ -71548,6 +76340,203 @@ __kernel void resize_bilinear_U8toU8(\n\ }\n\ "; /* end of resize_bilinear_cl*/ +static const char resize_cubic_cl[] = "__kernel void resize_cubic_F32toF32(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float scale_x,\n\ + float scale_y,\n\ + float half_pixel_value\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + float cubic_coeffs_y[4] = {0,0,0,0};\n\ + float cubic_coeffs_x[4] = {0,0,0,0};\n\ + float in_x = (convert_float(coord_out.x) + 
half_pixel_value) * scale_x - half_pixel_value;\n\ + float left_x_f = floor(in_x);\n\ + float4 delta_x = (float4)(0, in_x - left_x_f,0,0);\n\ + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_y - half_pixel_value;\n\ + float top_y_f = floor(in_y);\n\ + float4 delta_y = (float4)(0, in_y - top_y_f,0,0);\n\ + int x_idx = convert_int(left_x_f - 1);\n\ + int y_idx = convert_int(top_y_f - 1);\n\ + int4 coord_in = (int4)(x_idx, y_idx, coord_out.z, 0);\n\ + float data00, data01, data02, data03, data10, data11, data12, data13,\n\ + data20, data21, data22, data23, data30, data31, data32, data33;\n\ +\n\ + delta_x.x = 1 + delta_x.y;\n\ + delta_x.z = 1 - delta_x.y;\n\ + delta_x.w = 2 - delta_x.y;\n\ + cubic_coeffs_x[0] = -0.5 * ((((delta_x.x - 5) * delta_x.x + 8) * delta_x.x) - 4);\n\ + cubic_coeffs_x[1] = (1.5 * delta_x.y - 2.5) * delta_x.y * delta_x.y + 1;\n\ + cubic_coeffs_x[2] = (1.5 * delta_x.z - 2.5) * delta_x.z * delta_x.z + 1;\n\ + cubic_coeffs_x[3] = -0.5 * ((((delta_x.w - 5) * delta_x.w + 8) * delta_x.w) - 4);\n\ + delta_y.x = 1 + delta_y.y;\n\ + delta_y.z = 1 - delta_y.y;\n\ + delta_y.w = 2 - delta_y.y;\n\ + cubic_coeffs_y[0] = -0.5 * ((((delta_y.x - 5) * delta_y.x + 8) * delta_y.x) - 4);\n\ + cubic_coeffs_y[1] = (1.5 * delta_y.y - 2.5) * delta_y.y * delta_y.y + 1;\n\ + cubic_coeffs_y[2] = (1.5 * delta_y.z - 2.5) * delta_y.z * delta_y.z + 1;\n\ + cubic_coeffs_y[3] = -0.5 * ((((delta_y.w - 5) * delta_y.w + 8) * delta_y.w) - 4);\n\ + float4 dst = (float4)(0,0,0,0);\n\ +\n\ + data00 = read_imagef(input, coord_in).x;\n\ + coord_in.x++;\n\ + data10 = read_imagef(input, coord_in).x;\n\ + coord_in.x++;\n\ + data20 = read_imagef(input, coord_in).x;\n\ + coord_in.x++;\n\ + data30 = read_imagef(input, coord_in).x;\n\ +\n\ + coord_in.y++;\n\ + data31 = read_imagef(input, coord_in).x;\n\ + coord_in.x--;\n\ + data21 = read_imagef(input, coord_in).x;\n\ + coord_in.x--;\n\ + data11 = read_imagef(input, coord_in).x;\n\ + coord_in.x--;\n\ + data01 = read_imagef(input, coord_in).x;\n\ +\n\ + coord_in.y++;\n\ + data02 = read_imagef(input, coord_in).x;\n\ + coord_in.x++;\n\ + data12 = read_imagef(input, coord_in).x;\n\ + coord_in.x++;\n\ + data22 = read_imagef(input, coord_in).x;\n\ + coord_in.x++;\n\ + data32 = read_imagef(input, coord_in).x;\n\ +\n\ + coord_in.y++;\n\ + data33 = read_imagef(input, coord_in).x;\n\ + coord_in.x--;\n\ + data23 = read_imagef(input, coord_in).x;\n\ + coord_in.x--;\n\ + data13 = read_imagef(input, coord_in).x;\n\ + coord_in.x--;\n\ + data03 = read_imagef(input, coord_in).x;\n\ +\n\ + dst.x = data00 * cubic_coeffs_x[0] * cubic_coeffs_y[0]\n\ + + data01 * cubic_coeffs_x[0] * cubic_coeffs_y[1]\n\ + + data02 * cubic_coeffs_x[0] * cubic_coeffs_y[2]\n\ + + data03 * cubic_coeffs_x[0] * cubic_coeffs_y[3]\n\ + + data10 * cubic_coeffs_x[1] * cubic_coeffs_y[0]\n\ + + data11 * cubic_coeffs_x[1] * cubic_coeffs_y[1]\n\ + + data12 * cubic_coeffs_x[1] * cubic_coeffs_y[2]\n\ + + data13 * cubic_coeffs_x[1] * cubic_coeffs_y[3]\n\ + + data20 * cubic_coeffs_x[2] * cubic_coeffs_y[0]\n\ + + data21 * cubic_coeffs_x[2] * cubic_coeffs_y[1]\n\ + + data22 * cubic_coeffs_x[2] * cubic_coeffs_y[2]\n\ + + data23 * cubic_coeffs_x[2] * cubic_coeffs_y[3]\n\ + + data30 * cubic_coeffs_x[3] * cubic_coeffs_y[0]\n\ + + data31 * cubic_coeffs_x[3] * cubic_coeffs_y[1]\n\ + + data32 * cubic_coeffs_x[3] * cubic_coeffs_y[2]\n\ + + data33 * cubic_coeffs_x[3] * cubic_coeffs_y[3];\n\ +\n\ + write_imagef(output, coord_out, dst);\n\ +\n\ +}\n\ +\n\ +\n\ +__kernel void resize_cubic_U8toU8(\n\ + __read_only 
image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float scale_x,\n\ + float scale_y,\n\ + float half_pixel_value,\n\ + float in_scale,\n\ + float in_tail,\n\ + float out_scale,\n\ + float out_tail\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + float cubic_coeffs_y[4] = {0,0,0,0};\n\ + float cubic_coeffs_x[4] = {0,0,0,0};\n\ + float in_x = (convert_float(coord_out.x) + half_pixel_value) * scale_x - half_pixel_value;\n\ + float left_x_f = floor(in_x);\n\ + float4 delta_x = (float4)(0, in_x - left_x_f,0,0);\n\ + float in_y = (convert_float(coord_out.y) + half_pixel_value) * scale_y - half_pixel_value;\n\ + float top_y_f = floor(in_y);\n\ + float4 delta_y = (float4)(0, in_y - top_y_f,0,0);\n\ + int x_idx = convert_int(left_x_f - 1);\n\ + int y_idx = convert_int(top_y_f - 1);\n\ + int4 coord_in = (int4)(x_idx, y_idx, coord_out.z, 0);\n\ + float data00, data01, data02, data03, data10, data11, data12, data13,\n\ + data20, data21, data22, data23, data30, data31, data32, data33;\n\ +\n\ + delta_x.x = 1 + delta_x.y;\n\ + delta_x.z = 1 - delta_x.y;\n\ + delta_x.w = 2 - delta_x.y;\n\ + cubic_coeffs_x[0] = -0.5 * ((((delta_x.x - 5) * delta_x.x + 8) * delta_x.x) - 4);\n\ + cubic_coeffs_x[1] = (1.5 * delta_x.y - 2.5) * delta_x.y * delta_x.y + 1;\n\ + cubic_coeffs_x[2] = (1.5 * delta_x.z - 2.5) * delta_x.z * delta_x.z + 1;\n\ + cubic_coeffs_x[3] = -0.5 * ((((delta_x.w - 5) * delta_x.w + 8) * delta_x.w) - 4);\n\ + delta_y.x = 1 + delta_y.y;\n\ + delta_y.z = 1 - delta_y.y;\n\ + delta_y.w = 2 - delta_y.y;\n\ + cubic_coeffs_y[0] = -0.5 * ((((delta_y.x - 5) * delta_y.x + 8) * delta_y.x) - 4);\n\ + cubic_coeffs_y[1] = (1.5 * delta_y.y - 2.5) * delta_y.y * delta_y.y + 1;\n\ + cubic_coeffs_y[2] = (1.5 * delta_y.z - 2.5) * delta_y.z * delta_y.z + 1;\n\ + cubic_coeffs_y[3] = -0.5 * ((((delta_y.w - 5) * delta_y.w + 8) * delta_y.w) - 4);\n\ + float dst = 0;\n\ + uint4 out = (uint4)(0,0,0,0);\n\ +\n\ + data00 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail;\n\ + coord_in.x++;\n\ + data10 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail;\n\ + coord_in.x++;\n\ + data20 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail;\n\ + coord_in.x++;\n\ + data30 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail;\n\ +\n\ + coord_in.y++;\n\ + data31 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail;\n\ + coord_in.x--;\n\ + data21 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail;\n\ + coord_in.x--;\n\ + data11 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail;\n\ + coord_in.x--;\n\ + data01 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail;\n\ +\n\ + coord_in.y++;\n\ + data02 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail;\n\ + coord_in.x++;\n\ + data12 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail;\n\ + coord_in.x++;\n\ + data22 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail;\n\ + coord_in.x++;\n\ + data32 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail;\n\ +\n\ + coord_in.y++;\n\ + data33 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail;\n\ + coord_in.x--;\n\ + data23 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail;\n\ + coord_in.x--;\n\ + data13 = convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail;\n\ + coord_in.x--;\n\ + data03 = 
convert_float(read_imageui(input, coord_in).x) * in_scale + in_tail;\n\ +\n\ + dst = data00 * cubic_coeffs_x[0] * cubic_coeffs_y[0]\n\ + + data01 * cubic_coeffs_x[0] * cubic_coeffs_y[1]\n\ + + data02 * cubic_coeffs_x[0] * cubic_coeffs_y[2]\n\ + + data03 * cubic_coeffs_x[0] * cubic_coeffs_y[3]\n\ + + data10 * cubic_coeffs_x[1] * cubic_coeffs_y[0]\n\ + + data11 * cubic_coeffs_x[1] * cubic_coeffs_y[1]\n\ + + data12 * cubic_coeffs_x[1] * cubic_coeffs_y[2]\n\ + + data13 * cubic_coeffs_x[1] * cubic_coeffs_y[3]\n\ + + data20 * cubic_coeffs_x[2] * cubic_coeffs_y[0]\n\ + + data21 * cubic_coeffs_x[2] * cubic_coeffs_y[1]\n\ + + data22 * cubic_coeffs_x[2] * cubic_coeffs_y[2]\n\ + + data23 * cubic_coeffs_x[2] * cubic_coeffs_y[3]\n\ + + data30 * cubic_coeffs_x[3] * cubic_coeffs_y[0]\n\ + + data31 * cubic_coeffs_x[3] * cubic_coeffs_y[1]\n\ + + data32 * cubic_coeffs_x[3] * cubic_coeffs_y[2]\n\ + + data33 * cubic_coeffs_x[3] * cubic_coeffs_y[3];\n\ + out.x = convert_uint(dst * out_scale + out_tail);\n\ +\n\ + write_imageui(output, coord_out, out);\n\ +}\n\ +"; /* end of resize_cubic_cl*/ + static const char resize_nearest_cl[] = "\n\ #define NEAREST_INDEX_PROCESS() \\\n\ int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ @@ -73239,6 +78228,284 @@ SCATTER_ND_UPDATE(I32, int4, read_imagei, write_imagei)\n\ SCATTER_ND_UPDATE(F32, float4, read_imagef, write_imagef)\n\ "; /* end of scatter_nd_update_cl*/ +static const char scatter_nd_update_reduction_cl[] = "\n\ +inline void AtomicAdd_float(volatile __global float *source, const float operand)\n\ +{\n\ + union\n\ + {\n\ + unsigned int intVal;\n\ + float floatVal;\n\ + } newVal;\n\ + union\n\ + {\n\ + unsigned int intVal;\n\ + float floatVal;\n\ + } prevVal;\n\ + do\n\ + {\n\ + prevVal.floatVal = *source;\n\ + newVal.floatVal = prevVal.floatVal + operand;\n\ + } while(atomic_cmpxchg((volatile __global unsigned int *)source,\n\ + prevVal.intVal, newVal.intVal) != prevVal.intVal);\n\ +}\n\ +\n\ +inline void AtomicMul_float(volatile __global float *source, const float operand)\n\ +{\n\ + union\n\ + {\n\ + unsigned int intVal;\n\ + float floatVal;\n\ + } newVal;\n\ + union\n\ + {\n\ + unsigned int intVal;\n\ + float floatVal;\n\ + } prevVal;\n\ + do\n\ + {\n\ + prevVal.floatVal = *source;\n\ + newVal.floatVal = prevVal.floatVal * operand;\n\ + } while(atomic_cmpxchg((volatile __global unsigned int *)source,\n\ + prevVal.intVal, newVal.intVal) != prevVal.intVal);\n\ +}\n\ +\n\ +inline void AtomicMax_float(volatile __global float *source, const float operand)\n\ +{\n\ + union\n\ + {\n\ + unsigned int intVal;\n\ + float floatVal;\n\ + } newVal;\n\ + union\n\ + {\n\ + unsigned int intVal;\n\ + float floatVal;\n\ + } prevVal;\n\ + do\n\ + {\n\ + prevVal.floatVal = *source;\n\ + newVal.floatVal = fmax(prevVal.floatVal, operand);\n\ + } while(atomic_cmpxchg((volatile __global unsigned int *)source,\n\ + prevVal.intVal, newVal.intVal) != prevVal.intVal);\n\ +}\n\ +\n\ +inline void AtomicMin_float(volatile __global float *source, const float operand)\n\ +{\n\ + union\n\ + {\n\ + unsigned int intVal;\n\ + float floatVal;\n\ + } newVal;\n\ + union\n\ + {\n\ + unsigned int intVal;\n\ + float floatVal;\n\ + } prevVal;\n\ + do\n\ + {\n\ + prevVal.floatVal = *source;\n\ + newVal.floatVal = fmin(prevVal.floatVal, operand);\n\ + } while(atomic_cmpxchg((volatile __global unsigned int *)source,\n\ + prevVal.intVal, newVal.intVal) != prevVal.intVal);\n\ +}\n\ +\n\ +#define SCATTER_REDUCTION_PREPROCESS(name0, ptr0, type0, size0, ptr2) \\\n\ +__kernel 
void scatter_nd_update_reduction_preprocess_##name0( \\\n\ + __read_only image2d_t input_ref, \\\n\ + image2d_t temp_buf_float, \\\n\ + int length, int res, float input_scale, float zp_scale) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + Image img1 = create_image_from_image2d(input_ref, size0); \\\n\ + Image img2 = create_image_from_image2d(temp_buf_float, 4); \\\n\ + __global float* tmp_ref_ptr = (__global float*)img2.ptr; \\\n\ + type0 src0, src1; \\\n\ + float4 tmpDst0, tmpDst1; \\\n\ + __global ptr2* input_ptr = (__global ptr2*)img1.ptr; \\\n\ + if(length > 0) \\\n\ + { \\\n\ + int loc2 = gidx * 8; \\\n\ + ptr0 tmpData0 = vload4(0, input_ptr + loc2); \\\n\ + ptr0 tmpData1 = vload4(1, input_ptr + loc2); \\\n\ + _viv_asm(COPY, src0, tmpData0, 16); \\\n\ + _viv_asm(COPY, src1, tmpData1, 16); \\\n\ + _viv_asm(CONV, tmpDst0, src0); \\\n\ + _viv_asm(CONV, tmpDst1, src1); \\\n\ + tmpDst0 = tmpDst0 * input_scale + zp_scale; \\\n\ + tmpDst1 = tmpDst1 * input_scale + zp_scale; \\\n\ + vstore4(tmpDst0, 0, tmp_ref_ptr + loc2); \\\n\ + vstore4(tmpDst1, 1, tmp_ref_ptr + loc2); \\\n\ + } \\\n\ + for(int i = gidx; i < res; i += get_global_size(0)) \\\n\ + { \\\n\ + ptr2 tmpData0 = input_ptr[length + i]; \\\n\ + _viv_asm(COPY, src0, tmpData0, 4); \\\n\ + _viv_asm(CONV, tmpDst0, src0); \\\n\ + tmpDst0.x = tmpDst0.x * input_scale + zp_scale; \\\n\ + tmp_ref_ptr[length + i] = tmpDst0.x; \\\n\ + } \\\n\ +}\n\ +SCATTER_REDUCTION_PREPROCESS(U8, uchar4, uchar4, 1, uchar)\n\ +SCATTER_REDUCTION_PREPROCESS(I8, char4, char4, 1, char)\n\ +SCATTER_REDUCTION_PREPROCESS(I16, short4, short4, 2, short)\n\ +SCATTER_REDUCTION_PREPROCESS(F16, short4, half4, 2, short)\n\ +SCATTER_REDUCTION_PREPROCESS(F32, float4, float4, 4, float)\n\ +\n\ +#define SCATTER_ND_REDUCTION_PROCESS_F16(name0, func) \\\n\ +__kernel void scatter_nd_update_reduction_##name0##_F16( \\\n\ + __read_only image2d_t index, \\\n\ + __read_only image2d_t update, \\\n\ + image2d_t temp_buf_float, \\\n\ + image2d_t link_buffer0, \\\n\ + int val0, int val1, int val2, int val3, int val4, int val5, int val6, \\\n\ + int coord_dim, int update_width, int output_width, float update_scale, float zp_scale) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + Image img1 = create_image_from_image2d(index, 4); \\\n\ + Image img2 = create_image_from_image2d(update, 2); \\\n\ + Image img3 = create_image_from_image2d(temp_buf_float, 4); \\\n\ + __global int* index_ptr = (__global int*)img1.ptr; \\\n\ + __global short* update_ptr = (__global short*)img2.ptr; \\\n\ + __global float* output_ptr = (__global float*)img3.ptr; \\\n\ + half src; \\\n\ + \\\n\ + int4 indice = vload4(0, index_ptr + gidy * coord_dim); \\\n\ + int4 indice1 = coord_dim < 5 ? 
(int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \\\n\ + short tmpData = update_ptr[gidy * update_width + gidx]; \\\n\ + int idx = indice.x * val0 + indice.y * val1 + indice.z * val2 + indice.w * val3; \\\n\ + idx = idx + indice1.x * val4 + indice1.y * val5 + indice1.z * val6; \\\n\ + int loc = idx * output_width + gidx; \\\n\ + _viv_asm(COPY, src, tmpData, 4); \\\n\ + float data; \\\n\ + _viv_asm(CONV, data, src); \\\n\ + func(output_ptr + loc, data); \\\n\ +}\n\ +SCATTER_ND_REDUCTION_PROCESS_F16(Add, AtomicAdd_float)\n\ +SCATTER_ND_REDUCTION_PROCESS_F16(Mul, AtomicMul_float)\n\ +SCATTER_ND_REDUCTION_PROCESS_F16(Max, AtomicMax_float)\n\ +SCATTER_ND_REDUCTION_PROCESS_F16(Min, AtomicMin_float)\n\ +\n\ +#define SCATTER_ND_UPDATE_PROCESS_QINT(name0, src0_type, ptr_type, element_size, func) \\\n\ +__kernel void scatter_nd_update_reduction_##name0##_##src0_type( \\\n\ + __read_only image2d_t index, \\\n\ + __read_only image2d_t update, \\\n\ + image2d_t temp_buf_float, \\\n\ + image2d_t link_buffer0, \\\n\ + int val0, int val1, int val2, int val3, int val4, int val5, int val6, \\\n\ + int coord_dim, int update_width, int output_width, float update_scale, float zp_scale) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + Image img1 = create_image_from_image2d(index, 4); \\\n\ + Image img2 = create_image_from_image2d(update, element_size); \\\n\ + Image img3 = create_image_from_image2d(temp_buf_float, 4); \\\n\ + __global int* index_ptr = (__global int*)img1.ptr; \\\n\ + __global ptr_type* update_ptr = (__global ptr_type*)img2.ptr; \\\n\ + __global float* output_ptr = (__global float*)img3.ptr; \\\n\ + \\\n\ + int4 indice = vload4(0, index_ptr + gidy * coord_dim); \\\n\ + int4 indice1 = coord_dim < 5 ? (int4)(0) : vload4(1, index_ptr + gidy * coord_dim); \\\n\ + ptr_type tmpData = update_ptr[gidy * update_width + gidx]; \\\n\ + int idx = indice.x * val0 + indice.y * val1 + indice.z * val2 + indice.w * val3; \\\n\ + idx = idx + indice1.x * val4 + indice1.y * val5 + indice1.z * val6; \\\n\ + int loc = idx * output_width + gidx; \\\n\ + float data; \\\n\ + _viv_asm(CONV, data, tmpData); \\\n\ + data = data * update_scale + zp_scale; \\\n\ + func(output_ptr + loc, data); \\\n\ +}\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Add, U8, uchar, 1, AtomicAdd_float)\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Mul, U8, uchar, 1, AtomicMul_float)\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Max, U8, uchar, 1, AtomicMax_float)\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Min, U8, uchar, 1, AtomicMin_float)\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Add, I8, char, 1, AtomicAdd_float)\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Mul, I8, char, 1, AtomicMul_float)\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Max, I8, char, 1, AtomicMax_float)\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Min, I8, char, 1, AtomicMin_float)\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Add, I16, short, 2, AtomicAdd_float)\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Mul, I16, short, 2, AtomicMul_float)\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Max, I16, short, 2, AtomicMax_float)\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Min, I16, short, 2, AtomicMin_float)\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Add, F32, float, 4, AtomicAdd_float)\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Mul, F32, float, 4, AtomicMul_float)\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Max, F32, float, 4, AtomicMax_float)\n\ +SCATTER_ND_UPDATE_PROCESS_QINT(Min, F32, float, 4, AtomicMin_float)"; /* end of scatter_nd_update_reduction_cl*/ + +static const char scatter_nd_update_reduction_conv_cl[] = "__kernel void scatter_nd_update_reduction_conv_F16(\n\ 
+ __read_only image2d_t temp_buf_float,\n\ + __read_only image2d_t link_buf,\n\ + image2d_t output,\n\ + int length, int res, float output_scale, float output_zp)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + Image img1 = create_image_from_image2d(temp_buf_float, 4);\n\ + Image img2 = create_image_from_image2d(output, 2);\n\ + __global float* input_ptr = (__global float*)img1.ptr;\n\ + __global short* output_ptr = (__global short*)img2.ptr;\n\ + if(length > 0)\n\ + {\n\ + int offset = gidx * 8;\n\ + float4 src0 = vload4(0, input_ptr + offset);\n\ + float4 src1 = vload4(1, input_ptr + offset);\n\ + half4 data0, data1;\n\ + _viv_asm(CONV, data0, src0);\n\ + _viv_asm(CONV, data1, src1);\n\ + short4 dst0, dst1;\n\ + _viv_asm(COPY, dst0, data0, 16);\n\ + _viv_asm(COPY, dst1, data1, 16);\n\ + vstore4(dst0, 0, output_ptr + offset);\n\ + vstore4(dst1, 1, output_ptr + offset);\n\ + }\n\ + for(int i = gidx; i < res; i += get_global_size(0))\n\ + {\n\ + float src = input_ptr[length + i];\n\ + half data;\n\ + _viv_asm(CONV, data, src);\n\ + short dst;\n\ + _viv_asm(COPY, dst, data, 4);\n\ + output_ptr[length + i] = dst;\n\ + }\n\ +}\n\ +\n\ +#define SCATTER_ND_UPDATE_CONV(src0_type, ptr_type, element_size, ptr_type1, conv_func) \\\n\ +__kernel void scatter_nd_update_reduction_conv_##src0_type( \\\n\ + __read_only image2d_t temp_buf_float, \\\n\ + __read_only image2d_t link_buf, \\\n\ + image2d_t output, \\\n\ + int length, int res, float output_scale, float output_zp) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + Image img1 = create_image_from_image2d(temp_buf_float, 4); \\\n\ + Image img2 = create_image_from_image2d(output, element_size); \\\n\ + __global float* input_ptr = (__global float*)img1.ptr; \\\n\ + __global ptr_type1* output_ptr = (__global ptr_type1*)img2.ptr; \\\n\ + if(length > 0) \\\n\ + { \\\n\ + int offset = gidx * 8; \\\n\ + float4 src0 = vload4(0, input_ptr + offset); \\\n\ + float4 src1 = vload4(1, input_ptr + offset); \\\n\ + int4 data0 = convert_int4_rte(src0 * output_scale + output_zp); \\\n\ + int4 data1 = convert_int4_rte(src1 * output_scale + output_zp); \\\n\ + ptr_type dst0, dst1; \\\n\ + _viv_asm(CONV, dst0, data0); \\\n\ + _viv_asm(CONV, dst1, data1); \\\n\ + vstore4(dst0, 0, output_ptr + offset); \\\n\ + vstore4(dst1, 1, output_ptr + offset); \\\n\ + } \\\n\ + for(int i = gidx; i < res; i += get_global_size(0)) \\\n\ + { \\\n\ + float src = input_ptr[length + i]; \\\n\ + int data = convert_int_rte(src * output_scale + output_zp); \\\n\ + output_ptr[length + i] = conv_func(data); \\\n\ + } \\\n\ +}\n\ +SCATTER_ND_UPDATE_CONV(U8, uchar4, 1, uchar, convert_uchar)\n\ +SCATTER_ND_UPDATE_CONV(I8, char4, 1, char, convert_char)\n\ +SCATTER_ND_UPDATE_CONV(I16, short4, 2, short, convert_short)\n\ +SCATTER_ND_UPDATE_CONV(F32, float4, 4, float, convert_float)\n\ +"; /* end of scatter_nd_update_reduction_conv_cl*/ + static const char select_cl[] = "__kernel void select_I8_U8_U8toU8(\n\ __read_only image2d_array_t condition,\n\ __read_only image2d_array_t input0,\n\ @@ -73818,7 +79085,7 @@ __kernel void swish_I32toI32_2D(\n\ src = read_imagef(input, coord); \\\n\ tmp.x = sigmoid_(src.x * beta, logE); \\\n\ data.x = src.x * tmp.x; \\\n\ - uint4 dst = convert_uint4(data * outputScale + outputZP); \\\n\ + uint4 dst = convert_uint4_rte(data * outputScale + outputZP); \\\n\ write_imageui(output, coord, dst);\n\ \n\ __kernel void swish_F32toU8(\n\ @@ -75205,6 +80472,8 @@ static const source_map_t evis_resource[] = {"clip_U8_vx", clip_U8_vx}, {"conv1d_ovxlib_vx", conv1d_ovxlib_vx}, 
{"conv1d_ovxlib_k1024_vx", conv1d_ovxlib_k1024_vx}, + {"crop_and_resize_bilinear_vx", crop_and_resize_bilinear_vx}, + {"crop_and_resize_nearest_neighbor_vx", crop_and_resize_nearest_neighbor_vx}, {"cumsum_vx", cumsum_vx}, {"cumsum_2d_vx", cumsum_2d_vx}, {"cumsum_bf16_vx", cumsum_bf16_vx}, @@ -75273,11 +80542,20 @@ static const source_map_t evis_resource[] = {"layer_normalization_1_vx", layer_normalization_1_vx}, {"layer_normalization_2_vx", layer_normalization_2_vx}, {"layer_normalization_3_vx", layer_normalization_3_vx}, + {"layer_normalization_axis01_0_vx", layer_normalization_axis01_0_vx}, + {"layer_normalization_axis01_1_vx", layer_normalization_axis01_1_vx}, + {"layer_normalization_axis01_2_vx", layer_normalization_axis01_2_vx}, + {"layer_normalization_axis01_3_vx", layer_normalization_axis01_3_vx}, + {"layer_normalization_axis01_sum_vx", layer_normalization_axis01_sum_vx}, {"log_softmax_axis0_vx", log_softmax_axis0_vx}, {"log_softmax_axis0_BF16_vx", log_softmax_axis0_BF16_vx}, {"log_softmax_axis1_vx", log_softmax_axis1_vx}, {"log_softmax_axis1_BF16_vx", log_softmax_axis1_BF16_vx}, {"log_softmax_axis2_vx", log_softmax_axis2_vx}, + {"log_softmax_exceed_axis0_vx", log_softmax_exceed_axis0_vx}, + {"log_softmax_exceed_axis0_BF16_vx", log_softmax_exceed_axis0_BF16_vx}, + {"log_softmax_exceed_axis1_vx", log_softmax_exceed_axis1_vx}, + {"log_softmax_exceed_axis1_BF16_vx", log_softmax_exceed_axis1_BF16_vx}, {"logical_not_vx", logical_not_vx}, {"logical_ops_vx", logical_ops_vx}, {"lstmunit_activation_BP_BF16_vx", lstmunit_activation_BP_BF16_vx}, @@ -75361,6 +80639,8 @@ static const source_map_t evis_resource[] = {"pre_process_gray_2_vx", pre_process_gray_2_vx}, {"pre_process_gray_copy_vx", pre_process_gray_copy_vx}, {"pre_process_nv12_copy_vx", pre_process_nv12_copy_vx}, + {"pre_process_nv12_rggb_copy_vx", pre_process_nv12_rggb_copy_vx}, + {"pre_process_nv12_rggb_scale_vx", pre_process_nv12_rggb_scale_vx}, {"pre_process_nv12_scale_vx", pre_process_nv12_scale_vx}, {"pre_process_rgb_vx", pre_process_rgb_vx}, {"pre_process_rgb888_planar_0_vx", pre_process_rgb888_planar_0_vx}, @@ -75420,6 +80700,7 @@ static const source_map_t evis_resource[] = {"resize_bilinear_F16_vx", resize_bilinear_F16_vx}, {"resize_bilinear_I16_vx", resize_bilinear_I16_vx}, {"resize_bilinear_I8_vx", resize_bilinear_I8_vx}, + {"resize_bilinear_U16_vx", resize_bilinear_U16_vx}, {"resize_bilinear_U8_vx", resize_bilinear_U8_vx}, {"resize_bilinear_U8_half_pixel_centers_1_vx", resize_bilinear_U8_half_pixel_centers_1_vx}, {"resize_bilinear_U8_half_pixel_centers_2_vx", resize_bilinear_U8_half_pixel_centers_2_vx}, @@ -75427,6 +80708,7 @@ static const source_map_t evis_resource[] = {"resize_bilinear_align_corners_vx", resize_bilinear_align_corners_vx}, {"resize_bilinear_nhwc_vx", resize_bilinear_nhwc_vx}, {"resize_bilinear_nhwc_bound_vx", resize_bilinear_nhwc_bound_vx}, + {"resize_cubic_vx", resize_cubic_vx}, {"resize_nearest_vx", resize_nearest_vx}, {"scatter_nd_vx", scatter_nd_vx}, {"scatter_nd_big_vx", scatter_nd_big_vx}, @@ -75435,6 +80717,8 @@ static const source_map_t evis_resource[] = {"scatter_nd_update_big_vx", scatter_nd_update_big_vx}, {"scatter_nd_update_fp_vx", scatter_nd_update_fp_vx}, {"scatter_nd_update_qint_vx", scatter_nd_update_qint_vx}, + {"scatter_nd_update_reduction_vx", scatter_nd_update_reduction_vx}, + {"scatter_nd_update_reduction_conv_vx", scatter_nd_update_reduction_conv_vx}, {"scatter_nd_update_special_vx", scatter_nd_update_special_vx}, {"select_vx", select_vx}, {"sequence_mask_vx", sequence_mask_vx}, @@ 
-75475,6 +80759,8 @@ static const source_map_t cl_resource[] = {"clip_F32_cl", clip_F32_cl}, {"clip_I32_cl", clip_I32_cl}, {"clip_U8_cl", clip_U8_cl}, + {"crop_and_resize_bilinear_cl", crop_and_resize_bilinear_cl}, + {"crop_and_resize_nearest_neighbor_cl", crop_and_resize_nearest_neighbor_cl}, {"cumsum_cl", cumsum_cl}, {"cumsum_2d_cl", cumsum_2d_cl}, {"depth2space_crd_cl", depth2space_crd_cl}, @@ -75511,6 +80797,8 @@ static const source_map_t cl_resource[] = {"log_softmax_axis0_cl", log_softmax_axis0_cl}, {"log_softmax_axis1_cl", log_softmax_axis1_cl}, {"log_softmax_axis2_cl", log_softmax_axis2_cl}, + {"log_softmax_exceed_axis0_cl", log_softmax_exceed_axis0_cl}, + {"log_softmax_exceed_axis1_cl", log_softmax_exceed_axis1_cl}, {"logical_not_cl", logical_not_cl}, {"logical_ops_cl", logical_ops_cl}, {"lppool_cl", lppool_cl}, @@ -75581,6 +80869,7 @@ static const source_map_t cl_resource[] = {"resize_3d_bilinear_cl", resize_3d_bilinear_cl}, {"resize_3d_nearest_cl", resize_3d_nearest_cl}, {"resize_bilinear_cl", resize_bilinear_cl}, + {"resize_cubic_cl", resize_cubic_cl}, {"resize_nearest_cl", resize_nearest_cl}, {"reversesequence_cl", reversesequence_cl}, {"roi_align_cl", roi_align_cl}, @@ -75589,6 +80878,8 @@ static const source_map_t cl_resource[] = {"scatter_elements_mul_cl", scatter_elements_mul_cl}, {"scatter_nd_cl", scatter_nd_cl}, {"scatter_nd_update_cl", scatter_nd_update_cl}, + {"scatter_nd_update_reduction_cl", scatter_nd_update_reduction_cl}, + {"scatter_nd_update_reduction_conv_cl", scatter_nd_update_reduction_conv_cl}, {"select_cl", select_cl}, {"sequence_mask_cl", sequence_mask_cl}, {"signal_frame_cl", signal_frame_cl}, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_axis_aligned_bbox_transform.c b/src/tim/vx/internal/src/ops/vsi_nn_op_axis_aligned_bbox_transform.c index 6252e4d..b2188f1 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_axis_aligned_bbox_transform.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_axis_aligned_bbox_transform.c @@ -38,60 +38,6 @@ #define _INPUT_NUM (4) #define _OUTPUT_NUM (1) -#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) - -static vsi_status op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_FAILURE; - - self->n = (vx_node)vsi_nn_kernel_selector( self->graph, - "axis_aligned_bbox_transform", - inputs, _INPUT_NUM, - outputs, _OUTPUT_NUM, NULL ); - - if ( self->n ) - { - status = VSI_SUCCESS; - } - - return status; -} /* op_compute() */ - -static vsi_bool op_check - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - /*TODO: Check tensor shapes. 
*/ - VSI_UNREFERENCED(self); - VSI_UNREFERENCED(inputs); - VSI_UNREFERENCED(outputs); - return TRUE; -} /* op_check() */ - -static vsi_bool op_setup - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - VSI_UNREFERENCED(self); - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) - { - outputs[0]->attr.size[0] = inputs[1]->attr.size[0]; - outputs[0]->attr.size[1] = inputs[1]->attr.size[1]; - outputs[0]->attr.dim_num = 2; - } - return TRUE; -} /* op_setup() */ #ifdef __cplusplus extern "C" { @@ -101,10 +47,10 @@ DEF_OP_REG ( /* op_name */ AXIS_ALIGNED_BBOX_TRANSFORM, /* init */ NULL, - /* compute */ op_compute, - /* deinit */ vsi_nn_op_common_deinit, - /* check */ op_check, - /* setup */ op_setup, + /* compute */ NULL, + /* deinit */ NULL, + /* check */ NULL, + /* setup */ NULL, /* optimize */ NULL, /* input_num */ _INPUT_NUM, /* output_num */ _OUTPUT_NUM diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_lstm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_lstm.c index 7afa231..47ccd79 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_lstm.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_lstm.c @@ -370,6 +370,20 @@ static vsi_bool op_setup curr->inputs[LSTMUNIT_INPUT_LAYERNORM_C] = inputs[BI_LSTM_FW_INPUT_LAYERNORM_C]; curr->inputs[LSTMUNIT_INPUT_LAYERNORM_O] = inputs[BI_LSTM_FW_INPUT_LAYERNORM_O]; + if (self->input.num > LSTM_INPUT_BIAS_R2I ) + { + curr->inputs[LSTMUNIT_INPUT_BIAS_R2I] = inputs[BI_LSTM_FW_INPUT_BIAS_R2I]; + curr->inputs[LSTMUNIT_INPUT_BIAS_R2F] = inputs[BI_LSTM_FW_INPUT_BIAS_R2F]; + curr->inputs[LSTMUNIT_INPUT_BIAS_R2C] = inputs[BI_LSTM_FW_INPUT_BIAS_R2C]; + curr->inputs[LSTMUNIT_INPUT_BIAS_R2O] = inputs[BI_LSTM_FW_INPUT_BIAS_R2O]; + } + else + { + curr->inputs[LSTMUNIT_INPUT_BIAS_R2I] = NULL; + curr->inputs[LSTMUNIT_INPUT_BIAS_R2F] = NULL; + curr->inputs[LSTMUNIT_INPUT_BIAS_R2C] = NULL; + curr->inputs[LSTMUNIT_INPUT_BIAS_R2O] = NULL; + } if (has_aux_input) { curr->inputs[LSTM_INPUT_AUX_INPUT] = aux_reshape_output_tensors[i]; @@ -475,6 +489,21 @@ static vsi_bool op_setup curr->inputs[LSTMUNIT_INPUT_LAYERNORM_C] = inputs[BI_LSTM_BW_INPUT_LAYERNORM_C]; curr->inputs[LSTMUNIT_INPUT_LAYERNORM_O] = inputs[BI_LSTM_BW_INPUT_LAYERNORM_O]; + if (self->input.num > LSTM_INPUT_BIAS_R2I) + { + curr->inputs[LSTMUNIT_INPUT_BIAS_R2I] = inputs[BI_LSTM_BW_INPUT_BIAS_R2I]; + curr->inputs[LSTMUNIT_INPUT_BIAS_R2F] = inputs[BI_LSTM_BW_INPUT_BIAS_R2F]; + curr->inputs[LSTMUNIT_INPUT_BIAS_R2C] = inputs[BI_LSTM_BW_INPUT_BIAS_R2C]; + curr->inputs[LSTMUNIT_INPUT_BIAS_R2O] = inputs[BI_LSTM_BW_INPUT_BIAS_R2O]; + } + else + { + curr->inputs[LSTMUNIT_INPUT_BIAS_R2I] = NULL; + curr->inputs[LSTMUNIT_INPUT_BIAS_R2F] = NULL; + curr->inputs[LSTMUNIT_INPUT_BIAS_R2C] = NULL; + curr->inputs[LSTMUNIT_INPUT_BIAS_R2O] = NULL; + } + if (has_aux_input) { curr->inputs[LSTM_INPUT_AUX_INPUT] = aux_reshape_output_tensors[time_step - 1 - i]; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c b/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c index 34afc98..ecb1640 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c @@ -267,8 +267,11 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { +#define _TENSOR_LEN 64 vsi_status status; vsi_nn_concat_lcl_data * iter; + char tensor_name[_TENSOR_LEN]; + uint32_t sub_id = 0; status = VSI_SUCCESS; self->n = NULL; @@ -282,6 +285,15 @@ static vsi_status op_compute { iter->cp_node = 
vxTensorCopyNode(self->graph->g, iter->src_tensor, iter->dst_tensor ); + /* Set copy output tensor name */ + memset(tensor_name, 0, sizeof(tensor_name)); + snprintf(tensor_name, sizeof(tensor_name), "uid_%u_sub_id_%u_out_0", self->uid, sub_id); + if(vxSetReferenceName((vx_reference)iter->dst_tensor, tensor_name) == VSI_FAILURE) + { + VSILOGW("Set uid %u copy node output name fail", self->uid); + return VSI_FAILURE; + } + sub_id++; if( NULL == iter->cp_node ) { VSILOGE( "Create vxTensorCopyNode fail." ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c index 3a31d44..235ab87 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c @@ -39,6 +39,19 @@ #include "vsi_nn_internal_node.h" #include "vsi_nn_rnn_helper.h" +static vsi_status reshape_activation_output + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * in_tensor, + vsi_nn_tensor_t * out_tensor + ) +{ + vsi_nn_rnn_create_reshape(self, in_tensor, out_tensor, + out_tensor->attr.size, out_tensor->attr.dim_num, TRUE); + + return VSI_SUCCESS; +} /* reshape_activation_output() */ + static vsi_nn_internal_tensor_t * reshape_tensor_to_act ( vsi_nn_node_t* self, @@ -350,16 +363,19 @@ static vsi_bool op_setup ) { uint32_t i; + vsi_nn_tensor_attr_t attr; vsi_nn_internal_tensor_t * input_conv_outputs[CONV2D_LSTM_CELL_GATE_NUM] = { NULL }; vsi_nn_internal_tensor_t * recurrent_conv_outputs[CONV2D_LSTM_CELL_GATE_NUM] = { NULL }; vsi_nn_internal_node_t* curr = NULL; vsi_nn_internal_tensor_t * reshape_cell_in = NULL; - vsi_nn_internal_tensor_t * reshape_out = NULL; - vsi_nn_internal_tensor_t * reshape_h_out = NULL; - vsi_nn_internal_tensor_t * reshape_c_out = NULL; + vsi_nn_internal_tensor_t * act_out = NULL; + vsi_nn_internal_tensor_t * act_h_out = NULL; + vsi_nn_internal_tensor_t * act_c_out = NULL; vsi_nn_conv2d_lstm_cell_param * p = &self->nn_param.conv2d_lstm_cell; vsi_bool ret = FALSE; + vsi_status status = VSI_FAILURE; + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_node_wksp( self ); /* compute output tensor's shapes */ @@ -410,18 +426,27 @@ static vsi_bool op_setup curr->inputs[LSTMUNIT_ACT_INPUT_FC_I + i] = input_conv_outputs[i]->t; curr->inputs[LSTMUNIT_ACT_HSTATE_FC_I + i] = recurrent_conv_outputs[i]->t; } - reshape_out = reshape_tensor_to_act(self, outputs[CONV2D_LSTM_CELL_OUT_OUTPUT]); - CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(reshape_out, curr, "Create internal tensor failed", final); - reshape_h_out = reshape_tensor_to_act(self, outputs[CONV2D_LSTM_CELL_OUT_H_STATE]); - CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(reshape_h_out, curr, "Create internal tensor failed", final); - reshape_c_out = reshape_tensor_to_act(self, outputs[CONV2D_LSTM_CELL_OUT_C_STATE]); - CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(reshape_c_out, curr, "Create internal tensor failed", final); - curr->outputs[LSTMUNIT_ACT_OUTPUT] = reshape_out->t; - curr->outputs[LSTMUNIT_ACT_CSTATE_OUT] = reshape_c_out->t; - curr->outputs[LSTMUNIT_ACT_HSTATE_OUT] = reshape_h_out->t; + // create activation output/hstate_output/cstate_output + vsi_nn_internal_init_tensor_attr(&attr, &outputs[CONV2D_LSTM_CELL_OUT_OUTPUT]->attr.dtype, TRUE); + act_out = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + vsi_nn_internal_init_tensor_attr(&attr, &outputs[CONV2D_LSTM_CELL_OUT_H_STATE]->attr.dtype, TRUE); + act_h_out = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + vsi_nn_internal_init_tensor_attr(&attr, 
&outputs[CONV2D_LSTM_CELL_OUT_C_STATE]->attr.dtype, TRUE); + act_c_out = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + curr->outputs[LSTMUNIT_ACT_OUTPUT] = act_out->t; + curr->outputs[LSTMUNIT_ACT_HSTATE_OUT] = act_h_out->t; + curr->outputs[LSTMUNIT_ACT_CSTATE_OUT] = act_c_out->t; vsi_nn_internal_setup_node(self, curr); + // reshape activation output(2d) to conv2d_lstm_cell output(4d) + status = reshape_activation_output(self, act_out->t, outputs[CONV2D_LSTM_CELL_OUT_OUTPUT]); + CHECK_STATUS_FAIL_GOTO(status, final); + status = reshape_activation_output(self, act_h_out->t, outputs[CONV2D_LSTM_CELL_OUT_H_STATE]); + CHECK_STATUS_FAIL_GOTO(status, final); + status = reshape_activation_output(self, act_c_out->t, outputs[CONV2D_LSTM_CELL_OUT_C_STATE]); + CHECK_STATUS_FAIL_GOTO(status, final); + ret = TRUE; final: return ret; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv_relu.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv_relu.c index 85d35df..f51d471 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv_relu.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv_relu.c @@ -37,294 +37,6 @@ #include "utils/vsi_nn_dtype_util.h" #include "utils/vsi_nn_constraint_check.h" -static vsi_status op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status; - vx_nn_convolution_relu_pooling_params_ext2_t p; - status = VSI_FAILURE; - - if(vsi_nn_InitConvReluPoolParameter(self, &p, FALSE) != VSI_SUCCESS) - { - VSILOGE("SetConvReluParameter fail\n"); - return VSI_FAILURE; - } - - self->n = vxConvolutionReluPoolingLayer2( - self->graph->g, - inputs[0]->t, - inputs[1]->wb, - (vx_nn_convolution_relu_pooling_params_t *)&p, - sizeof(p), - outputs[0]->t - ); - - vsi_nn_DeinitConvReluPoolParameter( &p ); - - if( NULL != self->n ) - { - status = VSI_SUCCESS; - } - return status; -} /* op_compute() */ - -static vsi_bool op_check - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_bool ret = FALSE; - BEGIN_IO_TYPE_DECL(CONV_RELU, 3, 1) - IO_TYPE(D_F16, D_F16, D_NONE, D_F16) - IO_TYPE(D_F16, D_F16, D_F32, D_F16) - IO_TYPE(D_F16, D_F16, D_F16, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) - IO_TYPE(D_BF16, D_BF16, D_F32, D_BF16) - IO_TYPE(D_BF16, D_BF16, D_F32, D_F32) - IO_TYPE(D_BF16, D_BF16, D_NONE, D_BF16) - IO_TYPE(D_F32, D_F32, D_F32, D_F32) - IO_TYPE(D_F32, D_F32, D_F32, D_BF16) - IO_TYPE(D_F32, D_F32, D_NONE, D_F32) - 
END_IO_TYPE_DECL(CONV_RELU) - if (!VALIDATE_OP_IO_TYPES(CONV_RELU, self, inputs, self->input.num, outputs, self->output.num)) - { - char* desc = generate_op_io_types_desc(inputs, - self->input.num, outputs, self->output.num); - VSILOGE("Inputs/Outputs data type not support: %s", desc); - destroy_op_io_types_desc(desc); - return FALSE; - } - - /* Check fl and scale*/ - ret = vsi_nn_QuantCheck(inputs[0], inputs[1], inputs[2]); - - return ret; -} /* op_check() */ - -static vsi_bool op_setup - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_bool ret = FALSE; - -#ifdef VX_CONVERT_POLICY_WRAP_ENABLE - if ( vsi_nn_compareVersion(self->graph, 1, 1, 21) == -1 ) - { - self->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE; - } -#endif - - ret = vsi_nn_OpSetup( VSI_NN_OP_CONV2D, self, inputs, outputs ); - - return ret; -} /* op_setup() */ - -static vsi_status op_optimize - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs, - vsi_nn_opt_direction_e direction - ) -{ - vsi_status status; - vx_nn_convolution_relu_pooling_params_ext2_t p; - vx_weights_biases_parameter_optimizations_t opt; - vx_weights_biases_parameter_optimizations_t * p_opt; - - status = VSI_SUCCESS; - - if(direction == VSI_NN_OPTIMIZE_BACKWARD) - { - return VSI_SUCCESS; - } - - VSILOGD("Optimize %s", vsi_nn_OpGetName(self->op)); - /* Prepare weight_bias */ - if(inputs[1]->wb == NULL) - { - if(vsi_nn_InitConvReluPoolParameter(self, &p, FALSE) != VSI_SUCCESS) - { - VSILOGE("SetConvReluParameter fail\n"); - return VSI_FAILURE; - } - - p_opt = NULL; - if( outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC - || inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) - { - memset( &opt, 0, sizeof( opt ) ); - opt.inputZeroPoint = inputs[0]->attr.dtype.zero_point; - opt.zrl = -1; - opt.outputFormat = outputs[0]->attr.dtype.vx_type; - p_opt = &opt; - } - -#ifdef VSI_40BIT_VA_SUPPORT - { - vx_size size_input0[VSI_NN_MAX_DIM_NUM]; - vx_size size_output0[VSI_NN_MAX_DIM_NUM]; - size_t i = 0; - for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) - { - size_input0[i] = (vx_size)inputs[0]->attr.size[i]; - size_output0[i] = (vx_size)outputs[0]->attr.size[i]; - } - inputs[1]->wb = vxCreateWeightsBiasesParameterFromTensors2( - VX_CONVOLUTIONAL_NETWORK_CONVOLUTION_LAYER, - 4, - size_input0, - size_output0, - size_output0, - outputs[0]->attr.dtype.vx_type, - (vx_nn_convolution_relu_pooling_params_t *)&p, - sizeof(p), - p_opt, - inputs[1]->t, inputs[2]->t - ); - } -#else - { - uint32_t size_u32_input0[VSI_NN_MAX_DIM_NUM]; - uint32_t size_u32_output0[VSI_NN_MAX_DIM_NUM]; - size_t i = 0; - for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) - { - size_u32_input0[i] = (uint32_t)inputs[0]->attr.size[i]; - size_u32_output0[i] = (uint32_t)outputs[0]->attr.size[i]; - } - inputs[1]->wb = vxCreateWeightsBiasesParameterFromTensors2( - VX_CONVOLUTIONAL_NETWORK_CONVOLUTION_LAYER, - 4, - size_u32_input0, - size_u32_output0, - size_u32_output0, - outputs[0]->attr.dtype.vx_type, - (vx_nn_convolution_relu_pooling_params_t *)&p, - sizeof(p), - p_opt, - inputs[1]->t, inputs[2]->t - ); - } -#endif - vsi_nn_DeinitConvReluPoolParameter( &p ); - } - - if( NULL == inputs[1]->wb ) - { - VSILOGE( "Create weight bias fail." 
); - status = VSI_FAILURE; - } - - return status; -} /* op_optimize() */ - -vsi_status vsi_nn_InitConvReluPoolParameter - ( - vsi_nn_node_t * node, - vx_nn_convolution_relu_pooling_params_ext2_t * param_ext2, - vsi_bool has_pool - ) -{ - int32_t pad_const_val; - vx_scalar pad_const; - vx_nn_convolution_relu_pooling_params_t *param; - vx_nn_convolution_relu_pooling_params_ext_t *param_ext; - - pad_const_val = 0; - pad_const = NULL; - param = NULL; - - if( NULL == node || NULL == param_ext2 ) - { - VSILOGE("Set param fail\n"); - return VSI_FAILURE; - } - memset( param_ext2, 0, sizeof( vx_nn_convolution_relu_pooling_params_ext2_t ) ); - param_ext = ¶m_ext2->ext; - param = ¶m_ext->base; - - pad_const = vxCreateScalar( node->graph->ctx->c, VX_TYPE_INT32, &pad_const_val ); - if( NULL == pad_const ) - { - VSILOGE("Create scalar fail\n"); - return VSI_FAILURE; - } - - if( node->nn_param.conv2d.dilation[0] > 0 ) - { - param->dilation_x = node->nn_param.conv2d.dilation[0] - 1; - } - if( node->nn_param.conv2d.dilation[1] > 0 ) - { - param->dilation_y = node->nn_param.conv2d.dilation[1] - 1; - } - param->pad_x_left = node->nn_param.conv2d.pad[0]; - param->pad_x_right = node->nn_param.conv2d.pad[1]; - param->pad_y_top = node->nn_param.conv2d.pad[2]; - param->pad_y_bottom = node->nn_param.conv2d.pad[3]; - param->accumulator_bits = (vx_uint8)node->vx_param.accumulator_bits; - param->overflow_policy = node->vx_param.overflow_policy; - param->rounding_policy = node->vx_param.rounding_policy; - param->down_scale_size_rounding = node->vx_param.down_scale_size_rounding; - param->enable_relu = (vx_bool)node->vx_param.has_relu; - param->pad_mode = VX_PAD_CONSTANT; - param->pad_const = pad_const; - if( TRUE == has_pool ) - { - param->pool_type = node->nn_param.pool.type; - param->pool_size_x = node->nn_param.pool.ksize[0]; - param->pool_size_y = node->nn_param.pool.ksize[1]; - } - param_ext->stride_x = node->nn_param.conv2d.stride[0]; - param_ext->stride_y = node->nn_param.conv2d.stride[1]; - - param_ext2->depth_multiplier = node->nn_param.conv2d.multiplier; - - return VSI_SUCCESS; -} /* vsi_nn_InitConvReluPoolParameter() */ - -void vsi_nn_DeinitConvReluPoolParameter - ( - vx_nn_convolution_relu_pooling_params_ext2_t * param - ) -{ - if( NULL != param ) - { - if( NULL != param->ext.base.pad_const ) - { - vxReleaseScalar( ¶m->ext.base.pad_const ); - } - } -} /* vsi_nn_DeinitConvReluPoolParameter() */ - #ifdef __cplusplus extern "C" { #endif @@ -333,11 +45,11 @@ DEF_OP_REG ( /* op_name */ CONV_RELU, /* init */ NULL, - /* compute */ op_compute, - /* deinit */ vsi_nn_op_common_deinit, - /* check */ op_check, - /* setup */ op_setup, - /* optimize */ op_optimize, + /* compute */ NULL, + /* deinit */ NULL, + /* check */ NULL, + /* setup */ NULL, + /* optimize */ NULL, /* input_num */ 3, /* output_num */ 1 ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv_relu_pool.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv_relu_pool.c index aef5a68..2ca96c1 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv_relu_pool.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv_relu_pool.c @@ -37,250 +37,6 @@ #include "utils/vsi_nn_dtype_util.h" #include "utils/vsi_nn_constraint_check.h" -static vsi_status op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status; - vx_nn_convolution_relu_pooling_params_ext2_t p; - status = VSI_FAILURE; - - if(vsi_nn_InitConvReluPoolParameter(self, &p, TRUE) != VSI_SUCCESS) - { - VSILOGE("SetConvReluPoolParameter fail\n"); - 
return VSI_FAILURE; - } - - self->n = vxConvolutionReluPoolingLayer2( - self->graph->g, - inputs[0]->t, - inputs[1]->wb, - (vx_nn_convolution_relu_pooling_params_t *)&p, - sizeof(p), - outputs[0]->t - ); - - vsi_nn_DeinitConvReluPoolParameter( &p ); - - if( NULL != self->n ) - { - status = VSI_SUCCESS; - } - return status; -} /* op_compute() */ - -static vsi_bool op_check - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_bool ret = FALSE; - BEGIN_IO_TYPE_DECL(CONV_RELU_POOL, 3, 1) - IO_TYPE(D_F16, D_F16, D_NONE, D_F16) - IO_TYPE(D_F16, D_F16, D_F32, D_F16) - IO_TYPE(D_F16, D_F16, D_F16, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) - IO_TYPE(D_BF16, D_BF16, D_F32, D_BF16) - IO_TYPE(D_BF16, D_BF16, D_F32, D_F32) - IO_TYPE(D_BF16, D_BF16, D_NONE, D_BF16) - IO_TYPE(D_F32, D_F32, D_F32, D_F32) - IO_TYPE(D_F32, D_F32, D_F32, D_BF16) - IO_TYPE(D_F32, D_F32, D_NONE, D_F32) - END_IO_TYPE_DECL(CONV_RELU_POOL) - if (!VALIDATE_OP_IO_TYPES(CONV_RELU_POOL, self, inputs, self->input.num, outputs, self->output.num)) - { - char* desc = generate_op_io_types_desc(inputs, - self->input.num, outputs, self->output.num); - VSILOGE("Inputs/Outputs data type not support: %s", desc); - destroy_op_io_types_desc(desc); - return FALSE; - } - - /* Check fl and scale*/ - ret = vsi_nn_QuantCheck(inputs[0], inputs[1], inputs[2]); - - return ret; -} /* op_check() */ - -static vsi_bool op_setup - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_bool ret; - -#ifdef VX_CONVERT_POLICY_WRAP_ENABLE - if ( vsi_nn_compareVersion(self->graph, 1, 1, 21) == -1 ) - { - self->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE; - } -#endif - - ret = TRUE; - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) - { - ret = vsi_nn_OpSetup( VSI_NN_OP_CONV2D, self, inputs, outputs ); - if(ret == FALSE) - { - VSILOGE("OpSetup [VSI_NN_OP_CONV2D] fail\n"); - return FALSE; - } - - ret = vsi_nn_OpSetup( VSI_NN_OP_POOL, self, outputs, outputs ); - if(ret == FALSE) - { - VSILOGE("OpSetup [VSI_NN_OP_POOL] fail\n"); - return FALSE; - } - } - - return ret; -} /* op_setup() */ - -static vsi_status op_optimize - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs, - vsi_nn_opt_direction_e direction - ) -{ - vsi_status status; - vsi_bool ret; - vsi_nn_tensor_prv_t conv_out, *pconv_out; - vx_nn_convolution_relu_pooling_params_ext2_t p; - 
vx_weights_biases_parameter_optimizations_t opt; - vx_weights_biases_parameter_optimizations_t * p_opt; - ret = FALSE; - status = VSI_FAILURE; - - if(direction == VSI_NN_OPTIMIZE_BACKWARD) - { - return VSI_SUCCESS; - } - - VSILOGD("Optimize %s", vsi_nn_OpGetName(self->op)); - memset(&conv_out, 0, sizeof(vsi_nn_tensor_prv_t)); - pconv_out = &conv_out; - - ret = vsi_nn_OpSetup( VSI_NN_OP_CONV2D, self, inputs, (vsi_nn_tensor_t**)(&pconv_out) ); - if(ret == FALSE) - { - VSILOGE("OpSetup [VSI_NN_OP_CONV2D] fail\n"); - goto final; - } - - /* Prepare weight_bias */ - if(inputs[1]->wb == NULL) - { - if(vsi_nn_InitConvReluPoolParameter(self, &p, TRUE) != VSI_SUCCESS) - { - VSILOGE("SetConvReluPoolParameter fail\n"); - goto final; - } - - p_opt = NULL; - if( outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC - || inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) - { - memset( &opt, 0, sizeof( opt ) ); - opt.inputZeroPoint = inputs[0]->attr.dtype.zero_point; - opt.zrl = -1; - opt.outputFormat = outputs[0]->attr.dtype.vx_type; - p_opt = &opt; - } - -#ifdef VSI_40BIT_VA_SUPPORT - { - vx_size size_input0[VSI_NN_MAX_DIM_NUM]; - vx_size size_pconv_out[VSI_NN_MAX_DIM_NUM]; - vx_size size_output0[VSI_NN_MAX_DIM_NUM]; - size_t i = 0; - for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) - { - size_input0[i] = (vx_size)inputs[0]->attr.size[i]; - size_pconv_out[i] = (vx_size)pconv_out->pot.attr.size[i]; - size_output0[i] = (vx_size)outputs[0]->attr.size[i]; - } - inputs[1]->wb = vxCreateWeightsBiasesParameterFromTensors2( - VX_CONVOLUTIONAL_NETWORK_CONVOLUTION_LAYER, - 4, - size_input0, - size_pconv_out, - size_output0, - outputs[0]->attr.dtype.vx_type, - (vx_nn_convolution_relu_pooling_params_t *)&p, - sizeof(p), - p_opt, - inputs[1]->t, inputs[2]->t - ); - } -#else - { - uint32_t size_u32_input0[VSI_NN_MAX_DIM_NUM]; - uint32_t size_u32_pconv_out[VSI_NN_MAX_DIM_NUM]; - uint32_t size_u32_output0[VSI_NN_MAX_DIM_NUM]; - size_t i = 0; - for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) - { - size_u32_input0[i] = (uint32_t)inputs[0]->attr.size[i]; - size_u32_pconv_out[i] = (uint32_t)pconv_out->pot.attr.size[i]; - size_u32_output0[i] = (uint32_t)outputs[0]->attr.size[i]; - } - inputs[1]->wb = vxCreateWeightsBiasesParameterFromTensors2( - VX_CONVOLUTIONAL_NETWORK_CONVOLUTION_LAYER, - 4, - size_u32_input0, - size_u32_pconv_out, - size_u32_output0, - outputs[0]->attr.dtype.vx_type, - (vx_nn_convolution_relu_pooling_params_t *)&p, - sizeof(p), - p_opt, - inputs[1]->t, inputs[2]->t - ); - } -#endif - vsi_nn_DeinitConvReluPoolParameter( &p ); - } - - if( NULL == inputs[1]->wb ) - { - VSILOGE( "Create weight bias fail." 
); - } - else - { - status = VSI_SUCCESS; - } - -final: - return status; -} /* op_optimize() */ #ifdef __cplusplus extern "C" { @@ -290,11 +46,11 @@ DEF_OP_REG ( /* op_name */ CONV_RELU_POOL, /* init */ NULL, - /* compute */ op_compute, - /* deinit */ vsi_nn_op_common_deinit, - /* check */ op_check, - /* setup */ op_setup, - /* optimize */ op_optimize, + /* compute */ NULL, + /* deinit */ NULL, + /* check */ NULL, + /* setup */ NULL, + /* optimize */ NULL, /* input_num */ 3, /* output_num */ 1 ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_crop_and_resize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_crop_and_resize.c new file mode 100644 index 0000000..d41e457 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_crop_and_resize.c @@ -0,0 +1,193 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_constraint_check.h" + +typedef struct _crop_and_resize_local_data_t { + int32_t placeholder; +} crop_and_resize_local_data_t; + +/* + Declare number of input and output. 
+ */ +#define _INPUT_NUM (3) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; + + float extrapolation_value = 0; + int32_t resize_method = 0; + + if (NULL == self) + { + return status; + } + + extrapolation_value = self->nn_param.crop_and_resize.extrapolation_value; + resize_method = self->nn_param.crop_and_resize.resize_method; + + param = vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_int32( param, "resize_method", (int32_t)resize_method ); + vsi_nn_kernel_param_add_float32( param, "extrapolation_value", (float)extrapolation_value ); + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "crop_and_resize", + inputs, _INPUT_NUM, + outputs, _OUTPUT_NUM, param); + + if ( self->n ) + { + status = VSI_SUCCESS; + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(CROP_AND_RESIZE, 3, 1) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_I32, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_I32, D_F32) + IO_TYPE(D_F16, D_F16, D_I32, D_F16) + IO_TYPE(D_F16, D_F16, D_I32, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16, D_I32, D_I8|Q_SYM) + IO_TYPE(D_F16, D_F16, D_I32, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_F16, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F16, D_I32, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F16, D_I32, D_I16|Q_SYM) + IO_TYPE(D_F32, D_F16, D_I32, D_F32) + IO_TYPE(D_F32, D_F16, D_I32, D_I8|Q_DFP) + IO_TYPE(D_F32, D_F16, D_I32, D_I8|Q_SYM) + IO_TYPE(D_F32, D_F16, D_I32, D_I8|Q_ASYM) + IO_TYPE(D_F32, D_F16, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_F32, D_F16, D_I32, D_I16|Q_DFP) + IO_TYPE(D_F32, D_F16, D_I32, D_I16|Q_SYM) + IO_TYPE(D_I8|Q_DFP, D_F16, D_I32, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_SYM, D_F16, D_I32, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_F16, D_I32, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_F16, D_I32, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_F16, D_I32, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F16, D_I32, D_F16) + IO_TYPE(D_I8|Q_SYM, D_F16, D_I32, D_F32) + IO_TYPE(D_I8|Q_ASYM, D_F16, D_I32, D_F32) + IO_TYPE(D_I8|Q_DFP, D_F16, D_I32, D_F32) + IO_TYPE(D_I16|Q_DFP, D_F16, D_I32, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_SYM, D_F16, D_I32, D_I16|Q_SYM) + IO_TYPE(D_I16|Q_DFP, D_F16, D_I32, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F16, D_I32, D_F32) + IO_TYPE(D_I16|Q_SYM, D_F16, D_I32, D_F16) + IO_TYPE(D_I16|Q_SYM, D_F16, D_I32, D_F32) + + END_IO_TYPE_DECL(CROP_AND_RESIZE) + if (!VALIDATE_OP_IO_TYPES(CROP_AND_RESIZE, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_crop_and_resize_param * p = NULL; + + p = (vsi_nn_crop_and_resize_param* )&(self->nn_param.crop_and_resize); + + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + outputs[0]->attr.size[0] = p->crop_size[1]; + outputs[0]->attr.size[1] = p->crop_size[0]; + outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; + outputs[0]->attr.size[3] = inputs[2]->attr.size[0]; + } + return TRUE; +} + +static vsi_status op_init + ( + vsi_nn_node_t* 
self + ) +{ + self->nn_param.crop_and_resize.resize_method = VSI_NN_INTERPOLATION_BILINEAR; + self->nn_param.crop_and_resize.extrapolation_value = 0; + + return VSI_SUCCESS; +} /* op_init() */ + + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ CROP_AND_RESIZE, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_depth2space_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_depth2space_internal.c index 6b7cc6f..28556ab 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_depth2space_internal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_depth2space_internal.c @@ -40,6 +40,7 @@ #define _INPUT_NUM (1) #define _OUTPUT_NUM (1) +#if !(VX_DEPTH2SPACE_CRD_MODE_SUPPORT) static vsi_status op_compute ( vsi_nn_node_t * self, @@ -150,11 +151,25 @@ static vsi_status op_deinit return VSI_SUCCESS; } /* op_deinit() */ - +#endif #ifdef __cplusplus extern "C" { #endif +#if (VX_DEPTH2SPACE_CRD_MODE_SUPPORT) +DEF_OP_REG + ( + /* op_name */ DEPTH2SPACE_INTERNAL, + /* init */ NULL, + /* compute */ NULL, + /* deinit */ NULL, + /* check */ NULL, + /* setup */ NULL, + /* optimize */ NULL, + /* input_num */ 1, + /* output_num */ 1 + ); +#else DEF_OP_REG ( /* op_name */ DEPTH2SPACE_INTERNAL, @@ -167,6 +182,7 @@ DEF_OP_REG /* input_num */ 1, /* output_num */ 1 ); +#endif #ifdef __cplusplus } #endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_detection_postprocess.c b/src/tim/vx/internal/src/ops/vsi_nn_op_detection_postprocess.c index 726c672..0cd247f 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_detection_postprocess.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_detection_postprocess.c @@ -38,139 +38,6 @@ #define _INPUT_NUM (3) #define _OUTPUT_NUM (4) -#define _BOX_INPUT_NUM (2) -#define _BOX_OUTPUT_NUM (1) -#define _NMS_INPUT_NUM (2) -#define _NMS_OUTPUT_NUM (4) - -static vsi_status op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_param_t * param0 = NULL; - vsi_nn_kernel_param_t * param1 = NULL; - vsi_nn_tensor_t* box_tensors[3] = { NULL }; - vsi_nn_tensor_t* nms_tensors[6] = { NULL }; - vsi_nn_tensor_t* bbox_tensor = NULL; - vsi_nn_tensor_attr_t attr; - vsi_nn_detection_postprocess_param * p = &(self->nn_param.detection_postprocess); - float inv_scale_y, inv_scale_x, inv_scale_h, inv_scale_w; - - memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); - - attr.size[0] = 4; - attr.size[1] = inputs[0]->attr.size[1]; - attr.size[2] = inputs[0]->attr.size[2]; - attr.dim_num = 3; - attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; - attr.is_const = FALSE; - attr.vtl = TRUE; - bbox_tensor = vsi_nn_CreateTensor( self->graph, &attr ); - - inv_scale_y = 1.0f / p->dy; - inv_scale_x = 1.0f / p->dx; - inv_scale_h = 1.0f / p->dh; - inv_scale_w = 1.0f / p->dw; - - if (bbox_tensor) - { - param0 = vsi_nn_kernel_param_create(); - vsi_nn_kernel_param_add_float32( param0, "inv_scale_y", inv_scale_y); - vsi_nn_kernel_param_add_float32( param0, "inv_scale_x", inv_scale_x); - vsi_nn_kernel_param_add_float32( param0, "inv_scale_h", inv_scale_h); - vsi_nn_kernel_param_add_float32( param0, "inv_scale_w", inv_scale_w); - box_tensors[0] = inputs[1]; - box_tensors[1] = inputs[2]; - box_tensors[2] = bbox_tensor; - self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "detect_post_box", - 
&box_tensors[0], _BOX_INPUT_NUM, - &box_tensors[2], _BOX_OUTPUT_NUM, param0 ); - - param1 =vsi_nn_kernel_param_create(); - vsi_nn_kernel_param_add_int32( param1, "nms_type", p->nms_type); - vsi_nn_kernel_param_add_int32( param1, "max_num_detections", p->max_num_detections); - vsi_nn_kernel_param_add_int32( param1, "maximum_class_per_detection", p->maximum_class_per_detection); - vsi_nn_kernel_param_add_int32( param1, "maximum_detection_per_class", p->maximum_detection_per_class); - vsi_nn_kernel_param_add_float32( param1, "score_threshold", p->score_threshold); - vsi_nn_kernel_param_add_float32( param1, "iou_threshold", p->iou_threshold); - vsi_nn_kernel_param_add_int32( param1, "is_bg_in_label", p->is_bg_in_label); - nms_tensors[0] = inputs[0]; - nms_tensors[1] = bbox_tensor; - nms_tensors[2] = outputs[0]; - nms_tensors[3] = outputs[1]; - nms_tensors[4] = outputs[2]; - nms_tensors[5] = outputs[3]; - self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "detect_post_nms", - &nms_tensors[0], _NMS_INPUT_NUM, - &nms_tensors[2], _NMS_OUTPUT_NUM, param1 ); - vsi_nn_ReleaseTensor( &bbox_tensor ); - vsi_nn_kernel_param_release( &param0 ); - vsi_nn_kernel_param_release( &param1 ); - } - if( self->n ) - { - status = VSI_SUCCESS; - } - - return status; -} /* op_compute() */ - -static vsi_bool op_check - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - BEGIN_IO_TYPE_DECL(DETECTION_POSTPROCESS, 3, 1) - IO_TYPE(D_F32, D_F32, D_F32, D_F32) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) - END_IO_TYPE_DECL(DETECTION_POSTPROCESS) - if (!VALIDATE_OP_IO_TYPES(DETECTION_POSTPROCESS, self, inputs, self->input.num, outputs, self->output.num)) - { - char* desc = generate_op_io_types_desc(inputs, - self->input.num, outputs, self->output.num); - VSILOGE("Inputs/Outputs data type not support: %s", desc); - destroy_op_io_types_desc(desc); - return FALSE; - } - return TRUE; -} /* op_check() */ - -static vsi_bool op_setup - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) - { - vsi_nn_detection_postprocess_param * p; - p = &(self->nn_param.detection_postprocess); - - outputs[0]->attr.dim_num = 2; - outputs[0]->attr.size[0] = p->max_num_detections; - outputs[0]->attr.size[1] = inputs[0]->attr.size[2]; - - outputs[1]->attr.dim_num = 3; - outputs[1]->attr.size[0] = 4; - outputs[1]->attr.size[1] = p->max_num_detections; - outputs[1]->attr.size[2] = inputs[0]->attr.size[2]; - - outputs[2]->attr.dim_num = 2; - outputs[2]->attr.size[0] = p->max_num_detections; - outputs[2]->attr.size[1] = inputs[0]->attr.size[2]; - - outputs[3]->attr.dim_num = 1; - outputs[3]->attr.size[0] = inputs[0]->attr.size[2]; - } - return TRUE; -} /* op_setup() */ #ifdef __cplusplus extern "C" { @@ -180,10 +47,10 @@ DEF_OP_REG ( /* op_name */ DETECTION_POSTPROCESS, /* init */ NULL, - /* compute */ op_compute, - /* deinit */ vsi_nn_op_common_deinit, - /* check */ op_check, - /* setup */ op_setup, + /* compute */ NULL, + /* deinit */ NULL, + /* check */ NULL, + /* setup */ NULL, /* optimize */ NULL, /* input_num */ _INPUT_NUM, /* output_num */ _OUTPUT_NUM diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c index a7bc5d1..44e051e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c @@ -79,8 +79,11 @@ static vsi_status _eltwise_op_compute if ( strcmp(kernel_name, "sub") == 0 || 
strcmp(kernel_name, "add") == 0 || strcmp(kernel_name, "mul") == 0 - || (strcmp(kernel_name, "maximum") == 0 && !is_executed_on_sh) - || (strcmp(kernel_name, "minimum") == 0 && !is_executed_on_sh) +#if VX_TENSOR_POW_API_SUPPORT + || strcmp(kernel_name, "pow") == 0 +#endif + || (strcmp(kernel_name, "maximum") == 0) + || (strcmp(kernel_name, "minimum") == 0) || (strcmp(kernel_name, "div") == 0 && !is_executed_on_sh)) { doShapeOptimized = FALSE; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c index 280e5ee..708c748 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c @@ -275,6 +275,7 @@ DEF_ELEMENT_WISE_UNARY_OP( ATAN, atan ); DEF_ELEMENT_WISE_UNARY_OP( ATANH, atanh ); DEF_ELEMENT_WISE_UNARY_OP( ACOSH, acosh ); DEF_ELEMENT_WISE_UNARY_OP( INVERSE_SIGMOID, inverse_sigmoid ); +DEF_ELEMENT_WISE_UNARY_OP( TAN, tan ); #undef DEF_ELEMENT_UNARY_WISE_OP diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect_relu.c b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect_relu.c index b91fec8..484f6a6 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect_relu.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect_relu.c @@ -37,311 +37,6 @@ #include "utils/vsi_nn_dtype_util.h" #include "utils/vsi_nn_constraint_check.h" -static vsi_status _set_fc_relu_parameter - ( - vsi_nn_node_t * self, - vx_nn_convolution_relu_pooling_params_t * param - ); - -static vsi_status _set_fc_relu_parameter - ( - vsi_nn_node_t * self, - vx_nn_convolution_relu_pooling_params_t * param - ) -{ - vx_scalar pad_const; - int32_t pad_const_val; - - pad_const_val = 0; - memset( param, 0, sizeof(vx_nn_convolution_relu_pooling_params_t) ); - pad_const = vxCreateScalar(self->graph->ctx->c, VX_TYPE_INT32, &pad_const_val); - if( !pad_const ) - { - VSILOGE("Create scalar fail\n"); - return VSI_FAILURE; - } - - param->pad_x_left = 0; - param->pad_x_right = 0; - param->pad_y_top = 0; - param->pad_y_bottom = 0; - param->dilation_x = 0; - param->dilation_y = 0; - param->accumulator_bits = (vx_uint8)self->vx_param.accumulator_bits; - param->overflow_policy = self->vx_param.overflow_policy; - param->rounding_policy = self->vx_param.rounding_policy; - param->down_scale_size_rounding = self->vx_param.down_scale_size_rounding; - param->enable_relu = self->vx_param.has_relu; - param->pool_type = 0; - param->pool_size_x = 0; - param->pool_size_y = 0; - param->pad_mode = VX_PAD_CONSTANT; - param->pad_const = pad_const; - - return VSI_SUCCESS; -} /* _set_fc_relu_parameter() */ - -static vsi_status op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status; - status = VSI_FAILURE; - - self->n = vxFullyConnectedReluLayer( - self->graph->g, - inputs[0]->t, - inputs[1]->wb, - 0, - 0, - self->vx_param.overflow_policy, - self->vx_param.rounding_policy, - self->vx_param.down_scale_size_rounding, - self->vx_param.has_relu, - outputs[0]->t - ); - - if( NULL != self->n ) - { - status = VSI_SUCCESS; - } - return status; -} /* op_compute() */ - -static vsi_bool op_check - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_bool ret = FALSE; - - /* Check fl and scale*/ - ret = vsi_nn_QuantCheck(inputs[0], inputs[1], inputs[2]); - - if(ret) { - /* check inputs outputs data type */ - /* NN Support */ - BEGIN_IO_TYPE_DECL(FCL_RELU, 3, 1) - /* IO_TYPE(INPUT, WEIGHT, BIAS, OUTPUT) */ - /* NN Support - 
I8 */ - IO_TYPE(D_I8|Q_SYM_PC, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_SYM_PC) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_ASYM) - - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F16) - - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_F16) - - /* NN Support - U8 */ - IO_TYPE(D_U8|Q_SYM_PC, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_SYM_PC) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) - - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F16) - - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F16) - - /* NN Support - I16 */ - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) - - /* NN Support - F16 */ - IO_TYPE(D_F16, D_F16, D_F16, D_F16) - IO_TYPE(D_F16, D_F16, D_F16, D_I8|Q_DFP) - IO_TYPE(D_F16, D_F16, D_F16, D_U8|Q_ASYM) - - /* NN Support - BF16 */ - IO_TYPE(D_BF16, D_BF16, D_F32, D_BF16) - IO_TYPE(D_BF16, D_BF16, D_F32, D_F32) - - /* NN Support - F32 */ - IO_TYPE(D_F32, D_BF16, D_F32, D_F32) - IO_TYPE(D_F32, D_BF16, D_F32, D_BF16) - /* HW 9.0.1 */ - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_F32) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) - - END_IO_TYPE_DECL(FCL_RELU) - ret = VALIDATE_OP_IO_TYPES(FCL_RELU, self, inputs, self->input.num, outputs, self->output.num); - - /* TP Support */ - if (!ret ) { - uint32_t valid_dtypes[] = { - D_F16, D_BF16, D_F32, D_I16|Q_DFP, D_I16|Q_SYM, D_I16|Q_ASYM, D_I8|Q_DFP, D_I8|Q_SYM, - D_I8|Q_ASYM, D_U8|Q_DFP, D_U8|Q_ASYM - }; - - uint32_t weight_type = inputs[1]->attr.dtype.vx_type | inputs[1]->attr.dtype.qnt_type << Q_SHIFT; - uint32_t inputs_types[3] = { 0 }; - vsi_bool supported[3] = { FALSE, FALSE, FALSE }; - int i = 0; - - inputs_types[0] = inputs[0]->attr.dtype.vx_type | inputs[0]->attr.dtype.qnt_type << Q_SHIFT; - inputs_types[2] = outputs[0]->attr.dtype.vx_type | outputs[0]->attr.dtype.qnt_type << Q_SHIFT; - if (inputs[2]) { - switch(inputs[1]->attr.dtype.vx_type) { - case D_F16: - case D_BF16: - case D_F32: - if(inputs[2]->attr.dtype.vx_type == (vsi_nn_type_e)D_F32) { - inputs_types[1] = weight_type; - } - break; - case D_I16: - case D_I8: - case D_U8: - if (inputs[2]->attr.dtype.vx_type == (vsi_nn_type_e)D_I32 || - inputs[2]->attr.dtype.vx_type == (vsi_nn_type_e)D_I64) { - inputs_types[1] = weight_type; - } - break; - default: - break; - } - } else { - inputs_types[1] = weight_type; - } - - for (i = 0; i < 3; i++) { - supported[i] = is_item_in_array(&inputs_types[i], valid_dtypes, - sizeof(uint32_t), _cnt_of_array(valid_dtypes)); - } - - ret = supported[0] && supported[1] && supported[2]; - } - - 
if(!ret) { - char* desc = generate_op_io_types_desc(inputs, - self->input.num, outputs, self->output.num); - VSILOGE("Inputs/Outputs data type not support: %s", desc); - vsi_nn_safe_free(desc); - return FALSE; - } - } - return ret; -} /* op_check() */ - -static vsi_bool op_setup - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_bool ret; - vx_nn_convolution_relu_pooling_params_t p; - vx_weights_biases_parameter_optimizations_ext_t opt; - vx_weights_biases_parameter_optimizations_ext_t * p_opt; - -#ifdef VX_CONVERT_POLICY_WRAP_ENABLE - if ( vsi_nn_compareVersion(self->graph, 1, 1, 21) == -1 ) - { - self->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE; - } -#endif - - ret = vsi_nn_OpSetup( VSI_NN_OP_FCL, self, inputs, outputs ); - - /* Prepare weight_bias */ - if(inputs[1]->wb == NULL) - { - if( _set_fc_relu_parameter( self, &p ) != VSI_SUCCESS ) - { - VSILOGE("set fc_relu weightbias parameter fail\n"); - return FALSE; - } - - p_opt = NULL; - memset( &opt, 0, sizeof( opt ) ); - if( outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC - || inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) - { - opt.inputZeroPoint = inputs[0]->attr.dtype.zero_point; - } - opt.zrl = -1; - opt.outputFormat = outputs[0]->attr.dtype.vx_type; - opt.num_of_input_dims = inputs[0]->attr.dim_num; - opt.num_of_output_dims = outputs[0]->attr.dim_num; - p_opt = &opt; - -#ifdef VSI_40BIT_VA_SUPPORT - { - vx_size size_input0[VSI_NN_MAX_DIM_NUM]; - vx_size size_output0[VSI_NN_MAX_DIM_NUM]; - size_t i = 0; - for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) - { - size_input0[i] = (vx_size)inputs[0]->attr.size[i]; - size_output0[i] = (vx_size)outputs[0]->attr.size[i]; - } - inputs[1]->wb = vxCreateWeightsBiasesParameterFromTensors3( - VX_CONVOLUTIONAL_NETWORK_FULLYCONNECTED_LAYER, - size_input0, - size_output0, - size_output0, - &p, - sizeof(p), - (vx_weights_biases_parameter_optimizations_t *)p_opt, - sizeof(opt), - inputs[1]->t, inputs[2]->t - ); - } -#else - { - uint32_t size_u32_input0[VSI_NN_MAX_DIM_NUM]; - uint32_t size_u32_output0[VSI_NN_MAX_DIM_NUM]; - size_t i = 0; - for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) - { - size_u32_input0[i] = (uint32_t)inputs[0]->attr.size[i]; - size_u32_output0[i] = (uint32_t)outputs[0]->attr.size[i]; - } - inputs[1]->wb = vxCreateWeightsBiasesParameterFromTensors3( - VX_CONVOLUTIONAL_NETWORK_FULLYCONNECTED_LAYER, - size_u32_input0, - size_u32_output0, - size_u32_output0, - &p, - sizeof(p), - (vx_weights_biases_parameter_optimizations_t *)p_opt, - sizeof(opt), - inputs[1]->t, inputs[2]->t - ); - } -#endif - if( p.pad_const ) - { - vxReleaseScalar( &p.pad_const ); - } - } - - if( NULL == inputs[1]->wb ) - { - VSILOGE( "Create weight bias fail." ); - ret = FALSE; - } - - return ret; -} /* op_setup() */ - #ifdef __cplusplus extern "C" { #endif @@ -350,10 +45,10 @@ DEF_OP_REG ( /* op_name */ FCL_RELU, /* init */ NULL, - /* compute */ op_compute, - /* deinit */ vsi_nn_op_common_deinit, - /* check */ op_check, - /* setup */ op_setup, + /* compute */ NULL, + /* deinit */ NULL, + /* check */ NULL, + /* setup */ NULL, /* optimize */ NULL, /* input_num */ 3, /* output_num */ 1 diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c b/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c index be32f48..e7d9358 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c @@ -164,13 +164,14 @@ static vsi_bool op_setup /* TODO: Add code to comput outputs' shape. 
*/ uint32_t i = 0; vsi_nn_gather_param * p = NULL; + uint32_t batch_dims = (uint32_t)self->nn_param.gather.batch_dims; if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { uint32_t j = 0; - uint32_t r_rank = vsi_nn_GetTensorIsScalar(inputs[0]) ? 0 : inputs[0]->attr.dim_num; - uint32_t q_rank = vsi_nn_GetTensorIsScalar(inputs[1]) ? 0 : inputs[1]->attr.dim_num; - uint32_t o_rank = r_rank + q_rank - 1; + uint32_t r_rank = vsi_nn_GetTensorIsScalar(inputs[0]) ? 0 : ((uint32_t)inputs[0]->attr.dim_num - batch_dims); + uint32_t q_rank = vsi_nn_GetTensorIsScalar(inputs[1]) ? 0 : ((uint32_t)inputs[1]->attr.dim_num - batch_dims); + uint32_t o_rank = r_rank + q_rank - 1 + batch_dims; p = &(self->nn_param.gather); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_generate_proposals.c b/src/tim/vx/internal/src/ops/vsi_nn_op_generate_proposals.c index 09e96a1..ea22bf7 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_generate_proposals.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_generate_proposals.c @@ -38,86 +38,6 @@ #define _INPUT_NUM (4) #define _OUTPUT_NUM (3) -#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) - -static vsi_status op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_FAILURE; - vsi_nn_kernel_param_t * param = NULL; - - param = vsi_nn_kernel_param_create(); - - vsi_nn_kernel_param_add_float32( param, "height_stride", self->nn_param.generate_proposals.height_stride ); - vsi_nn_kernel_param_add_float32( param, "width_stride", self->nn_param.generate_proposals.width_stride ); - vsi_nn_kernel_param_add_int32( param, "pre_nms_top_n", self->nn_param.generate_proposals.pre_nms_top_n); - vsi_nn_kernel_param_add_int32( param, "post_nms_top_n", self->nn_param.generate_proposals.post_nms_top_n); - vsi_nn_kernel_param_add_float32( param, "iou_threshold", self->nn_param.generate_proposals.iou_threshold ); - vsi_nn_kernel_param_add_float32( param, "min_size", self->nn_param.generate_proposals.min_size ); - - self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "cpu beckend conv2d", - inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param ); - - if( self->n ) - { - status = VSI_SUCCESS; - } - - vsi_nn_kernel_param_release( &param ); - return status; -} /* op_compute() */ - -static vsi_bool op_check - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - VSI_UNREFERENCED(self); - VSI_UNREFERENCED(inputs); - VSI_UNREFERENCED(outputs); - return TRUE; -} /* op_check() */ - -static vsi_bool op_setup - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) - { - vsi_nn_generate_proposals_param * p; - int32_t num_output_rois; - p = &(self->nn_param.generate_proposals); - num_output_rois = (int32_t)vsi_nn_GetElementNum(inputs[0]); - if(p->pre_nms_top_n > 0) - { - num_output_rois = p->pre_nms_top_n; - } - if(p->post_nms_top_n > 0) - { - num_output_rois = p->post_nms_top_n; - } - - outputs[0]->attr.dim_num = 1; - outputs[0]->attr.size[0] = num_output_rois; - - outputs[1]->attr.dim_num = 2; - outputs[1]->attr.size[0] = 4; - outputs[1]->attr.size[1] = num_output_rois; - - outputs[2]->attr.dim_num = 1; - outputs[2]->attr.size[0] = num_output_rois; - } - return TRUE; -} /* op_setup() */ #ifdef __cplusplus extern "C" { @@ -127,10 +47,10 @@ DEF_OP_REG ( /* op_name */ GENERATE_PROPOSALS, /* init */ NULL, - /* compute */ op_compute, - /* deinit */ vsi_nn_op_common_deinit, - /* check */ op_check, - /* setup */ op_setup, + /* 
compute */ NULL, + /* deinit */ NULL, + /* check */ NULL, + /* setup */ NULL, /* optimize */ NULL, /* input_num */ _INPUT_NUM, /* output_num */ _OUTPUT_NUM diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv1d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv1d.c index a404979..55f0ef3 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv1d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv1d.c @@ -37,13 +37,17 @@ #include "vsi_nn_internal_node.h" #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_constraint_check.h" +#include "vsi_nn_tensor_util_prv.h" #include "vsi_nn_error.h" #define _INPUT_NUM (3) #define _OUTPUT_NUM (1) static vsi_nn_tensor_t * _expand_tensor_dim - ( vsi_nn_graph_t * graph, vsi_nn_tensor_t *tensor, vsi_size_t * shape, vsi_size_t rank, vsi_size_t expand_dim ) + ( vsi_nn_graph_t * graph, vsi_nn_tensor_t *tensor, + vsi_size_t * shape, vsi_size_t rank, vsi_size_t expand_dim, + vsi_nn_opt_direction_e direction, + vsi_bool is_use_reshpe_node) { vsi_size_t new_shape[VSI_NN_MAX_DIM_NUM] = { 0 }; vsi_size_t i, cnt; @@ -66,8 +70,14 @@ static vsi_nn_tensor_t * _expand_tensor_dim { new_shape[cnt] = 1; } - - return vsi_nn_reshape_tensor( graph, tensor, new_shape, rank + 1 ); + if (is_use_reshpe_node) + { + return vsi_nn_kernel_insert_reshape_node(graph, tensor, new_shape, (uint32_t)(rank + 1), direction); + } + else + { + return vsi_nn_reshape_tensor(graph, tensor, new_shape, rank + 1); + } } /* _expand_tensor_dim() */ static vsi_status op_compute @@ -127,13 +137,13 @@ static vsi_bool op_setup } p->local->input = _expand_tensor_dim( self->graph, inputs[0], - inputs[0]->attr.size, inputs[0]->attr.dim_num, 0 ); + inputs[0]->attr.size, inputs[0]->attr.dim_num, 0, VSI_NN_OPTIMIZE_BACKWARD, TRUE); if (inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC && inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8) { p->local->weight = _expand_tensor_dim( self->graph, inputs[1], - inputs[1]->attr.size, inputs[1]->attr.dim_num, 0 ); + inputs[1]->attr.size, inputs[1]->attr.dim_num, 0, VSI_NN_OPTIMIZE_BACKWARD, FALSE); } else { @@ -160,7 +170,7 @@ static vsi_bool op_setup } p->local->output = _expand_tensor_dim( self->graph, outputs[0], - outputs[0]->attr.size, outputs[0]->attr.dim_num, 0 ); + outputs[0]->attr.size, outputs[0]->attr.dim_num, 0, VSI_NN_OPTIMIZE_FORWARD, TRUE); curr = vsi_nn_internal_new_node(self, VSI_NN_OP_GROUPED_CONV2D, 0, 0); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv2d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv2d.c index 772af14..72fadad 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv2d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv2d.c @@ -61,8 +61,10 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { +#define _TENSOR_LEN 64 vsi_bool res; uint32_t i; + char tensor_name[_TENSOR_LEN]; vsi_nn_grouped_conv2d_param *nn_param = &self->nn_param.grouped_conv2d; nn_param->local = (vsi_nn_grouped_conv2d_param_local_data*)malloc( sizeof(vsi_nn_grouped_conv2d_param_local_data)); @@ -197,6 +199,14 @@ static vsi_status op_compute sizeof(vx_nn_convolution_params_ext2_t), LOCAL()->output_tensor_group[i]->t ); + + memset(tensor_name, 0, sizeof(tensor_name)); + snprintf(tensor_name, sizeof(tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, i); + if(vxSetReferenceName((vx_reference)LOCAL()->output_tensor_group[i]->t, tensor_name) == VSI_FAILURE) + { + VSILOGW("Set uid %u copy node output name fail", self->uid); + return 
VSI_FAILURE; + } if( NULL == self->n ) { VSILOGE("Add vxConvolutionLayer fail, (GROUPED_CONV2D) at [%s : %d]\n", __FILE__, __LINE__); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c index 629486c..31f7abc 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c @@ -36,6 +36,7 @@ #include "vsi_nn_log.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" +#include "vsi_nn_error.h" #define _INPUT_NUM (3) #define _OUTPUT_NUM (1) @@ -85,6 +86,54 @@ static vsi_bool _is_3d_group_norm return FALSE; } /* _is_3d_group_norm() */ +static vsi_nn_tensor_t* _pad_tensor_per_pixel + ( + vsi_nn_graph_t *graph, + vsi_nn_tensor_t *input_tensor, + vsi_size_t scale_size_in, + vsi_size_t pad_size_per_pixel + ) +{ + float* f32_in_buffer = NULL; + float* f32_out_buffer = NULL; + vsi_size_t i = 0, j = 0; + vsi_nn_tensor_attr_t attr; + vsi_nn_tensor_t* output_tensor = NULL; + + f32_out_buffer= (float *)malloc(pad_size_per_pixel * scale_size_in * sizeof(float)); + CHECK_PTR_FAIL_GOTO( f32_out_buffer, "Create buffer fail.", final ); + memset(f32_out_buffer, 0, pad_size_per_pixel * scale_size_in * sizeof(float)); + f32_in_buffer = vsi_nn_ConvertTensorToFloat32Data(graph, input_tensor); + if (NULL == f32_in_buffer) + { + output_tensor = NULL; + goto final; + } + + for ( i = 0; i < scale_size_in; i++ ) + { + for (j = 0; j < pad_size_per_pixel; j ++) + { + f32_out_buffer[i * pad_size_per_pixel + j] = f32_in_buffer[i]; + } + } + + memcpy(&attr, &input_tensor->attr, sizeof(vsi_nn_tensor_attr_t)); + attr.size[0] = pad_size_per_pixel; + attr.size[1] = scale_size_in; + attr.dim_num = 2; + output_tensor = vsi_nn_CreateTensorFromData( + graph, + (uint8_t *)f32_out_buffer, + &attr); + CHECK_PTR_FAIL_GOTO( output_tensor, "Create tensor fail.", final ); +final: + vsi_nn_safe_free(f32_in_buffer) + vsi_nn_safe_free(f32_out_buffer) + + return output_tensor; +} + static vsi_status _op_compute ( vsi_nn_node_t * self, @@ -100,6 +149,7 @@ static vsi_status _op_compute vsi_nn_tensor_t * tmp_inputs[3] = {NULL, NULL, NULL}; vsi_nn_tensor_t * tmp_outputs[1] = {NULL}; vsi_nn_groupnorm_lcl_data *local = self->nn_param.groupnorm.lcl_data; + vsi_bool pad_scale_bias = FALSE; status = _try_set_high_presision_tensor(inputs); if (status != VSI_SUCCESS) @@ -123,6 +173,19 @@ static vsi_status _op_compute tmp_inputs[2] = inputs[2]; } + pad_scale_bias = vsi_nn_GetElementNum(inputs[1]) == (vsi_size_t)group_num && + (vsi_size_t)group_num < tmp_inputs[0]->attr.size[2]; + + if (pad_scale_bias) + { + tmp_inputs[1] = _pad_tensor_per_pixel(self->graph, tmp_inputs[1], + group_num, tmp_inputs[0]->attr.size[2] / group_num); + tmp_inputs[2] = _pad_tensor_per_pixel(self->graph, tmp_inputs[2], + group_num, tmp_inputs[0]->attr.size[2] / group_num); + CHECK_PTR_FAIL_GOTO( tmp_inputs[1], "Create tensor fail.", final ); + CHECK_PTR_FAIL_GOTO( tmp_inputs[2], "Create tensor fail.", final ); + } + param = vsi_nn_kernel_param_create(); vsi_nn_kernel_param_add_float32( param, "eps", eps ); vsi_nn_kernel_param_add_int32( param, "group_num", group_num ); @@ -139,6 +202,13 @@ static vsi_status _op_compute vsi_nn_kernel_param_release( &param ); } +final: + if (pad_scale_bias) + { + vsi_safe_release_tensor(tmp_inputs[1]); + vsi_safe_release_tensor(tmp_inputs[2]); + } + return status; } /* op_compute() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_heatmap_max_keypoint.c 
b/src/tim/vx/internal/src/ops/vsi_nn_op_heatmap_max_keypoint.c index cc4b443..e81dd75 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_heatmap_max_keypoint.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_heatmap_max_keypoint.c @@ -39,69 +39,6 @@ #define _INPUT_NUM (2) #define _OUTPUT_NUM (2) -static vsi_status op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_FAILURE; - - self->n = (vx_node)vsi_nn_kernel_selector( self->graph, - "heatmap_max_keypoint", - inputs, _INPUT_NUM, - outputs, _OUTPUT_NUM, NULL ); - - if( self->n ) - { - status = VSI_SUCCESS; - } - - return status; -} - -static vsi_bool op_check - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - VSI_UNREFERENCED(self); - VSI_UNREFERENCED(inputs); - VSI_UNREFERENCED(outputs); - /*TODO: Check tensor shapes. */ - return TRUE; -} /* op_check() */ - -static vsi_bool op_setup - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - VSI_UNREFERENCED(self); - - if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) - { - outputs[0]->attr.dim_num = 2; - outputs[0]->attr.size[0] = inputs[0]->attr.size[0]; - outputs[0]->attr.size[1] = inputs[0]->attr.size[3]; - } - - if ( VSI_NN_DIM_AUTO == outputs[1]->attr.dim_num ) - { - outputs[1]->attr.dim_num = 3; - outputs[1]->attr.size[0] = 2; - outputs[1]->attr.size[1] = inputs[0]->attr.size[0]; - outputs[1]->attr.size[2] = inputs[0]->attr.size[3]; - } - - return TRUE; -} /* op_setup() */ - #ifdef __cplusplus extern "C" { #endif @@ -110,10 +47,10 @@ DEF_OP_REG ( /* op_name */ HEATMAP_MAX_KEYPOINT, /* init */ NULL, - /* compute */ op_compute, - /* deinit */ vsi_nn_op_common_deinit, - /* check */ op_check, - /* setup */ op_setup, + /* compute */ NULL, + /* deinit */ NULL, + /* check */ NULL, + /* setup */ NULL, /* optimize */ NULL, /* input_num */ _INPUT_NUM, /* output_num */ _OUTPUT_NUM diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_imageprocess.c b/src/tim/vx/internal/src/ops/vsi_nn_op_imageprocess.c index 5386af7..5be9bb1 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_imageprocess.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_imageprocess.c @@ -34,91 +34,11 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "libnnext/vsi_nn_vxkernel.h" #include "utils/vsi_nn_constraint_check.h" #define _INPUT_NUM (1) #define _OUTPUT_NUM (1) -struct _scaletotensor_kernel_params -{ - int32_t ratio[2]; - int32_t offset[2]; - float mean[3]; - float scale; -}; - -typedef struct _scaletotensor_kernel_params scaletotensor_kernel_params_t; - - -static vsi_status op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status = VSI_SUCCESS; - - VSI_UNREFERENCED(self); - VSI_UNREFERENCED(inputs); - VSI_UNREFERENCED(outputs); - - return status; -} /* op_compute() */ - -static vsi_bool op_setup - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_nn_imageprocess_param * p; - uint32_t i; - p = (vsi_nn_imageprocess_param *)&(self->nn_param.imageprocess); - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) - { - /* TODO */ - if (inputs[0]->attr.dim_num != 4) - { - VSILOGE("Only support 4D tensor for image process!(IMAGEPROCESS)\n"); - return FALSE; - } - if (p->reverse_channel == TRUE && inputs[0]->attr.size[2] != 3) - { - VSILOGE("Only support 3 channels for reverse channel!(IMAGEPROCESS)\n"); - return FALSE; - } - - if 
(p->resize.type != VSI_NN_IMAGEPROCESS_RESIZE_NONE) - { - outputs[0]->attr.dim_num = p->resize.dim_num; - for(i = 0; i < (uint32_t)p->resize.dim_num; ++i) - { - outputs[0]->attr.size[i] = p->resize.length[i]; - } - } - else if (p->crop.enable == TRUE) - { - outputs[0]->attr.dim_num = p->crop.dim_num; - for(i = 0; i < (uint32_t)p->crop.dim_num; ++i) - { - outputs[0]->attr.size[i] = p->crop.length[i]; - } - } - else - { - // CWHN -> WHCN - outputs[0]->attr.size[0] = inputs[0]->attr.size[1]; - outputs[0]->attr.size[1] = inputs[0]->attr.size[2]; - outputs[0]->attr.size[2] = inputs[0]->attr.size[0]; - outputs[0]->attr.size[3] = inputs[0]->attr.size[3]; - } - } - return TRUE; -} /* op_setup() */ - vsi_status vsi_nn_op_imageprocess_single_node ( vsi_nn_graph_t *graph, @@ -150,10 +70,10 @@ DEF_OP_REG ( /* op_name */ IMAGEPROCESS, /* init */ NULL, - /* compute */ op_compute, - /* deinit */ vsi_nn_op_common_deinit, + /* compute */ NULL, + /* deinit */ NULL, /* check */ NULL, - /* setup */ op_setup, + /* setup */ NULL, /* optimize */ NULL, /* input_num */ _INPUT_NUM, /* output_num */ _OUTPUT_NUM diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c index 53c12ae..487e89c 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c @@ -131,15 +131,15 @@ static vsi_status op_compute vsi_nn_optimize_instance_norm_shape(inputs[0]->attr.size, inputs[0]->attr.dim_num, shape, &new_rank); - tmp_tensors[0] = vsi_nn_reshape_tensor( self->graph, - inputs[0], shape, new_rank ); + tmp_tensors[0] = vsi_nn_kernel_insert_reshape_node( self->graph, + inputs[0], shape, (uint32_t)new_rank, VSI_NN_OPTIMIZE_BACKWARD ); tmp_tensors[1] = inputs[1]; tmp_tensors[2] = inputs[2]; - tmp_tensors[3] = vsi_nn_reshape_tensor( self->graph, - outputs[0], shape, new_rank ); + tmp_tensors[3] = vsi_nn_kernel_insert_reshape_node( self->graph, + outputs[0], shape, (uint32_t)new_rank, VSI_NN_OPTIMIZE_FORWARD ); status = _try_set_high_presision_tensor(tmp_tensors); - if(status != VSI_SUCCESS) + if (status != VSI_SUCCESS) { VSILOGE("Set tensor attr of high presision fail"); return status; @@ -150,7 +150,7 @@ static vsi_status op_compute n = vsi_nn_kernel_selector( self->graph, "instance_norm", tmp_tensors, _INPUT_NUM, &tmp_tensors[3], _OUTPUT_NUM, param ); - if( n != NULL ) + if ( n != NULL ) { self->n = (vx_node)n; status = VSI_SUCCESS; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c index a90ae59..fe22781 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c @@ -55,10 +55,12 @@ static vsi_status op_compute float eps = self->nn_param.layernorm.eps; int32_t axis = self->nn_param.layernorm.axis; +#if (!VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT) if ( self->nn_param.layernorm.local->use_internal_node ) { return vsi_nn_internal_compute_node( self ); } +#endif param = vsi_nn_kernel_param_create(); @@ -88,14 +90,18 @@ static vsi_bool op_setup ) { vsi_bool ret = TRUE; + +#if (!VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT) int32_t axis = 0; vsi_nn_internal_node_t* curr = NULL; +#endif if ( NULL == self ) { return FALSE; } +#if (!VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT) axis = self->nn_param.layernorm.axis; vsi_nn_internal_init_node_wksp( self ); @@ -147,11 +153,14 @@ static vsi_bool op_setup ret = vsi_nn_internal_setup_node( self, curr ); } else +#endif { ret = 
vsi_nn_op_common_setup(self, inputs, outputs); } +#if (!VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT) final: +#endif return ret; } @@ -236,9 +245,11 @@ static vsi_status op_init self->nn_param.layernorm.axis = 0; +#if (!VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT) self->nn_param.layernorm.local = (vsi_nn_layernorm_lcl_data *)malloc(sizeof(vsi_nn_layernorm_lcl_data)); memset(self->nn_param.layernorm.local, 0x00, sizeof(vsi_nn_layernorm_lcl_data)); self->nn_param.layernorm.local->use_internal_node = FALSE; +#endif return status; } @@ -250,7 +261,9 @@ static vsi_status op_deinit { vsi_nn_safe_free(self->nn_param.layernorm.local); +#if (!VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT) vsi_nn_internal_deinit_node_wksp( self ); +#endif vsi_nn_op_common_deinit(self); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_log_softmax.c b/src/tim/vx/internal/src/ops/vsi_nn_op_log_softmax.c index 34c329c..377ba26 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_log_softmax.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_log_softmax.c @@ -46,14 +46,9 @@ static vsi_status _log_softmax_op_compute ) { vsi_status status; - vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; - vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 0 } }; - uint32_t rank_in = 0; int32_t axis = 0; - int32_t new_axis = 0; float betaValue = 0; - vsi_bool ret; vsi_nn_kernel_param_t * param = NULL; vsi_nn_log_softmax_param * p = NULL; @@ -69,33 +64,19 @@ static vsi_status _log_softmax_op_compute // TODO: This optimzie is a hack for gpu path, // it should be moved to gpu kernel setup. - ret = vsi_nn_kernel_optimize_softmax_shape( - inputs[0]->attr.size, inputs[0]->attr.dim_num, axis, - shapes[0], &rank_in, &new_axis); - if( ret ) - { - // Add params - param =vsi_nn_kernel_param_create(); + param =vsi_nn_kernel_param_create(); - vsi_nn_kernel_param_add_int32( param, "axis", new_axis ); - vsi_nn_kernel_param_add_float32( param, "beta", betaValue ); + vsi_nn_kernel_param_add_int32( param, "axis", axis ); + vsi_nn_kernel_param_add_float32( param, "beta", betaValue ); - reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, - inputs[0], shapes[0], rank_in ); - reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, - outputs[0], shapes[0], rank_in ); + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + kernel_name, + inputs, 1, + outputs, 1, param ); - self->n = (vx_node)vsi_nn_kernel_selector( self->graph, - kernel_name, - &reshape_tensors[0], 1, - &reshape_tensors[1], 1, param ); + vsi_nn_kernel_param_release( &param ); - vsi_nn_ReleaseTensor( &reshape_tensors[0] ); - vsi_nn_ReleaseTensor( &reshape_tensors[1] ); - - vsi_nn_kernel_param_release( &param ); - } if( self->n ) { status = VSI_SUCCESS; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lstm_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lstm_ovxlib.c index ebd17a3..65f22a3 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_lstm_ovxlib.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lstm_ovxlib.c @@ -224,6 +224,7 @@ static vsi_bool op_setup uint32_t batch_size = 0; uint32_t time_step = 0; uint32_t i = 0; + size_t k = 0; vsi_bool ret = FALSE; vsi_status status = VSI_FAILURE; @@ -329,6 +330,17 @@ static vsi_bool op_setup curr->node->nn_param.lstmunit_ovxlib.forget_bias = curr_param->forget_bias; curr->node->nn_param.lstmunit_ovxlib.proj_clip = curr_param->proj_clip; curr->node->nn_param.lstmunit_ovxlib.recurrent_activation = curr_param->recurrent_activation; + if ( reshape_output->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 || + reshape_output->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 ) + { + for (k = 0; k < 
_cnt_of_array( curr_param->internal_dtype ); k++) + { + if (curr_param->internal_dtype[k].vx_type == VSI_NN_TYPE_NONE) + { + curr_param->internal_dtype[k] = reshape_output->attr.dtype; + } + } + } memcpy( curr->node->nn_param.lstmunit_ovxlib.internal_dtype, curr_param->internal_dtype, sizeof( curr_param->internal_dtype ) ); curr->inputs[LSTMUNIT_INPUT_INPUT] = reshape_output; @@ -361,6 +373,21 @@ static vsi_bool op_setup curr->inputs[LSTMUNIT_INPUT_LAYERNORM_C] = inputs[LSTM_INPUT_LAYERNORM_C]; curr->inputs[LSTMUNIT_INPUT_LAYERNORM_O] = inputs[LSTM_INPUT_LAYERNORM_O]; + if (self->input.num > LSTM_INPUT_BIAS_R2I) + { + curr->inputs[LSTMUNIT_INPUT_BIAS_R2I] = inputs[LSTM_INPUT_BIAS_R2I]; + curr->inputs[LSTMUNIT_INPUT_BIAS_R2F] = inputs[LSTM_INPUT_BIAS_R2F]; + curr->inputs[LSTMUNIT_INPUT_BIAS_R2C] = inputs[LSTM_INPUT_BIAS_R2C]; + curr->inputs[LSTMUNIT_INPUT_BIAS_R2O] = inputs[LSTM_INPUT_BIAS_R2O]; + } + else + { + curr->inputs[LSTMUNIT_INPUT_BIAS_R2I] = NULL; + curr->inputs[LSTMUNIT_INPUT_BIAS_R2F] = NULL; + curr->inputs[LSTMUNIT_INPUT_BIAS_R2C] = NULL; + curr->inputs[LSTMUNIT_INPUT_BIAS_R2O] = NULL; + } + curr->outputs[LSTMUNIT_OUTPUT_OUTPUT] = lstmunit_out0; curr->outputs[LSTMUNIT_OUTPUT_H_STATE] = lstmunit_out1; curr->outputs[LSTMUNIT_OUTPUT_C_STATE] = lstmunit_out2; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c index 4bf4443..755c63d 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c @@ -303,7 +303,7 @@ static vsi_bool op_setup vsi_nn_internal_tensor_t* input_add_aux_input_fc_outputs[LSTMUNIT_IFCO_GATE_COUNT] = { NULL }; vsi_nn_internal_tensor_t* recurrent_fc_outputs[LSTMUNIT_IFCO_GATE_COUNT] = { NULL }; vsi_nn_internal_tensor_t* layernorm_outputs[LSTMUNIT_IFCO_GATE_COUNT] = { NULL }; - vsi_nn_tensor_t* bias_tensors[LSTMUNIT_IFCO_GATE_COUNT] = { NULL }; + vsi_nn_tensor_t* bias_tensors[LSTMUNIT_IFCO_GATE_COUNT * 2] = { NULL }; vsi_nn_tensor_t* zero_bias_tensor = NULL; vsi_nn_internal_node_t* curr = NULL; int32_t ifco_start_index = 0; @@ -362,7 +362,7 @@ static vsi_bool op_setup setup_op_shapes(self, inputs, outputs); - for( i = 0; i < LSTMUNIT_IFCO_GATE_COUNT; i++) + for( i = 0; i < LSTMUNIT_IFCO_GATE_COUNT * 2; i++) { if( p->local->use_layer_norm || p->local->use_hybrid ) { @@ -370,7 +370,18 @@ static vsi_bool op_setup } else { - bias_tensors[i] = inputs[LSTMUNIT_INPUT_BIAS_I + i]; + if(i < LSTMUNIT_IFCO_GATE_COUNT) + { + bias_tensors[i] = inputs[LSTMUNIT_INPUT_BIAS_I + i]; + } + else if(self->input.num > LSTM_INPUT_BIAS_R2I) + { + bias_tensors[i] = inputs[LSTMUNIT_INPUT_BIAS_R2I + i - LSTMUNIT_IFCO_GATE_COUNT]; + } + else + { + bias_tensors[i] = NULL; + } } } @@ -486,7 +497,7 @@ static vsi_bool op_setup recurrent_fc_outputs[i] = create_tp_fc(self, inputs[LSTMUNIT_INPUT_H_STATE], inputs[LSTMUNIT_INPUT_WEIGHT_R2I + i], - NULL, + bias_tensors[LSTMUNIT_IFCO_GATE_COUNT + i], &p->internal_dtype[LSTMUNIT_QUANTIZE_PARAM_R2I + i], use_virtual_tensor); CHECK_PTR_FAIL_GOTO( recurrent_fc_outputs[i], "Create tensor fail.", final ); @@ -506,7 +517,7 @@ static vsi_bool op_setup vsi_nn_internal_tensor_t* tmp = create_nn_fc(self, recurrent_input_tensor->t, inputs[LSTMUNIT_INPUT_WEIGHT_R2I + i], - NULL, + bias_tensors[LSTMUNIT_IFCO_GATE_COUNT + i], kernel_h, kernel_w, &p->internal_dtype[LSTMUNIT_QUANTIZE_PARAM_R2I + i], use_virtual_tensor); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_permute.c 
b/src/tim/vx/internal/src/ops/vsi_nn_op_permute.c index 146ee33..5e91fbc 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_permute.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_permute.c @@ -37,6 +37,7 @@ #include "utils/vsi_nn_dtype_util.h" #include "utils/vsi_nn_constraint_check.h" +#if !(VX_TRANSPOSE_OPT_SUPPORT) static vsi_bool _is_same_memory_shape ( vsi_nn_node_t * self, @@ -116,6 +117,7 @@ static vsi_bool _is_same_quant return TRUE; } /* _is_same_quant */ +#endif static vsi_status op_compute ( @@ -242,6 +244,14 @@ static vsi_status op_optimize status = VSI_SUCCESS; +#if (VX_TRANSPOSE_OPT_SUPPORT) + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); + VSI_UNREFERENCED(direction); + self->nn_param.permute.local.initialized = FALSE; + + return status; +#else if (_is_same_memory_shape(self, inputs, outputs) == FALSE || _is_same_quant(self, inputs, outputs) == FALSE || (inputs[0]->t != NULL && outputs[0]->t != NULL)) @@ -285,6 +295,7 @@ static vsi_status op_optimize } return status; +#endif } /* op_optimize() */ #ifdef __cplusplus diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c index dea1770..682628c 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c @@ -132,7 +132,9 @@ static vsi_bool op_setup p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR || p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR_SEP || p->type == VSI_NN_SOURCE_FORMAT_IMAGE_YUYV422 || - p->type == VSI_NN_SOURCE_FORMAT_IMAGE_UYVY422 + p->type == VSI_NN_SOURCE_FORMAT_IMAGE_UYVY422 || + p->type == VSI_NN_SOURCE_FORMAT_IMAGE_NV12_RGGB || + p->type == VSI_NN_SOURCE_FORMAT_IMAGE_NV21_BGGR ) { uint32_t i = 0; @@ -487,6 +489,8 @@ static vsi_bool op_setup break; case VSI_NN_SOURCE_FORMAT_IMAGE_NV21: case VSI_NN_SOURCE_FORMAT_IMAGE_NV12: + case VSI_NN_SOURCE_FORMAT_IMAGE_NV21_BGGR: + case VSI_NN_SOURCE_FORMAT_IMAGE_NV12_RGGB: { curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_NV12, 0, 0 ); CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); @@ -514,10 +518,18 @@ static vsi_bool op_setup { curr->node->nn_param.pre_process_nv12.nv_type = VSI_NN_YUV_TYPE_NV12; } - else + else if (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_NV21) { curr->node->nn_param.pre_process_nv12.nv_type = VSI_NN_YUV_TYPE_NV21; } + else if (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_NV12_RGGB) + { + curr->node->nn_param.pre_process_nv12.nv_type = VSI_NN_YUV_TYPE_NV12_RGGB; + } + else + { + curr->node->nn_param.pre_process_nv12.nv_type = VSI_NN_YUV_TYPE_NV21_BGGR; + } curr->node->nn_param.pre_process_nv12.reverse_channel = p->reverse_channel; curr->node->nn_param.pre_process_nv12.rect.left = p->rect.left; @@ -618,7 +630,9 @@ static vsi_bool op_setup p->type == VSI_NN_SOURCE_FORMAT_IMAGE_BGRA || p->type == VSI_NN_SOURCE_FORMAT_IMAGE_GRAY || (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR && !enable_rgb88_planar_nhwc) || - (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR_SEP && !enable_rgb88_planar_nhwc) + (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR_SEP && !enable_rgb88_planar_nhwc) || + p->type == VSI_NN_SOURCE_FORMAT_IMAGE_NV12_RGGB || + p->type == VSI_NN_SOURCE_FORMAT_IMAGE_NV21_BGGR ) { if (layout == VSI_NN_DEST_LAYOUT_NHWC) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c index 7fa635a..f02fff9 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c +++ 
b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c @@ -63,7 +63,15 @@ static vsi_status op_compute vsi_nn_kernel_param_add_int32( param, "enable_perm", self->nn_param.pre_process_nv12.local->enable_perm ); vsi_nn_kernel_param_add_int32( param, "enable_copy", self->nn_param.pre_process_nv12.local->enable_copy ); vsi_nn_kernel_param_add_int32( param, "nv_type", self->nn_param.pre_process_nv12.nv_type ); - n = vsi_nn_kernel_selector( self->graph, "pre_process_nv12", inputs, 2, outputs, 1, param ); + if (self->nn_param.pre_process_nv12.nv_type == VSI_NN_YUV_TYPE_NV12 || + self->nn_param.pre_process_nv12.nv_type == VSI_NN_YUV_TYPE_NV21) + { + n = vsi_nn_kernel_selector( self->graph, "pre_process_nv12", inputs, 2, outputs, 1, param ); + } + else + { + n = vsi_nn_kernel_selector( self->graph, "pre_process_nv12_rggb", inputs, 2, outputs, 1, param ); + } if( n != NULL ) { self->n = (vx_node)n; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_quantized_16bit_lstm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_quantized_16bit_lstm.c index c203fdd..095d4d6 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_quantized_16bit_lstm.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_quantized_16bit_lstm.c @@ -35,65 +35,8 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "libnnext/vsi_nn_vxkernel.h" #include "vsi_nn_internal_node.h" -static vsi_status op_compute - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - vsi_status status; - status = VSI_FAILURE; - - /* TODO */ - /* example code : add op */ - /* - self->n = vxTensorAddNode( self->graph->g, inputs[0]->t, inputs[1]->t, - VX_CONVERT_POLICY_SATURATE, outputs[0]->t ); - */ - - VSI_UNREFERENCED(inputs); - VSI_UNREFERENCED(outputs); - - if( NULL != self->n ) - { - status = VSI_SUCCESS; - } - return status; -} /* op_compute() */ - -static vsi_bool op_check - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - VSI_UNREFERENCED(self); - VSI_UNREFERENCED(inputs); - VSI_UNREFERENCED(outputs); - /*TODO: Check tensor shapes. */ - return TRUE; -} /* op_check() */ - -static vsi_bool op_setup - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs - ) -{ - VSI_UNREFERENCED(self); - VSI_UNREFERENCED(inputs); - VSI_UNREFERENCED(outputs); - - /* TODO: Add code to comput outputs' shape. 
*/ - return TRUE; -} /* op_setup() */ - #ifdef __cplusplus extern "C" { #endif @@ -102,10 +45,10 @@ DEF_OP_REG ( /* op_name */ QUANTIZED_16BIT_LSTM, /* init */ NULL, - /* compute */ op_compute, - /* deinit */ vsi_nn_op_common_deinit, - /* check */ op_check, - /* setup */ op_setup, + /* compute */ NULL, + /* deinit */ NULL, + /* check */ NULL, + /* setup */ NULL, /* optimize */ NULL, /* input_num */ Q16_LSTM_INPUT_CNT, /* output_num */ Q16_LSTM_OUTPUT_CNT diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c index 523eeb4..418c6a0 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c @@ -55,7 +55,7 @@ static vsi_status op_compute self->nn_param.reshape.local.initialized == FALSE) { vsi_status status = VSI_SUCCESS; -#ifdef VX_REMOVE_RESHAPE_SUPPORT +#if VX_REMOVE_RESHAPE_SUPPORT vsi_nn_tensor_attr_t attr; vsi_nn_tensor_t *dims_tensor = NULL; vx_nn_reshape_params_t reshape_param; @@ -147,8 +147,11 @@ static vsi_status op_optimize vsi_status status; status = VSI_SUCCESS; -#ifdef VX_REMOVE_RESHAPE_SUPPORT +#if VX_REMOVE_RESHAPE_SUPPORT self->nn_param.reshape.local.initialized = FALSE; + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); + VSI_UNREFERENCED(direction); #else if ( vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape2.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape2.c index 4395961..6e1c313 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape2.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape2.c @@ -53,7 +53,7 @@ static vsi_status op_compute if (inputs[0]->t != NULL && outputs[0]->t != NULL && self->nn_param.reshape2.local->initialized == FALSE) { -#ifdef VX_REMOVE_RESHAPE_SUPPORT +#if VX_REMOVE_RESHAPE_SUPPORT vsi_nn_tensor_attr_t attr; vsi_nn_tensor_t *dims_tensor = NULL; vx_nn_reshape_params_t reshape_param; @@ -179,8 +179,11 @@ static vsi_status op_optimize vsi_status status; status = VSI_SUCCESS; -#ifdef VX_REMOVE_RESHAPE_SUPPORT +#if VX_REMOVE_RESHAPE_SUPPORT self->nn_param.reshape2.local->initialized = FALSE; + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); + VSI_UNREFERENCED(direction); #else if ( vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c index 1a719af..97fad8f 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c @@ -105,6 +105,10 @@ static vsi_status op_compute snprintf(kernel_name, sizeof(kernel_name), "resize_bilinear"); break; + case VSI_NN_INTERPOLATION_CUBIC: + snprintf(kernel_name, sizeof(kernel_name), + "resize_cubic"); + break; default: break; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd_update.c b/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd_update.c index e3e19ad..f6721b6 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd_update.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd_update.c @@ -53,6 +53,7 @@ static vsi_status op_compute uint32_t idx_num = 1; vsi_size_t *input_size = inputs[2]->attr.size; uint32_t dims_num = inputs[2]->attr.dim_num; + vsi_nn_reduction_type_e reduction = self->nn_param.scatter_nd_update.reduction; if (inputs[1]->attr.dim_num > 1) { @@ -75,7 +76,17 @@ static vsi_status op_compute vsi_nn_kernel_param_add_int32( param, "block_size", block_size ); 
vsi_nn_kernel_param_add_int32( param, "coord_dim", coord_dim ); vsi_nn_kernel_param_add_int32( param, "idx_num", idx_num ); - n = vsi_nn_kernel_selector( self->graph, "scatter_nd_update", inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param ); + vsi_nn_kernel_param_add_int32( param, "reduction", reduction ); + if (reduction > VSI_NN_REDUCTION_TYPE_NONE) + { + n = vsi_nn_kernel_selector( self->graph, "scatter_nd_update_reduction", + inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param ); + } + else + { + n = vsi_nn_kernel_selector( self->graph, "scatter_nd_update", + inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param ); + } if ( n != NULL ) { self->n = (vx_node)n; @@ -155,6 +166,18 @@ static vsi_bool op_setup return TRUE; } /* op_setup() */ +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + self->nn_param.scatter_nd_update.reduction = VSI_NN_REDUCTION_TYPE_NONE; + + return status; +} /* op_init() */ + static vsi_status op_deinit ( vsi_nn_node_t * self @@ -172,7 +195,7 @@ extern "C" { DEF_OP_REG ( /* op_name */ SCATTER_ND_UPDATE, - /* init */ NULL, + /* init */ op_init, /* compute */ op_compute, /* deinit */ op_deinit, /* check */ op_check, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c b/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c index d6e6e90..84c2dd7 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c @@ -572,6 +572,10 @@ static vsi_bool _build_strided_slice_params(vsi_nn_strided_slice_param * op_para const int32_t *stride_dims = op_params->stride_dims; strided_slice_param *params = &op_params->lcl2_data->params; + params->begin_dims_num = 0; + params->end_dims_num = 0; + params->stride_dims_num = 0; + begin_mask = _reverse_mask_bits(begin_mask, input_dims); end_mask = _reverse_mask_bits(end_mask, input_dims); shrink_axis_mask = _reverse_mask_bits(shrink_axis_mask, input_dims); @@ -762,7 +766,8 @@ static vsi_status op_optimize vsi_size_t output_elements = 0; /* Only forward run stride_slice's optimize */ - if ( direction == VSI_NN_OPTIMIZE_BACKWARD ) + if ( direction == VSI_NN_OPTIMIZE_BACKWARD || + !self->graph->ctx->options.enable_slice_optimize ) { return status; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c b/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c index ff8c0e0..0be22cd 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_topk.c @@ -111,6 +111,11 @@ static vsi_status op_compute vsi_nn_tensor_t * out1_tensor = NULL; vsi_bool ret = FALSE; + if (inputs[0]->attr.size[axis] == 1) + { + return vsi_nn_internal_compute_node( self ); + } + ret = vsi_nn_kernel_optimize_softmax_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num, axis, shapes[0], &rank_in, &new_axis0); @@ -259,13 +264,12 @@ static vsi_bool op_setup { /* TODO: Add code to comput outputs' shape. 
*/ uint32_t i; + vsi_nn_topk_param * p; + + p = &(self->nn_param.topk); if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { - vsi_nn_topk_param * p; - - p = &(self->nn_param.topk); - outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; outputs[0]->attr.size[p->axis] = p->k; for (i = 0; i < inputs[0]->attr.dim_num; i++) @@ -280,10 +284,6 @@ static vsi_bool op_setup if ( VSI_NN_DIM_AUTO == outputs[1]->attr.dim_num ) { - vsi_nn_topk_param * p; - - p = &(self->nn_param.topk); - outputs[1]->attr.dim_num = inputs[0]->attr.dim_num; outputs[1]->attr.size[p->axis] = p->k; for (i = 0; i < inputs[0]->attr.dim_num; i++) @@ -296,9 +296,58 @@ static vsi_bool op_setup } } + if (inputs[0]->attr.size[p->axis] == 1) + { + vsi_nn_internal_node_t* curr = NULL; + vsi_nn_internal_tensor_t* const0_input = NULL; + vsi_nn_tensor_attr_t attr; + + vsi_nn_internal_init_node_wksp(self); + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 1, 1); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); + curr->inputs[0] = inputs[0]; + curr->outputs[0] = outputs[0]; + vsi_nn_internal_setup_node(self, curr); + + memcpy(&attr, &outputs[1]->attr, sizeof(vsi_nn_tensor_attr_t)); + attr.vtl = FALSE; + attr.is_const = TRUE; + + const0_input = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + CHECK_PTR_FAIL_GOTO(const0_input, "Create tensor failed", final); + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 1, 1); + CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final); + curr->inputs[0] = const0_input->t; + curr->outputs[0] = outputs[1]; + vsi_nn_internal_setup_node(self, curr); + } + return TRUE; +final: + return FALSE; } /* op_setup() */ +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + vsi_nn_topk_param * p; + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); + + p = &(self->nn_param.topk); + if (inputs[0]->attr.size[p->axis] == 1) + { + return vsi_nn_internal_optimize_node( self, direction ); + } + + return VSI_SUCCESS; +} /* op_optimize() */ + static vsi_status op_init ( vsi_nn_node_t * self @@ -310,6 +359,17 @@ static vsi_status op_init return status; } /* op_init() */ +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_nn_internal_deinit_node_wksp(self); + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + #ifdef __cplusplus extern "C" { #endif @@ -319,10 +379,10 @@ DEF_OP_REG /* op_name */ TOPK, /* init */ op_init, /* compute */ op_compute, - /* deinit */ vsi_nn_op_common_deinit, + /* deinit */ op_deinit, /* check */ op_check, /* setup */ op_setup, - /* optimize */ NULL, + /* optimize */ op_optimize, /* input_num */ _INPUT_NUM, /* output_num */ _OUTPUT_NUM ); diff --git a/src/tim/vx/internal/src/quantization/vsi_nn_asymmetric_affine.c b/src/tim/vx/internal/src/quantization/vsi_nn_asymmetric_affine.c index 1025604..2088eba 100644 --- a/src/tim/vx/internal/src/quantization/vsi_nn_asymmetric_affine.c +++ b/src/tim/vx/internal/src/quantization/vsi_nn_asymmetric_affine.c @@ -75,10 +75,13 @@ vsi_bool vsi_nn_QuantAffineCheck switch (dtype) { + case VSI_NN_TYPE_UINT4: case VSI_NN_TYPE_UINT8: case VSI_NN_TYPE_UINT16: case VSI_NN_TYPE_UINT32: + case VSI_NN_TYPE_INT4: case VSI_NN_TYPE_INT8: + case VSI_NN_TYPE_INT16: { double product_scale = (double)input->attr.dtype.scale * (double)weight->attr.dtype.scale; const double acuity_round_decimals = 1e-8; diff --git a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c 
b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c index e862b9a..eb02639 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c @@ -467,6 +467,8 @@ static _op_param_gen_t s_op_gen[] = /* LPNORM */ NULL, /* RESIZE_3D */ NULL, /* REDUCEL2 */ NULL, + /* CROP_AND_RESIZE */ NULL, + /* TAN */ NULL, }; _compiler_assert( _cnt_of_array(s_op_gen) == VSI_NN_OP_NUM, vsi_nn_code_generator_c ); diff --git a/src/tim/vx/internal/src/utils/vsi_nn_util.c b/src/tim/vx/internal/src/utils/vsi_nn_util.c index 82d1aaa..6f91f99 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_util.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_util.c @@ -152,6 +152,33 @@ char* vsi_nn_getenv return var; }; +int32_t vsi_nn_getenv_asint + ( + const char* env, + int32_t default_value + ) +{ + int32_t value = default_value; + #ifdef __ANDROID__ + { + char value_str[100]; + int32_t status = __system_property_get(env, value_str); + if (status) { + value = atoi(value_str); + } + } + #else + { + char* env_s = vsi_nn_getenv(env); + if (env_s) { + value = atoi(env_s); + } + } + #endif + + return value; +} + FILE* vsi_nn_fopen ( const char * file_name, diff --git a/src/tim/vx/internal/src/vip/virtual_device.cpp b/src/tim/vx/internal/src/vip/virtual_device.cpp index 3568f69..0635843 100644 --- a/src/tim/vx/internal/src/vip/virtual_device.cpp +++ b/src/tim/vx/internal/src/vip/virtual_device.cpp @@ -227,6 +227,10 @@ uint32_t IDevice::Id() const{ return device_->Id(); } +bool IDevice::GraphSubmit(vsi_nn_graph_t* graph, bool (*func)(const void*), data_t data) { + return device_->GraphSubmit(graph, func_t(func), data); +} + bool IDevice::GraphSubmit(vsi_nn_graph_t* graph, func_t func, data_t data) { return device_->GraphSubmit(graph, func, data); } diff --git a/src/tim/vx/internal/src/vsi_nn_context.c b/src/tim/vx/internal/src/vsi_nn_context.c index 99a5e79..fa58045 100644 --- a/src/tim/vx/internal/src/vsi_nn_context.c +++ b/src/tim/vx/internal/src/vsi_nn_context.c @@ -93,70 +93,53 @@ final: return status; } -int32_t vsi_nn_getEnv(const char* name, char** env_s) { - int32_t ret = 0; - *env_s = vsi_nn_getenv(name); - if (*env_s) { - ret = TRUE; - } - return ret; -} - +#if (defined(__ANDROID__)) && (ANDROID_SDK_VERSION >= 30) +static const char* ENV_ENABLE_SHADER = "vendor.VIV_VX_ENABLE_SHADER"; +static const char* ENV_ENABLE_OPCHECK = "vendor.VSI_NN_ENABLE_OPCHECK"; +static const char* ENV_ENABLE_CONCAT_OPTIMIZE = "vendor.VSI_NN_ENABLE_CONCAT_OPTIMIZE"; +static const char* ENV_ENABLE_I8TOU8 = "vendor.VSI_NN_ENABLE_I8TOU8"; +static const char* ENV_ENABLE_DATACONVERT_OPTIMIZE = "vendor.VSI_NN_ENABLE_DATACONVERT_OPTIMIZE"; +static const char* ENV_ENABLE_STREAM_PROCESSOR = "vendor.VSI_VX_ENABLE_STREAM_PROCESSOR"; +static const char* ENV_FORCE_RGB888_OUT_NHWC = "vendor.VSI_NN_FORCE_RGB888_OUT_NHWC"; +static const char* ENV_ENABLE_SLICE_OPTIMIZE = "vendor.VSI_NN_ENABLE_SLICE_OPTIMIZE"; +static const char* ENV_ENABLE_BATCH_OPT = "vendor.VSI_VX_ENABLE_BATCH_OPT"; +#else +static const char* ENV_ENABLE_SHADER = "VIV_VX_ENABLE_SHADER"; +static const char* ENV_ENABLE_OPCHECK = "VSI_NN_ENABLE_OPCHECK"; +static const char* ENV_ENABLE_CONCAT_OPTIMIZE = "VSI_NN_ENABLE_CONCAT_OPTIMIZE"; +static const char* ENV_ENABLE_I8TOU8 = "VSI_NN_ENABLE_I8TOU8"; +static const char* ENV_ENABLE_DATACONVERT_OPTIMIZE = "VSI_NN_ENABLE_DATACONVERT_OPTIMIZE"; +static const char* ENV_ENABLE_STREAM_PROCESSOR = "VSI_VX_ENABLE_STREAM_PROCESSOR"; +static const char* ENV_FORCE_RGB888_OUT_NHWC = 
"VSI_NN_FORCE_RGB888_OUT_NHWC"; +static const char* ENV_ENABLE_SLICE_OPTIMIZE = "VSI_NN_ENABLE_SLICE_OPTIMIZE"; +static const char* ENV_ENABLE_BATCH_OPT = "VSI_VX_ENABLE_BATCH_OPT"; +#endif static vsi_status vsi_nn_initOptions ( vsi_nn_runtime_option_t *options ) { - char* env_s = NULL; + int32_t default_value = 1; - env_s = NULL; - options->enable_shader = 1; - if (vsi_nn_getEnv("VIV_VX_ENABLE_SHADER", &env_s) && env_s) - { - options->enable_shader = atoi(env_s); - } - - env_s = NULL; - options->enable_opcheck = 1; - if (vsi_nn_getEnv("VSI_NN_ENABLE_OPCHECK", &env_s) && env_s) - { - options->enable_opcheck = atoi(env_s); - } - - env_s = NULL; - options->enable_concat_optimize = 1; - if (vsi_nn_getEnv("VSI_NN_ENABLE_CONCAT_OPTIMIZE", &env_s) && env_s) - { - options->enable_concat_optimize = atoi(env_s); - } - - env_s = NULL; - options->enable_asymi8_to_u8 = 1; - if (vsi_nn_getEnv("VSI_NN_ENABLE_I8TOU8", &env_s) && env_s) - { - options->enable_asymi8_to_u8 = atoi(env_s); - } - - env_s = NULL; - options->enable_dataconvert_optimize = 1; - if (vsi_nn_getEnv("VSI_NN_ENABLE_DATACONVERT_OPTIMIZE", &env_s) && env_s) - { - options->enable_dataconvert_optimize = atoi(env_s); - } - - env_s = NULL; - options->enable_stream_processor = 1; - if (vsi_nn_getEnv("VSI_VX_ENABLE_STREAM_PROCESSOR", &env_s) && env_s) - { - options->enable_stream_processor = atoi(env_s); - } - - env_s = NULL; - options->enable_rgb88_planar_nhwc = 0; - if (vsi_nn_getEnv("VSI_NN_FORCE_RGB888_OUT_NHWC", &env_s) && env_s) - { - options->enable_rgb88_planar_nhwc = atoi(env_s); - } + options->enable_shader = vsi_nn_getenv_asint(ENV_ENABLE_SHADER, 1); + options->enable_opcheck = vsi_nn_getenv_asint(ENV_ENABLE_OPCHECK, 1); +#if (VX_CONCAT_OPT_SUPPORT) + default_value = 0; +#else + default_value = 1; +#endif + options->enable_concat_optimize = vsi_nn_getenv_asint(ENV_ENABLE_CONCAT_OPTIMIZE, default_value); + options->enable_asymi8_to_u8 = vsi_nn_getenv_asint(ENV_ENABLE_I8TOU8, 1); + options->enable_dataconvert_optimize = vsi_nn_getenv_asint(ENV_ENABLE_DATACONVERT_OPTIMIZE, 1); + options->enable_stream_processor = vsi_nn_getenv_asint(ENV_ENABLE_STREAM_PROCESSOR, 1); + options->enable_rgb88_planar_nhwc = vsi_nn_getenv_asint(ENV_FORCE_RGB888_OUT_NHWC, 0); +#if (VX_STRIDED_SLICE_OPT_SUPPORT) + default_value = 0; +#else + default_value = 1; +#endif + options->enable_slice_optimize = vsi_nn_getenv_asint(ENV_ENABLE_SLICE_OPTIMIZE, default_value); + options->enable_batch_opt = vsi_nn_getenv_asint(ENV_ENABLE_BATCH_OPT, 0); return VSI_SUCCESS; } diff --git a/src/tim/vx/internal/src/vsi_nn_graph.c b/src/tim/vx/internal/src/vsi_nn_graph.c index 9954d5d..ded1835 100644 --- a/src/tim/vx/internal/src/vsi_nn_graph.c +++ b/src/tim/vx/internal/src/vsi_nn_graph.c @@ -560,6 +560,692 @@ final: return status; } /* setup_node() */ +#if VX_GRAPH_BATCH_OPT_SUPPORT +static vsi_bool canBatchSplit +( + vsi_nn_node_t* node, + uint32_t inputBtachNum +) +{ + vsi_bool ret; + uint32_t i; + ret = TRUE; + + switch(node->op) + { + case VSI_NN_OP_SOFTMAX: + if (node->nn_param.softmax.axis == (int32_t)inputBtachNum - 1) + { + ret = FALSE; + } + break; + case VSI_NN_OP_LOG_SOFTMAX: + if (node->nn_param.log_softmax.axis == (int32_t)inputBtachNum - 1) + { + ret = FALSE; + } + break; + case VSI_NN_OP_LAYER_NORM: + if (node->nn_param.layernorm.axis == (int32_t)inputBtachNum - 1) + { + ret = FALSE; + } + break; + case VSI_NN_OP_REDUCE: + for (i = 0; i < node->nn_param.reduce.axis_num; i++) + { + int index = node->nn_param.reduce.axis[i]; + if (index == (int32_t)inputBtachNum - 
1) + { + ret = FALSE; + break; + } + } + break; + case VSI_NN_OP_CONCAT: + if (node->nn_param.concat.axis == inputBtachNum - 1) + { + ret = FALSE; + } + break; + case VSI_NN_OP_TENSORSTACKCONCAT: + if (node->nn_param.tensorstackconcat.axis == (int32_t)inputBtachNum - 1) + { + ret = FALSE; + } + break; + case VSI_NN_OP_STACK: + if (node->nn_param.stack.axis == inputBtachNum - 1) + { + ret = FALSE; + } + break; + case VSI_NN_OP_UNSTACK: + if (node->nn_param.unstack.axis == inputBtachNum - 1) + { + ret = FALSE; + } + break; + case VSI_NN_OP_CONCATSHIFT: + if (node->nn_param.concatshift.axis == inputBtachNum - 1) + { + ret = FALSE; + } + break; + case VSI_NN_OP_SPLIT: + if (node->nn_param.split.axis == inputBtachNum - 1) + { + ret = FALSE; + } + break; + case VSI_NN_OP_BATCH2SPACE: + case VSI_NN_OP_SPACE2BATCH: + case VSI_NN_OP_BATCH_NORM: + ret = FALSE; + break; + case VSI_NN_OP_CROP: + if (node->nn_param.crop.axis == (int32_t)inputBtachNum - 1) + { + ret = FALSE; + } + break; + case VSI_NN_OP_CUMSUM: + if (node->nn_param.cumsum.axis == (int32_t)inputBtachNum - 1) + { + ret = FALSE; + } + break; + case VSI_NN_OP_INSTANCE_NORM: + for (i = 0; i < (uint32_t)node->nn_param.instancenorm.axis_num; i++) + { + int index = node->nn_param.instancenorm.axis[i]; + if (index == (int32_t)inputBtachNum - 1) + { + ret = FALSE; + break; + } + } + break; + case VSI_NN_OP_L2NORMALIZESCALE: + if (node->nn_param.l2normalizescale.axis == (int32_t)inputBtachNum - 1) + { + ret = FALSE; + } + break; + case VSI_NN_OP_L2_NORMALIZE: + if (node->nn_param.l2_normalize.axis == (int32_t)inputBtachNum - 1) + { + ret = FALSE; + } + break; + case VSI_NN_OP_LPNORM: + if (node->nn_param.lpnorm.axis == (int32_t)inputBtachNum - 1) + { + ret = FALSE; + } + break; + case VSI_NN_OP_LRN: + if (node->nn_param.lrn.axis == (int32_t)inputBtachNum - 1) + { + ret = FALSE; + } + break; + case VSI_NN_OP_MOMENTS: + for (i = 0; i < (uint32_t)node->nn_param.moments.axis_num; i++) + { + int index = node->nn_param.moments.axis[i]; + if (index == (int32_t)inputBtachNum - 1) + { + ret = FALSE; + break; + } + } + break; + case VSI_NN_OP_REPEAT: + if (node->nn_param.repeat.axis == (int32_t)inputBtachNum - 1) + { + ret = FALSE; + } + break; + case VSI_NN_OP_GATHER: + if (node->nn_param.gather.axis == (int32_t)inputBtachNum - 1) + { + ret = FALSE; + } + break; + case VSI_NN_OP_GATHER_ELEMENTS: + if (node->nn_param.gather_elements.axis == (int32_t)inputBtachNum - 1) + { + ret = FALSE; + } + break; + case VSI_NN_OP_SCATTER_ELEMENTS: + if (node->nn_param.scatter_elements.axis == (int32_t)inputBtachNum - 1) + { + ret = FALSE; + } + break; + case VSI_NN_OP_SHUFFLECHANNEL: + if (node->nn_param.shufflechannel.axis == (int32_t)inputBtachNum - 1) + { + ret = FALSE; + } + break; + case VSI_NN_OP_TOPK: + if (node->nn_param.topk.axis == (int32_t)inputBtachNum - 1) + { + ret = FALSE; + } + break; + default: + break; + } + + return ret; +} + +static vsi_status batchInference_graph +( + vsi_nn_graph_t* graph, + vsi_nn_node_id_t* nodes_list +) +{ + vsi_size_t i, j, k; + vsi_status status; + vsi_bool ret; + vsi_nn_tensor_t* tensor = NULL; + vsi_nn_tensor_t** inputs = NULL; + vsi_nn_tensor_t** outputs = NULL; + vsi_nn_tensor_attr_t* original_inputs_attr = NULL; + vsi_nn_tensor_attr_t* original_outputs_attr = NULL; + vsi_nn_tensor_id_t* approximateConstTensor = NULL; + vsi_size_t approximateConstTensor_count = 0; + vsi_bool has_inputTensor = FALSE; + vsi_nn_node_id_t node_id; + vsi_nn_node_t* node; + vsi_size_t num_of_node_inputs = 0; + vsi_size_t batchCount = 0; + 
vsi_size_t batchNum = 1; + + vx_hardware_caps_params_t hw_param; + vx_context ctx = vxGetContext((vx_reference)graph->g); + + for (i = 0; i < graph->node_num; i++) + { + node_id = nodes_list[i]; + node = vsi_nn_GetNode(graph, node_id); + /* For NBG node, donot infer shape*/ + if (node && node->op == VSI_NN_OP_NBG) + { + status = VSI_SUCCESS; + goto final; + } + } + + memset(&hw_param, 0, sizeof(vx_hardware_caps_params_t)); + status = vxQueryHardwareCaps(ctx, &hw_param, sizeof(vx_hardware_caps_params_t)); + + /*initial tensor shape*/ + status = setup_node(graph, nodes_list); + if (VSI_SUCCESS != status) + { + goto final; + } + + status = VSI_SUCCESS; + ret = TRUE; + inputs = allocate_io_buffer(graph); + outputs = allocate_io_buffer(graph); + original_inputs_attr = (vsi_nn_tensor_attr_t*)malloc(sizeof(vsi_nn_tensor_attr_t) * graph->max_node_io); + original_outputs_attr = (vsi_nn_tensor_attr_t*)malloc(sizeof(vsi_nn_tensor_attr_t) * graph->max_node_io); + approximateConstTensor = (vsi_nn_tensor_id_t*)malloc(sizeof(vsi_nn_tensor_id_t) * graph->tensor_num); + memset(approximateConstTensor, -1, sizeof(vsi_nn_tensor_id_t) * graph->tensor_num); + + if (NULL == inputs || NULL == outputs || NULL == original_inputs_attr || NULL == original_outputs_attr) + { + VSILOGE("allocate buffer fail"); + status = VSI_FAILURE; + goto final; + } + + for (i = 0; i < graph->node_num; i++) + { + node_id = nodes_list[i]; + memset(inputs, 0, graph->max_node_io * sizeof(vsi_nn_tensor_t*)); + memset(outputs, 0, graph->max_node_io * sizeof(vsi_nn_tensor_t*)); + memset(original_inputs_attr, 0, graph->max_node_io * sizeof(vsi_nn_tensor_attr_t)); + memset(original_outputs_attr, 0, graph->max_node_io * sizeof(vsi_nn_tensor_attr_t)); + + /* Get inputs, outputs. */ + node = vsi_nn_GetNode(graph, node_id); + CHECK_PTR_FAIL_GOTO(node, "Get node fail.", final); + + vsi_nn_GetTensors(graph, node->input.tensors, + node->input.num, inputs); + vsi_nn_GetTensors(graph, node->output.tensors, + node->output.num, outputs); + batchNum = 1; + /*get input batch number*/ + has_inputTensor = FALSE; + for (j = 0; j < node->input.num; j++) + { + vx_bool is_const = FALSE; + if (inputs[j] == NULL) + { + continue; + } + memcpy(&original_inputs_attr[j], &inputs[j]->attr, sizeof(vsi_nn_tensor_attr_t)); + for (k = 0; k < approximateConstTensor_count; k++) + { + if (node->input.tensors[j] == approximateConstTensor[k]) + { + is_const = TRUE; + } + } + if (inputs[j]->attr.is_const != TRUE && is_const != TRUE) + { + has_inputTensor = TRUE; + if (batchNum < inputs[j]->attr.size[inputs[j]->attr.dim_num - 1]) + { + batchNum = inputs[j]->attr.size[inputs[j]->attr.dim_num - 1]; + } + } + } + + for (j = 0; j < node->output.num; j++) + { + if (outputs[j] == NULL) + { + continue; + } + memcpy(&original_outputs_attr[j], &outputs[j]->attr, sizeof(vsi_nn_tensor_attr_t)); + if (!has_inputTensor) + { + approximateConstTensor[approximateConstTensor_count++] = node->output.tensors[j]; + } + if (original_outputs_attr[j].dim_num < 1) + { + break; + } + } + if (j != node->output.num) + { + continue; + } + + if (batchNum > 1 && canBatchSplit(node, original_inputs_attr[0].dim_num)) + { + vsi_size_t iterator_list_index = 0; + vsi_size_t list_index = 0; + vsi_size_t* iterator_list = (vsi_size_t*)malloc(sizeof(vsi_size_t) * (batchNum + 1)); + memset(iterator_list, 0, sizeof(uint32_t) * (batchNum + 1)); + + if (((vsi_nn_node_prv_t*)node)->split_num > 0) + {/*user defined batch count*/ + iterator_list[iterator_list_index++] = ((vsi_nn_node_prv_t*)node)->split_num; + if 
(((vsi_nn_node_prv_t*)node)->split_num == 1) + {/*if user set split_num = 1, there is no need to batch split.*/ + continue; + } + } + /*iterate through each vaild batch count*/ + for (batchCount = batchNum; batchCount > 1; batchCount--) + { + + /*for some node with big batch num, should limit to max core count.*/ + if (batchCount > (hw_param.coreCount == 0?24 : hw_param.coreCount)) + { + continue; + } + if (batchNum % batchCount != 0) + { + continue; + } + iterator_list[iterator_list_index++] = batchCount; + } + + /*iterate through each vaild batch count*/ + for (list_index = 0; list_index < iterator_list_index; list_index++) + { + batchCount = iterator_list[list_index]; + + /*set node input batch*/ + num_of_node_inputs = node->input.num; + for (k = 0; k < num_of_node_inputs; k++) + { + tensor = inputs[k]; + if (tensor) + { + vx_bool is_const = FALSE; + uint32_t index = 0; + for (index = 0; index < approximateConstTensor_count; index++) + { + if (node->input.tensors[k] == approximateConstTensor[index]) + { + is_const = TRUE; + } + } + if (is_const != TRUE && tensor->attr.is_const != TRUE) + { + if (original_inputs_attr[k].size[tensor->attr.dim_num - 1] / batchCount < 1 + || original_inputs_attr[k].size[tensor->attr.dim_num - 1] % batchCount != 0) + { + break; + } + else + { + tensor->attr.size[tensor->attr.dim_num - 1] = + original_inputs_attr[k].size[tensor->attr.dim_num - 1] / batchCount; + } + } + } + } + if (k != num_of_node_inputs) + { + continue; + } + + /*reset output tensor size, dim_num and other parameter, + if not, it will affect vsi_nn_OpGenerateTensor*/ + for (j = 0; j < node->output.num; j++) + { + if (outputs[j] == NULL) + { + continue; + } + outputs[j]->attr.dim_num = VSI_NN_DIM_AUTO; + for (k = 0; k < VSI_NN_MAX_DIM_NUM; k++) + { + outputs[j]->attr.size[k] = 0; + } + } + if (node->internal_node_wksp != NULL) + { + vsi_nn_internal_init_node_wksp(node); + } + + /*node shape inference: */ + if (vsi_nn_OpCheck(node->op, node, inputs, outputs)) + { + vsi_nn_print_node_io(graph, node, 0x01); + ret = vsi_nn_OpGenerateTensor(node, inputs, outputs); + if (ret != TRUE) + { + VSILOGD("Cannot split node[%u] %s on input_batch_count=%u", + node_id, vsi_nn_OpGetName(node->op), batchCount); + continue; + } + vsi_nn_print_node_io(graph, node, 0x02); + + /*check if the node can be splited on batch*/ + for (j = 0; j < node->output.num; j++) + { + if (outputs[j] == NULL) + { + continue; + } + + tensor = outputs[j]; + /*can be splited if the batch dim size of the output shape is changed.*/ + if (tensor->attr.size[tensor->attr.dim_num - 1] == + original_outputs_attr[j].size[original_outputs_attr[j].dim_num - 1]) + { + VSILOGD("Cannot split node[%u] %s on input_batch_count=%u", + node_id, + vsi_nn_OpGetName(node->op), + batchCount); + break; + } + } + + if (j == node->output.num ) + { + /*save the verified batch count*/ + ((vsi_nn_node_prv_t*)node)->split_num = batchCount; + break; + } + } + else + { + VSILOGD("Cannot split node[%u] %s on input_batch_count=%u", + node_id, + vsi_nn_OpGetName(node->op), + batchCount); + continue; + } + } + + /*restore node input batch number*/ + num_of_node_inputs = node->input.num; + for (k = 0; k < num_of_node_inputs; k++) + { + tensor = inputs[k]; + if (tensor) + { + tensor->attr.size[tensor->attr.dim_num - 1] = + original_inputs_attr[k].size[tensor->attr.dim_num - 1] ; + } + } + + /*reset the output tensors*/ + for (j = 0; j < node->output.num; j++) + { + if (outputs[j] == NULL) + { + continue; + } + outputs[j]->attr.dim_num = VSI_NN_DIM_AUTO; + for (k = 0; k 
< VSI_NN_MAX_DIM_NUM; k++) + { + outputs[j]->attr.size[k] = 0; + } + } + if (node->internal_node_wksp != NULL) + { + vsi_nn_internal_init_node_wksp(node); + } + + /*restore node output shape*/ + if (vsi_nn_OpCheck(node->op, node, inputs, outputs)) + { + ret = vsi_nn_OpGenerateTensor(node, inputs, outputs); + } + } + } + + final: + for (i = 0; i < graph->node_num; i++) + { + node_id = nodes_list[i]; + node = vsi_nn_GetNode(graph, node_id); + if (node == NULL || node->op == VSI_NN_OP_NBG) + { + break; + } + + vsi_nn_GetTensors(graph, node->input.tensors, + node->input.num, inputs); + vsi_nn_GetTensors(graph, node->output.tensors, + node->output.num, outputs); + for (j = 0; j < node->output.num; j++) + { + if (outputs[j] == NULL) + { + continue; + } + /*reset attr->size*/ + outputs[j]->attr.dim_num = VSI_NN_DIM_AUTO; + for (k = 0; k < VSI_NN_MAX_DIM_NUM; k++) + { + outputs[j]->attr.size[k] = 0; + } + } + if (node->internal_node_wksp != NULL) + { + vsi_nn_internal_init_node_wksp(node); + } + } + + free_io_buffer(inputs); + free_io_buffer(outputs); + + if (original_inputs_attr != NULL) + { + free(original_inputs_attr); + } + if (original_outputs_attr != NULL) + { + free(original_outputs_attr); + } + if (approximateConstTensor != NULL) + { + free(approximateConstTensor); + } + + return status; +} /* batchInference_graph() */ + +static vsi_status update_vxnode_batchNum +( + vsi_nn_graph_t* graph, + vsi_nn_node_id_t* node_list +) +{ + uint32_t i, j; + vsi_status status; + vsi_nn_node_id_t node_id; + vsi_nn_node_t* node; + vsi_nn_internal_node_t* inode; + + status = VSI_SUCCESS; + for (i = 0; i < graph->node_num; i++) + { + node_id = node_list[i]; + node = vsi_nn_GetNode(graph, node_id); + CHECK_PTR_FAIL_GOTO(node, "Get node fail.", final); + if (node->n != NULL) + { + vxSetNodeBatch(node->n, (uint32_t)((vsi_nn_node_prv_t*)node)->split_num); + if (((vsi_nn_node_prv_t*)node)->split_num > 1) + { + VSILOGD("split node[%u] %s to %ds on batch dim", + node_id, + vsi_nn_OpGetName(node->op), + ((vsi_nn_node_prv_t*)node)->split_num); + } + } + + for (j = 1; j < 100; j++) + { + inode = vsi_nn_internal_get_node_by_uid(node, j); + if (inode == NULL) + { + break; + } + else + { + if (inode->node->n != NULL) + { + vxSetNodeBatch(inode->node->n, (uint32_t)((vsi_nn_node_prv_t*)node)->split_num); + } + } + } + + } + + final: + return status; +} /* update_vxnode_batchNum() */ +#endif + +vsi_status vsi_nn_InferShape +( + vsi_nn_graph_t* graph +) +{ + uint32_t i, j, k; + vsi_status status; + vsi_nn_tensor_t** outputs = NULL; + vsi_nn_node_t* node; + vsi_nn_node_id_t* nodes_list = NULL; + status = VSI_SUCCESS; + + for (i = 0; i < graph->node_num; i++) + { + node = vsi_nn_GetNode(graph, i); + /* For NBG node, donot infer shape*/ + if (node && node->op == VSI_NN_OP_NBG) + { + status = VSI_FAILURE; + goto final; + } + } + + outputs = allocate_io_buffer(graph); + if (NULL == outputs) + { + VSILOGE("allocate buffer fail"); + status = VSI_FAILURE; + goto final; + } + + /*reset all nodes' output shape*/ + for (i = 0; i < graph->node_num; i++) + { + memset(outputs, 0, graph->max_node_io * sizeof(vsi_nn_tensor_t*)); + node = vsi_nn_GetNode(graph, i); + CHECK_PTR_FAIL_GOTO(node, "Get node fail.", final); + + vsi_nn_GetTensors(graph, node->output.tensors, + node->output.num, outputs); + CHECK_PTR_FAIL_GOTO(outputs, "Get node's output fail.", final); + for (j = 0; j < node->output.num; j++) + { + if (outputs[j] == NULL) + { + continue; + } + /*reset attr->size*/ + outputs[j]->attr.dim_num = VSI_NN_DIM_AUTO; + for (k = 0; k < 
VSI_NN_MAX_DIM_NUM; k++) + { + outputs[j]->attr.size[k] = 0; + } + } + if (node->internal_node_wksp != NULL) + { + vsi_nn_internal_init_node_wksp(node); + } + } + + /*setup nodes.*/ + nodes_list = (vsi_nn_node_id_t*)malloc( + graph->node_num * sizeof(vsi_nn_node_id_t)); + if (!nodes_list) + { + goto final; + } + for (i = 0; i < graph->node_num; i++) + { + nodes_list[i] = i; + } + + status = setup_node(graph, nodes_list); + if (VSI_SUCCESS != status) + { + goto final; + } + + final: + free_io_buffer(outputs); + if (NULL != nodes_list) + { + free(nodes_list); + } + + return status; +} + static vsi_status set_graph_precision ( vsi_nn_graph_t * graph, @@ -809,6 +1495,18 @@ vsi_status vsi_nn_SetupGraph goto final; } +#if VX_GRAPH_BATCH_OPT_SUPPORT + if (graph->ctx->options.enable_batch_opt) + { + /*processing batch splitting*/ + status = batchInference_graph(graph, nodes_list); + if (VSI_SUCCESS != status) + { + goto final; + } + } +#endif + /* Preprocess node and tensor */ status = setup_node( graph, nodes_list ); if(VSI_SUCCESS != status) @@ -838,6 +1536,14 @@ vsi_status vsi_nn_SetupGraph goto final; } +#if VX_GRAPH_BATCH_OPT_SUPPORT + /* update vxnode's batch_count */ + status = update_vxnode_batchNum(graph, nodes_list); + if (VSI_SUCCESS != status) + { + goto final; + } +#endif /* set precision again to make sure any tensor created by compute_node have correct precesion infor*/ status = set_graph_precision(graph, nodes_list); if(VSI_SUCCESS != status) @@ -1011,7 +1717,8 @@ static vsi_nn_tensor_id_t _add_tensor vsi_nn_graph_t * graph, vsi_nn_tensor_id_t id, vsi_nn_tensor_attr_t * attr, - uint8_t * data + uint8_t * data, + int8_t is_from_axisram ) { vsi_nn_tensor_t * tensor; @@ -1043,11 +1750,26 @@ static vsi_nn_tensor_id_t _add_tensor } else if( NULL != data ) { - tensor = vsi_nn_CreateTensorFromData( graph, data, attr ); + if (TRUE == is_from_axisram) + { + VSILOGE("Can't create a tensor from AXI-SRAM with data."); + } + else + { + tensor = vsi_nn_CreateTensorFromData( graph, data, attr ); + } } else { - tensor = vsi_nn_CreateTensor( graph, attr ); + if (TRUE == is_from_axisram) + { + tensor = vsi_nn_CreateTensorFromAXISRAM(graph, attr); + } + else + { + tensor = vsi_nn_CreateTensor(graph, attr); + } + } if( NULL != tensor ) @@ -1071,7 +1793,7 @@ vsi_nn_tensor_id_t vsi_nn_AddTensor ) { attr->is_created_from_handle = FALSE; - return _add_tensor(graph, id, attr, data); + return _add_tensor(graph, id, attr, data, FALSE); } /* vsi_nn_AddTensor() */ vsi_nn_tensor_id_t vsi_nn_AddTensorFromHandle @@ -1083,7 +1805,7 @@ vsi_nn_tensor_id_t vsi_nn_AddTensorFromHandle ) { attr->is_created_from_handle = TRUE; - return _add_tensor(graph, id, attr, data); + return _add_tensor(graph, id, attr, data, FALSE); } vsi_nn_tensor_id_t vsi_nn_AddTensorFromView @@ -1116,7 +1838,7 @@ vsi_nn_tensor_id_t vsi_nn_AddTensorFromView { attr.size[i] = end[i] - start[i]; } - id = _add_tensor(graph, VSI_NN_TENSOR_ID_AUTO, &attr, NULL); + id = _add_tensor(graph, VSI_NN_TENSOR_ID_AUTO, &attr, NULL, FALSE); if (VSI_NN_TENSOR_ID_NA == id) { VSILOGE("Create view tensor failed, new tensor could not be created."); @@ -1150,6 +1872,16 @@ final: return id; } +vsi_nn_tensor_id_t vsi_nn_AddTensorFromAXISRAM + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_id_t id, + vsi_nn_tensor_attr_t * attr + ) +{ + return _add_tensor(graph, id, attr, NULL, TRUE); +} /* vsi_nn_AddTensorFromAXISRAM() */ + vsi_nn_tensor_id_t vsi_nn_AttachTensorToGraph ( vsi_nn_graph_t * graph, diff --git a/src/tim/vx/internal/src/vsi_nn_log.c 
b/src/tim/vx/internal/src/vsi_nn_log.c index 25d421b..f617359 100644 --- a/src/tim/vx/internal/src/vsi_nn_log.c +++ b/src/tim/vx/internal/src/vsi_nn_log.c @@ -29,37 +29,11 @@ #include "vsi_nn_log.h" #include "vsi_nn_types.h" -#ifdef __ANDROID__ -#if ANDROID_SDK_VERSION >= 30 +#if (defined(__ANDROID__)) && (ANDROID_SDK_VERSION >= 30) static const char* ENV_LOG_LEVEL = "vendor.VSI_NN_LOG_LEVEL"; #else static const char* ENV_LOG_LEVEL = "VSI_NN_LOG_LEVEL"; #endif -#else -static const char* ENV_LOG_LEVEL = "VSI_NN_LOG_LEVEL"; -#endif - -int get_env_as_int(const char* env, int default_value) { - int value = default_value; - #ifdef __ANDROID__ - { - char value_str[100]; - int status = __system_property_get(env, value_str); - if (status) { - value = atoi(value_str); - } - } - #else - { - char* env_s = vsi_nn_getenv(env); - if (env_s) { - value = atoi(env_s); - } - } - #endif - - return value; -} static vsi_bool _check_log_level ( @@ -70,7 +44,7 @@ static vsi_bool _check_log_level if(env_level == VSI_NN_LOG_UNINIT) { - env_level = (vsi_nn_log_level_e)get_env_as_int(ENV_LOG_LEVEL, VSI_NN_LOG_WARN); + env_level = (vsi_nn_log_level_e)vsi_nn_getenv_asint(ENV_LOG_LEVEL, VSI_NN_LOG_WARN); } if(env_level >= level) diff --git a/src/tim/vx/internal/src/vsi_nn_node.c b/src/tim/vx/internal/src/vsi_nn_node.c index 641888e..a284a27 100644 --- a/src/tim/vx/internal/src/vsi_nn_node.c +++ b/src/tim/vx/internal/src/vsi_nn_node.c @@ -212,6 +212,7 @@ void vsi_nn_PrintNode } count += temp; } + count --; temp = snprintf( &buf[count], _MAX_PRINT_BUF_SZ - count, "%s", " ], [out:" ); if ( temp >= _MAX_PRINT_BUF_SZ - count || temp == -1 ) @@ -224,7 +225,7 @@ void vsi_nn_PrintNode { /* -3 means reserve memory for ending symbols --" ]" */ temp = snprintf( &buf[count], _MAX_PRINT_BUF_SZ - count - 3, - " %d,", node->input.tensors[i] ); + " %d,", node->output.tensors[i] ); if ( temp >= _MAX_PRINT_BUF_SZ - count - 3 || temp == -1 ) { is_out_of_bound = TRUE; @@ -232,6 +233,7 @@ void vsi_nn_PrintNode } count += temp; } + count --; count += snprintf( &buf[count], _MAX_PRINT_BUF_SZ - count, "%s", " ]" ); final: @@ -243,6 +245,26 @@ final: VSILOGI( "(%16s)node[%u] %s [%08x]", vsi_nn_OpGetName(node->op), id, buf, node->n ); } /* vsi_nn_PrintNode() */ +#if VX_GRAPH_BATCH_OPT_SUPPORT +vsi_status vsi_nn_SetNodeBatchSplitNum +( + vsi_nn_node_t* node, + int8_t split_num +) +{ + vsi_status status = VSI_SUCCESS; + if (node == NULL || split_num < 1) + { + status = VSI_FAILURE; + goto final; + } + ((vsi_nn_node_prv_t*)node)->split_num = split_num; + + final: + return status; +} +#endif + vsi_status vsi_nn_update_node_attr ( vsi_nn_node_t *node diff --git a/src/tim/vx/internal/src/vsi_nn_node_attr_template.c b/src/tim/vx/internal/src/vsi_nn_node_attr_template.c index ca565da..4a9caea 100644 --- a/src/tim/vx/internal/src/vsi_nn_node_attr_template.c +++ b/src/tim/vx/internal/src/vsi_nn_node_attr_template.c @@ -207,6 +207,7 @@ static _node_template s_template[] = /* LPNORM */ NULL, /* RESIZE_3D */ NULL, /* REDUCEL2 */ NULL, + /* CROP_AND_RESIZE */ NULL, }; //_compiler_assert( _cnt_of_array(s_template) == VSI_NN_OP_NUM, vsi_nn_node_attr_template_c ); diff --git a/src/tim/vx/internal/src/vsi_nn_pre_post_process.c b/src/tim/vx/internal/src/vsi_nn_pre_post_process.c index 3a9ac63..c6e9daa 100644 --- a/src/tim/vx/internal/src/vsi_nn_pre_post_process.c +++ b/src/tim/vx/internal/src/vsi_nn_pre_post_process.c @@ -89,7 +89,9 @@ static void _create_multi_norm_tensors multi_input_tensors[2] = vsi_nn_AddTensor(graph, VSI_NN_TENSOR_ID_AUTO, &uv_input_attr, 
NULL); } else if (*source_format == VSI_NN_SOURCE_FORMAT_IMAGE_NV12 || - *source_format == VSI_NN_SOURCE_FORMAT_IMAGE_NV21) + *source_format == VSI_NN_SOURCE_FORMAT_IMAGE_NV21 || + *source_format == VSI_NN_SOURCE_FORMAT_IMAGE_NV12_RGGB || + *source_format == VSI_NN_SOURCE_FORMAT_IMAGE_NV21_BGGR) { uv_input_attr = *input_attr; uv_input_attr.size[0] = w; @@ -445,7 +447,9 @@ static void _get_org_graph_inputs i += 2 ; } else if(nodes[0]->nn_param.pre_process.type == VSI_NN_SOURCE_FORMAT_IMAGE_NV12 || - nodes[0]->nn_param.pre_process.type == VSI_NN_SOURCE_FORMAT_IMAGE_NV21 ) + nodes[0]->nn_param.pre_process.type == VSI_NN_SOURCE_FORMAT_IMAGE_NV21 || + nodes[0]->nn_param.pre_process.type == VSI_NN_SOURCE_FORMAT_IMAGE_NV12_RGGB || + nodes[0]->nn_param.pre_process.type == VSI_NN_SOURCE_FORMAT_IMAGE_NV21_BGGR) { i += 1; } @@ -558,7 +562,9 @@ vsi_status vsi_nn_add_single_preproc_node node_input_num = 3; } else if (*source_format == VSI_NN_SOURCE_FORMAT_IMAGE_NV12 || - *source_format == VSI_NN_SOURCE_FORMAT_IMAGE_NV21) + *source_format == VSI_NN_SOURCE_FORMAT_IMAGE_NV21 || + *source_format == VSI_NN_SOURCE_FORMAT_IMAGE_NV12_RGGB || + *source_format == VSI_NN_SOURCE_FORMAT_IMAGE_NV21_BGGR) { node_input_num = 2; } @@ -607,7 +613,9 @@ vsi_status vsi_nn_add_single_preproc_node *source_format == VSI_NN_SOURCE_FORMAT_IMAGE_NV12 || *source_format == VSI_NN_SOURCE_FORMAT_IMAGE_NV21 || *source_format == VSI_NN_SOURCE_FORMAT_IMAGE_YUV444 || - *source_format == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR_SEP) + *source_format == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR_SEP || + *source_format == VSI_NN_SOURCE_FORMAT_IMAGE_NV12_RGGB || + *source_format == VSI_NN_SOURCE_FORMAT_IMAGE_NV21_BGGR) { _create_multi_norm_tensors(graph, &input_attr, source_layout, source_format, preproc_inputs); } diff --git a/src/tim/vx/internal/src/vsi_nn_tensor.c b/src/tim/vx/internal/src/vsi_nn_tensor.c index a333d42..d44ecf8 100644 --- a/src/tim/vx/internal/src/vsi_nn_tensor.c +++ b/src/tim/vx/internal/src/vsi_nn_tensor.c @@ -66,7 +66,8 @@ static vsi_nn_tensor_t * _create_tensor ( vsi_nn_graph_t * graph, uint8_t * data, - vsi_nn_tensor_attr_t * attr + vsi_nn_tensor_attr_t * attr, + int8_t is_from_axisram ); static vsi_size_t get_tensor_elements_num @@ -568,6 +569,16 @@ static vsi_bool _init_tensor { tensor->t = vxCreateTensor2( graph->ctx->c, ¶ms, sizeof( vx_tensor_create_params_t ) ); +#ifdef VSI_CREATE_TENSOR_FROM_AXISRAM_SUPPORT + if (TRUE == _get_tensor_is_from_axisram((vsi_nn_tensor_prv_t*)tensor)) + { + vx_enum pool_type = VX_VIV_MEM_POOL_TYPE_AXI_SRAM; + vxSetTensorAttribute(tensor->t, + VX_TENSOR_MEMORY_POOL_TYPE, + &pool_type, + sizeof(vx_enum)); + } +#endif } else { @@ -596,16 +607,21 @@ static vsi_bool _init_tensor if( !tensor->attr.vtl && !tensor->attr.is_const ) { //norm tensor need to fill initial value - if( ( !tensor->attr.is_created_from_handle ) || tensor->attr.is_handle_malloc_by_ovxlib ) +#ifdef VSI_CREATE_TENSOR_FROM_AXISRAM_SUPPORT + if (TRUE != _get_tensor_is_from_axisram((vsi_nn_tensor_prv_t*)tensor)) +#endif { - vsi_nn_FillTensorWithValue( graph, tensor, 0.0f ); - if(tensor->attr.is_created_from_handle) + if( ( !tensor->attr.is_created_from_handle ) || tensor->attr.is_handle_malloc_by_ovxlib) { - vsi_status status = vxFlushHandle( (vx_reference)tensor->t ); - if (VSI_SUCCESS != status) + vsi_nn_FillTensorWithValue( graph, tensor, 0.0f ); + if(tensor->attr.is_created_from_handle) { - ret = FALSE; - goto final; + vsi_status status = vxFlushHandle( (vx_reference)tensor->t ); + if (VSI_SUCCESS != status) + { + ret 
= FALSE; + goto final; + } } } } @@ -654,7 +670,8 @@ static vsi_nn_tensor_t * _create_tensor ( vsi_nn_graph_t * graph, uint8_t * data, - vsi_nn_tensor_attr_t * attr + vsi_nn_tensor_attr_t * attr, + int8_t is_from_axisram ) { vsi_nn_tensor_prv_t * tensor; @@ -673,6 +690,10 @@ static vsi_nn_tensor_t * _create_tensor memset( tensor, 0, sizeof( vsi_nn_tensor_prv_t ) ); memcpy( &tensor->pot.attr, attr, sizeof( vsi_nn_tensor_attr_t ) ); tensor->pot.is_swapped = FALSE; + if (TRUE == is_from_axisram) + { + tensor->is_from_axisram = is_from_axisram; + } if( attr->dim_num != VSI_NN_DIM_AUTO ) { _init_tensor( graph, &tensor->pot, data); @@ -694,7 +715,7 @@ vsi_nn_tensor_t * vsi_nn_CreateTensor ) { attr->is_created_from_handle = FALSE; - return _create_tensor(graph, NULL, attr); + return _create_tensor(graph, NULL, attr, FALSE); } /* vsi_nn_CreateTensor() */ vsi_nn_tensor_t * vsi_nn_CreateTensorFromHandle @@ -727,7 +748,7 @@ vsi_nn_tensor_t * vsi_nn_CreateTensorFromHandle } else { - ptensor = _create_tensor(graph, data, attr); + ptensor = _create_tensor(graph, data, attr, FALSE); } final: @@ -3115,6 +3136,39 @@ vsi_status _set_tensor_is_scalar return status; } +int8_t _get_tensor_is_from_axisram + ( + vsi_nn_tensor_prv_t* tensor + ) +{ + int8_t is_from_axisram = FALSE; + if (NULL == tensor) { + VSILOGE("To get is_scalar, tensor pointer SHOULD NOT be NULL."); + goto final; + } + is_from_axisram = tensor->is_from_axisram; + +final: + return is_from_axisram; +} + +vsi_status _set_tensor_is_from_axisram + ( + vsi_nn_tensor_prv_t* tensor, + int8_t is_from_axisram + ) +{ + vsi_status status = VSI_SUCCESS; + if (NULL == tensor) { + status = VSI_FAILURE; + goto final; + } + tensor->is_from_axisram = is_from_axisram; + +final: + return status; +} + static vsi_bool _init_dummy_tensor ( vsi_nn_graph_t * graph, @@ -3314,3 +3368,106 @@ vsi_nn_tensor_t * vsi_nn_create_dummy_tensor attr->is_created_from_handle = FALSE; return _create_dummy_tensor(graph, attr); } /* vsi_nn_create_dummy_tensor() */ + +vsi_status vsi_nn_MapTensorPatch + ( + vsi_nn_graph_t* graph, + vsi_nn_tensor_t* tensor, + void** ptr, + vsi_nn_accessor_type_e usage + ) +{ + vsi_status status = VSI_FAILURE; +#ifdef VSI_MAP_TENSOR_PATCH_SUPPORT + size_t dim, i; + vsi_size_t tem_stride[VSI_NN_MAX_DIM_NUM]; + vx_size start[VSI_NN_MAX_DIM_NUM], end[VSI_NN_MAX_DIM_NUM], + stride[VSI_NN_MAX_DIM_NUM]; + vx_map_id map_id = 0; + + if (NULL == graph || NULL == tensor || NULL == ptr) + { + VSILOGE("Invalid parameter"); + return status; + } + if (TRUE == tensor->attr.vtl) + { + VSILOGE("Can not access a virtual tensor."); + return status; + } + vsi_nn_GetStrideSize(&tensor->attr, tem_stride); + + memset(start, 0, sizeof(vx_size) * VSI_NN_MAX_DIM_NUM); + dim = (size_t)tensor->attr.dim_num; + for (i = 0; i < dim; i++) + { + end[i] = (size_t)tensor->attr.size[i]; + stride[i] = (size_t)tem_stride[i]; + } + + status = vxMapTensorPatch(tensor->t,dim,start,end, + &map_id,stride,ptr,usage, VX_MEMORY_TYPE_HOST); + ((vsi_nn_tensor_prv_t*)tensor)->map_id = map_id; +#else + VSI_UNREFERENCED(graph); + VSI_UNREFERENCED(tensor); + VSI_UNREFERENCED(ptr); + VSI_UNREFERENCED(usage); + VSILOGE("Function unspported, please upgrade OpenVX driver to 1.3.0!"); +#endif + return status; +} /* vsi_nn_MapTensorPatch() */ + +vsi_status vsi_nn_UnmapTensorPatch + ( + vsi_nn_graph_t* graph, + vsi_nn_tensor_t* tensor + ) +{ + vsi_status status = VSI_FAILURE; +#ifdef VSI_MAP_TENSOR_PATCH_SUPPORT + vx_map_id map_id = 0; + + if (NULL == graph || NULL == tensor) + { + VSILOGE("Invalid parameter"); 
+ return status; + } + if (TRUE == tensor->attr.vtl) + { + VSILOGE("Can not access a virtual tensor."); + return status; + } + + map_id = ((vsi_nn_tensor_prv_t*)tensor)->map_id; + status = vxUnmapTensorPatch(tensor->t, map_id); +#else + VSI_UNREFERENCED(graph); + VSI_UNREFERENCED(tensor); + VSILOGE("Function unspported, please upgrade OpenVX driver to 1.3.0!"); +#endif + return status; +} /* vsi_nn_UnmapTensorPatch() */ + +vsi_nn_tensor_t * vsi_nn_CreateTensorFromAXISRAM + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_attr_t * attr + ) +{ + + if (NULL == graph || NULL == attr) { + VSILOGE("Invalid parameter"); + return NULL; + } + if (TRUE == attr->vtl) { + VSILOGE("Can not create tensor from AXI-SRAM for a virtual tensor."); + return NULL; + } +#ifdef VSI_CREATE_TENSOR_FROM_AXISRAM_SUPPORT + attr->is_created_from_handle = FALSE; + return _create_tensor(graph, NULL, attr, TRUE); +#else + return NULL; +#endif +} /*vsi_nn_CreateTensorFromAXISRAM*/ diff --git a/src/tim/vx/internal/src/vsi_nn_tensor_util_prv.h b/src/tim/vx/internal/src/vsi_nn_tensor_util_prv.h index 1937569..c041c65 100644 --- a/src/tim/vx/internal/src/vsi_nn_tensor_util_prv.h +++ b/src/tim/vx/internal/src/vsi_nn_tensor_util_prv.h @@ -75,6 +75,17 @@ vsi_status _set_tensor_is_scalar int8_t is_salar ); +int8_t _get_tensor_is_from_axisram + ( + vsi_nn_tensor_prv_t* tensor + ); + +vsi_status _set_tensor_is_from_axisram + ( + vsi_nn_tensor_prv_t* tensor, + int8_t is_from_axisram + ); + /** * Create a new dummy tensor * Create a new dummy tensor with given attributes. @@ -107,6 +118,15 @@ vsi_bool vsi_nn_is_same_quant_type( vsi_nn_tensor_t * dst ); +vsi_nn_tensor_t * vsi_nn_kernel_insert_reshape_node + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * in_tensor, + vsi_size_t * shape, + uint32_t dim_num, + vsi_nn_opt_direction_e direction + ); + #ifdef __cplusplus } #endif diff --git a/src/tim/vx/internal/src/vsi_nn_types_prv.h b/src/tim/vx/internal/src/vsi_nn_types_prv.h index 81b1d36..00b55fd 100644 --- a/src/tim/vx/internal/src/vsi_nn_types_prv.h +++ b/src/tim/vx/internal/src/vsi_nn_types_prv.h @@ -73,6 +73,10 @@ typedef struct _vsi_nn_node_prv int8_t processed; // Add node internal attribute here... +#if VX_GRAPH_BATCH_OPT_SUPPORT + /*split the node to "split_num" on batch dim.*/ + vsi_size_t split_num; +#endif } vsi_nn_node_prv_t; /** @@ -95,6 +99,14 @@ typedef struct _vsi_nn_tensor_prv * be done more than once */ int8_t processed; + /** For mapping tensor patch. + * map_id The address of a vx_map_id variable where the function returns a map identifier. + */ + vx_map_id map_id; + + /** create tensor from axisram.*/ + int8_t is_from_axisram; + // Add tensor internal attribute here... } vsi_nn_tensor_prv_t;
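
For reference, a minimal caller-side sketch of two helpers introduced by this patch: `vsi_nn_getenv_asint()` (added in vsi_nn_util.c) and `vsi_nn_SetNodeBatchSplitNum()` (added in vsi_nn_node.c, compiled only when VX_GRAPH_BATCH_OPT_SUPPORT is enabled). The wrapper function, the environment-variable name "VSI_NN_EXAMPLE_SPLIT", and the include set are illustrative assumptions, not part of ovxlib or of this patch.

#include "vsi_nn_pub.h"          /* assumed to pull in vsi_nn_node.h */
#include "utils/vsi_nn_util.h"   /* declares vsi_nn_getenv_asint() */

/* Illustrative helper: read a batch-split hint from a made-up environment
 * variable and store it on the node. vsi_nn_SetupGraph() later verifies the
 * hint in batchInference_graph() and applies it through vxSetNodeBatch(). */
static void hint_batch_split(vsi_nn_node_t *node)
{
    /* Falls back to 1 ("do not split") when the variable is unset; on Android
     * builds the helper queries the system property store instead of getenv(). */
    int32_t split = vsi_nn_getenv_asint("VSI_NN_EXAMPLE_SPLIT", 1);

#if VX_GRAPH_BATCH_OPT_SUPPORT
    if (node != NULL && split >= 1)
    {
        (void)vsi_nn_SetNodeBatchSplitNum(node, (int8_t)split);
    }
#else
    (void)node;
    (void)split;
#endif
}

A hint of 1 matches the check in batchInference_graph(), which skips batch splitting entirely when split_num == 1; larger values are treated as a user-defined batch count and validated before being applied.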