From ed162d017650c6571b0f80b56ead6fdb7ff33595 Mon Sep 17 00:00:00 2001 From: Feiyue Chen Date: Tue, 18 Oct 2022 16:55:15 +0800 Subject: [PATCH] Update internal for 22Q3 release update internal to commit-id: e2b0fde631fce349e0e3ad42b2a4d40ce7634a97 Type: Code Improvement Signed-off-by: Feiyue Chen --- src/tim/vx/internal/include/interface/ops.def | 4 + .../internal/include/kernel/vsi_nn_kernel.h | 11 + .../kernel/vsi_nn_kernel_gpu_shape_optimize.h | 6 + .../include/kernel/vsi_nn_kernel_lut.h | 4 + .../vsi_nn_op_bidirectional_sequence_rnn.h | 26 +- .../include/ops/vsi_nn_op_bucketize.h | 48 + .../internal/include/ops/vsi_nn_op_conv1d.h | 1 + .../internal/include/ops/vsi_nn_op_conv2d.h | 15 + .../internal/include/ops/vsi_nn_op_conv3d.h | 1 + .../internal/include/ops/vsi_nn_op_deconv3d.h | 1 + .../include/ops/vsi_nn_op_depthwise_conv1d.h | 1 + .../include/ops/vsi_nn_op_grouped_conv1d.h | 1 + .../include/ops/vsi_nn_op_grouped_conv2d.h | 1 + .../include/ops/vsi_nn_op_l2normalizescale.h | 2 +- .../include/ops/vsi_nn_op_layernormalize.h | 10 +- .../internal/include/ops/vsi_nn_op_lppool.h | 46 + .../vx/internal/include/ops/vsi_nn_op_pad.h | 7 - .../ops/vsi_nn_op_pre_process_yuv422.h | 81 + .../vx/internal/include/ops/vsi_nn_op_rnn.h | 9 +- .../include/ops/vsi_nn_op_scatter_elements.h | 49 + .../vsi_nn_op_unidirectional_sequence_rnn.h | 8 +- .../include/utils/vsi_nn_dtype_util_prv.h | 4 +- .../vx/internal/include/utils/vsi_nn_util.h | 10 + .../vx/internal/include/vip/virtual_device.h | 18 +- src/tim/vx/internal/include/vsi_nn_context.h | 1 + src/tim/vx/internal/include/vsi_nn_graph.h | 14 + .../vx/internal/include/vsi_nn_node_type.h | 11 +- .../include/vsi_nn_pre_post_process.h | 2 + src/tim/vx/internal/include/vsi_nn_types.h | 22 + src/tim/vx/internal/include/vsi_nn_version.h | 2 +- .../src/kernel/cl/batchnorm_single_cl.c | 2 +- .../vx/internal/src/kernel/cl/bucketize_cl.c | 303 + src/tim/vx/internal/src/kernel/cl/gather_cl.c | 10 + src/tim/vx/internal/src/kernel/cl/lppool_cl.c | 332 ++ .../vx/internal/src/kernel/cl/maximum_cl.c | 2 +- .../vx/internal/src/kernel/cl/minimum_cl.c | 2 +- .../vx/internal/src/kernel/cl/roi_align_cl.c | 28 +- .../src/kernel/cl/scatter_elements_cl.c | 351 ++ .../internal/src/kernel/cpu/bucketize_cpu.c | 229 + .../vx/internal/src/kernel/cpu/lppool_cpu.c | 264 + .../vx/internal/src/kernel/cpu/maximum_cpu.c | 16 +- .../vx/internal/src/kernel/cpu/minimum_cpu.c | 16 +- .../src/kernel/cpu/pre_process_yuv422_cpu.c | 405 ++ .../internal/src/kernel/cpu/roi_align_cpu.c | 89 +- .../src/kernel/cpu/scatter_elements_cpu.c | 258 + .../internal/src/kernel/evis/bucketize_evis.c | 323 ++ .../kernel/evis/grucell_activation_z_h_evis.c | 2 +- .../kernel/evis/instance_normalization_evis.c | 428 +- .../internal/src/kernel/evis/matrixmul_evis.c | 39 + .../src/kernel/evis/pre_process_nv12_evis.c | 148 +- .../src/kernel/evis/pre_process_yuv420_evis.c | 129 +- .../src/kernel/evis/pre_process_yuv422_evis.c | 623 ++ .../src/kernel/evis/resize_1d_nearest_evis.c | 2 +- .../src/kernel/evis/resize_bilinear_evis.c | 40 +- .../src/kernel/evis/resize_nearest_evis.c | 2 +- .../vx/internal/src/kernel/evis/select_evis.c | 64 +- .../vx/internal/src/kernel/evis/tile_evis.c | 2 + .../src/kernel/sp/layer_norm_y_direction_sp.c | 797 +++ .../src/kernel/sp/softmax_z_direction_sp.c | 938 +++ .../vx/internal/src/kernel/vsi_nn_kernel.c | 60 +- .../kernel/vsi_nn_kernel_gpu_shape_optimize.c | 59 +- .../internal/src/kernel/vsi_nn_kernel_lut.c | 34 + .../vx/internal/src/kernel/vx/convolutional.c | 23 +- 
.../internal/src/kernel/vx/eltwise_unary_vx.c | 80 + src/tim/vx/internal/src/kernel/vx/pad2_vx.c | 11 +- .../internal/src/libnnext/ops/cl/bucketize.cl | 281 + .../vx/internal/src/libnnext/ops/cl/lppool.cl | 115 + .../internal/src/libnnext/ops/cl/maximum.cl | 4 +- .../internal/src/libnnext/ops/cl/minimum.cl | 4 +- .../internal/src/libnnext/ops/cl/roi_align.cl | 141 +- .../src/libnnext/ops/cl/scatter_elements.cl | 298 + .../libnnext/ops/cl/scatter_elements_add.cl | 292 + .../libnnext/ops/cl/scatter_elements_mul.cl | 292 + .../internal/src/libnnext/ops/vx/bucketize.vx | 176 + .../libnnext/ops/vx/group_normalization_1.vx | 2 +- .../ops/vx/instance_normalization_0.vx | 141 +- .../ops/vx/instance_normalization_1.vx | 75 +- .../ops/vx/instance_normalization_2.vx | 73 +- .../ops/vx/instance_normalization_3.vx | 78 +- .../libnnext/ops/vx/l2normalizescale_axis0.vx | 4 +- .../src/libnnext/ops/vx/matrixmul_i16.vx | 141 + .../libnnext/ops/vx/pre_process_nv12_copy.vx | 86 + .../libnnext/ops/vx/pre_process_nv12_scale.vx | 318 +- .../ops/vx/pre_process_nv12_scale_8bits.vx | 197 - .../ops/vx/pre_process_nv12_scale_mix.vx | 162 - .../ops/vx/pre_process_yuv420_copy.vx | 238 + .../ops/vx/pre_process_yuv420_copy_u8.vx | 240 - .../ops/vx/pre_process_yuv420_scale_0.vx | 237 + .../ops/vx/pre_process_yuv420_scale_1.vx | 245 + .../ops/vx/pre_process_yuv420_scale_fp16.vx | 232 - .../ops/vx/pre_process_yuv420_scale_i16.vx | 227 - .../ops/vx/pre_process_yuv420_scale_i8.vx | 227 - .../ops/vx/pre_process_yuv420_scale_u8.vx | 228 - .../ops/vx/pre_process_yuv422_copy.vx | 88 + .../ops/vx/pre_process_yuv422_scale.vx | 132 + .../vx/internal/src/libnnext/ops/vx/select.vx | 95 +- .../src/libnnext/vsi_nn_libnnext_resource.c | 5019 ++++++++++------- .../internal/src/libnnext/vsi_nn_vxkernel.c | 2 +- src/tim/vx/internal/src/makefile.linux | 293 +- .../internal/src/ops/vsi_nn_op_batch2space.c | 93 +- .../vsi_nn_op_bidirectional_sequence_rnn.c | 129 +- .../vx/internal/src/ops/vsi_nn_op_bucketize.c | 208 + src/tim/vx/internal/src/ops/vsi_nn_op_clip.c | 23 +- .../vx/internal/src/ops/vsi_nn_op_conv1d.c | 2 + .../vx/internal/src/ops/vsi_nn_op_conv2d.c | 366 +- .../src/ops/vsi_nn_op_conv2d_lstm_cell.c | 2 + .../vx/internal/src/ops/vsi_nn_op_conv3d.c | 1 + .../internal/src/ops/vsi_nn_op_dataconvert.c | 5 + .../src/ops/vsi_nn_op_deconvolution1d.c | 11 +- .../src/ops/vsi_nn_op_depthwise_conv1d.c | 2 + .../vx/internal/src/ops/vsi_nn_op_eltwise.c | 19 +- .../src/ops/vsi_nn_op_expand_broadcast.c | 6 +- .../src/ops/vsi_nn_op_grouped_conv1d.c | 1 + .../src/ops/vsi_nn_op_grouped_conv2d.c | 1 + .../src/ops/vsi_nn_op_l2normalizescale.c | 17 +- .../src/ops/vsi_nn_op_layernormalize.c | 58 +- .../vx/internal/src/ops/vsi_nn_op_lppool.c | 259 + .../src/ops/vsi_nn_op_lstmunit_activation.c | 20 +- .../src/ops/vsi_nn_op_maxpoolwithargmax.c | 8 +- src/tim/vx/internal/src/ops/vsi_nn_op_pad.c | 12 +- src/tim/vx/internal/src/ops/vsi_nn_op_pad2.c | 29 +- src/tim/vx/internal/src/ops/vsi_nn_op_pool.c | 14 +- .../internal/src/ops/vsi_nn_op_pre_process.c | 68 +- .../src/ops/vsi_nn_op_pre_process_nv12.c | 10 +- .../src/ops/vsi_nn_op_pre_process_yuv420.c | 8 +- .../src/ops/vsi_nn_op_pre_process_yuv422.c | 238 + .../vx/internal/src/ops/vsi_nn_op_reduce.c | 2 +- .../src/ops/vsi_nn_op_reduce_mean_internal.c | 2 +- src/tim/vx/internal/src/ops/vsi_nn_op_relu1.c | 21 +- src/tim/vx/internal/src/ops/vsi_nn_op_relu6.c | 22 +- src/tim/vx/internal/src/ops/vsi_nn_op_relun.c | 30 +- .../vx/internal/src/ops/vsi_nn_op_repeat.c | 10 +- .../vx/internal/src/ops/vsi_nn_op_reshape.c | 
4 +- .../vx/internal/src/ops/vsi_nn_op_resize.c | 10 + .../src/ops/vsi_nn_op_rnncell_ovxlib.c | 21 +- .../vx/internal/src/ops/vsi_nn_op_roi_pool.c | 1 + src/tim/vx/internal/src/ops/vsi_nn_op_rsqrt.c | 21 +- .../src/ops/vsi_nn_op_scatter_elements.c | 171 + .../vx/internal/src/ops/vsi_nn_op_select.c | 47 +- .../vx/internal/src/ops/vsi_nn_op_softrelu.c | 22 +- .../internal/src/ops/vsi_nn_op_space2batch.c | 124 +- src/tim/vx/internal/src/ops/vsi_nn_op_sqrt.c | 22 +- .../vx/internal/src/ops/vsi_nn_op_squeeze.c | 2 +- .../vsi_nn_op_unidirectional_sequence_rnn.c | 34 +- .../src/utils/vsi_nn_code_generator.c | 4 + .../src/utils/vsi_nn_constraint_check.c | 7 +- src/tim/vx/internal/src/utils/vsi_nn_dtype.c | 26 +- src/tim/vx/internal/src/utils/vsi_nn_util.c | 42 + src/tim/vx/internal/src/vsi_nn_context.c | 13 +- src/tim/vx/internal/src/vsi_nn_graph.c | 124 + .../internal/src/vsi_nn_graph_optimization.c | 14 +- .../internal/src/vsi_nn_node_attr_template.c | 2 + src/tim/vx/internal/src/vsi_nn_tensor.c | 49 +- 153 files changed, 14300 insertions(+), 5067 deletions(-) create mode 100644 src/tim/vx/internal/include/ops/vsi_nn_op_bucketize.h create mode 100644 src/tim/vx/internal/include/ops/vsi_nn_op_lppool.h create mode 100644 src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv422.h create mode 100644 src/tim/vx/internal/include/ops/vsi_nn_op_scatter_elements.h create mode 100644 src/tim/vx/internal/src/kernel/cl/bucketize_cl.c create mode 100644 src/tim/vx/internal/src/kernel/cl/lppool_cl.c create mode 100644 src/tim/vx/internal/src/kernel/cl/scatter_elements_cl.c create mode 100644 src/tim/vx/internal/src/kernel/cpu/bucketize_cpu.c create mode 100644 src/tim/vx/internal/src/kernel/cpu/lppool_cpu.c create mode 100644 src/tim/vx/internal/src/kernel/cpu/pre_process_yuv422_cpu.c create mode 100644 src/tim/vx/internal/src/kernel/cpu/scatter_elements_cpu.c create mode 100644 src/tim/vx/internal/src/kernel/evis/bucketize_evis.c create mode 100644 src/tim/vx/internal/src/kernel/evis/pre_process_yuv422_evis.c create mode 100644 src/tim/vx/internal/src/kernel/sp/layer_norm_y_direction_sp.c create mode 100644 src/tim/vx/internal/src/kernel/sp/softmax_z_direction_sp.c create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/bucketize.cl create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/lppool.cl create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/scatter_elements.cl create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/scatter_elements_add.cl create mode 100644 src/tim/vx/internal/src/libnnext/ops/cl/scatter_elements_mul.cl create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/bucketize.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_copy.vx delete mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale_8bits.vx delete mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale_mix.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_copy.vx delete mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_copy_u8.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_0.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_1.vx delete mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_fp16.vx delete mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_i16.vx delete mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_i8.vx delete mode 
100644 src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_u8.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_copy.vx create mode 100644 src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_scale.vx create mode 100644 src/tim/vx/internal/src/ops/vsi_nn_op_bucketize.c create mode 100644 src/tim/vx/internal/src/ops/vsi_nn_op_lppool.c create mode 100644 src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv422.c create mode 100644 src/tim/vx/internal/src/ops/vsi_nn_op_scatter_elements.c diff --git a/src/tim/vx/internal/include/interface/ops.def b/src/tim/vx/internal/include/interface/ops.def index ae52716..045eb95 100644 --- a/src/tim/vx/internal/include/interface/ops.def +++ b/src/tim/vx/internal/include/interface/ops.def @@ -179,3 +179,7 @@ DEF_OP(SOFTSIGN) DEF_OP(CUMSUM) DEF_OP(MAXPOOLWITHARGMAX) DEF_OP(MOD) +DEF_OP(LPPOOL) +DEF_OP(SCATTER_ELEMENTS) +DEF_OP(PRE_PROCESS_YUV422) +DEF_OP(BUCKETIZE) diff --git a/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h b/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h index 7d75720..d2c4e58 100644 --- a/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h +++ b/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h @@ -326,9 +326,20 @@ const void * vsi_nn_kernel_param_get_const_buffer } \ static vsi_status NAME##_impl +#define DEF_SP_KERNEL_BASE_CALLBACK( NAME ) \ + static vsi_status NAME##_impl( vsi_nn_kernel_node_t node); \ + static vx_status VX_CALLBACK NAME( \ + vx_node node) {\ + return (vx_status)NAME##_impl( \ + (vsi_nn_kernel_node_t)node); \ + } \ + static vsi_status NAME##_impl + + #define DEF_KERNEL_INITIALIZER( NAME ) DEF_KERNEL_BASE_CALLBACK( NAME ) #define DEF_KERNEL_EXECUTOR( NAME ) DEF_KERNEL_BASE_CALLBACK( NAME ) #define DEF_KERNEL_DEINITIALIZER( NAME ) DEF_KERNEL_BASE_CALLBACK( NAME ) +#define DEF_SP_KERNEL_QUERY( NAME ) DEF_SP_KERNEL_BASE_CALLBACK( NAME ) void vsi_nn_kernel_backend_register ( diff --git a/src/tim/vx/internal/include/kernel/vsi_nn_kernel_gpu_shape_optimize.h b/src/tim/vx/internal/include/kernel/vsi_nn_kernel_gpu_shape_optimize.h index 26a676f..cfecfd1 100644 --- a/src/tim/vx/internal/include/kernel/vsi_nn_kernel_gpu_shape_optimize.h +++ b/src/tim/vx/internal/include/kernel/vsi_nn_kernel_gpu_shape_optimize.h @@ -85,4 +85,10 @@ vsi_bool vsi_nn_kernel_optimize_group_norm_shape int32_t is_sp_kernel, vsi_size_t* out_shape ); +vsi_bool vsi_nn_kernel_optimize_scatter_elements_shape + ( + const vsi_size_t* shape_x, const vsi_size_t rank_x, const int32_t axis, + vsi_size_t* out_shape_x, uint32_t* out_rank_x, int32_t* out_axis, vsi_size_t max_size + ); + #endif diff --git a/src/tim/vx/internal/include/kernel/vsi_nn_kernel_lut.h b/src/tim/vx/internal/include/kernel/vsi_nn_kernel_lut.h index c872cca..f413b81 100644 --- a/src/tim/vx/internal/include/kernel/vsi_nn_kernel_lut.h +++ b/src/tim/vx/internal/include/kernel/vsi_nn_kernel_lut.h @@ -48,6 +48,10 @@ typedef int32_t vsi_nn_kernel_lut_act_e; enum VSI_NN_KERNEL_LUT_CELU = 14, VSI_NN_KERNEL_LUT_RCP = 15, VSI_NN_KERNEL_LUT_SOFTSIGN = 16, + VSI_NN_KERNEL_LUT_LINEAR_EXP = 17, + VSI_NN_KERNEL_LUT_LINEAR_RSQRT = 18, + VSI_NN_KERNEL_LUT_LINEAR_SIGMOID = 19, + }; #define VSI_NN_KERNEL_LUT_MAX_SIZE (1024) diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_bidirectional_sequence_rnn.h b/src/tim/vx/internal/include/ops/vsi_nn_op_bidirectional_sequence_rnn.h index 2bf8c77..82aa777 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_bidirectional_sequence_rnn.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_bidirectional_sequence_rnn.h 
@@ -38,22 +38,26 @@ enum BI_RNN_FW_INPUT_WEIGHT_I = 1, BI_RNN_FW_INPUT_WEIGHT_H = 2, - BI_RNN_FW_INPUT_BIAS = 3, - BI_RNN_FW_INPUT_H_STATE = 4, + BI_RNN_FW_INPUT_BIAS_I = 3, + BI_RNN_FW_INPUT_BIAS_H = 4, + BI_RNN_FW_INPUT_H_STATE = 5, - BI_RNN_BW_INPUT_WEIGHT_I = 5, - BI_RNN_BW_INPUT_WEIGHT_H = 6, - BI_RNN_BW_INPUT_BIAS = 7, - BI_RNN_BW_INPUT_H_STATE = 8, + BI_RNN_BW_INPUT_WEIGHT_I = 6, + BI_RNN_BW_INPUT_WEIGHT_H = 7, + BI_RNN_BW_INPUT_BIAS_I = 8, + BI_RNN_BW_INPUT_BIAS_H = 9, + BI_RNN_BW_INPUT_H_STATE = 10, - BI_RNN_AUX_INPUT = 9, - BI_RNN_FW_AUX_INPUT_WEIGHT = 10, - BI_RNN_BW_AUX_INPUT_WEIGHT = 11, + BI_RNN_AUX_INPUT = 11, + BI_RNN_FW_AUX_INPUT_WEIGHT = 12, + BI_RNN_BW_AUX_INPUT_WEIGHT = 13, BI_RNN_INPUT_CNT, - BI_RNN_FW_OUTPUT_OUTPUT = 0, - BI_RNN_BW_OUTPUT_OUTPUT = 1, + BI_RNN_FW_OUTPUT_H_STATE = 0, + BI_RNN_BW_OUTPUT_H_STATE = 1, + BI_RNN_FW_OUTPUT_OUTPUT = 2, + BI_RNN_BW_OUTPUT_OUTPUT = 3, BI_RNN_OUTPUT_CNT }; diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_bucketize.h b/src/tim/vx/internal/include/ops/vsi_nn_op_bucketize.h new file mode 100644 index 0000000..501b117 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_bucketize.h @@ -0,0 +1,48 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_BUCKETIZE_H +#define _VSI_NN_OP_BUCKETIZE_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_bucketize_param +{ + struct _bucketize_local_data_t* local; + // Add parameters here + vsi_bool right; +} vsi_nn_bucketize_param; +_compiler_assert(offsetof(vsi_nn_bucketize_param, local) == 0, \ + vsi_nn_bucketize_h ); + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_conv1d.h b/src/tim/vx/internal/include/ops/vsi_nn_op_conv1d.h index 5fa5041..504d984 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_conv1d.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_conv1d.h @@ -54,6 +54,7 @@ typedef struct _vsi_nn_conv1d_param uint32_t group; uint32_t dilation; int32_t multiplier; + vsi_nn_pad_mode_e pad_mode; } vsi_nn_conv1d_param; _compiler_assert(offsetof(vsi_nn_conv1d_param, local) == 0, \ vsi_nn_vsi_nn_conv1d_h ); diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_conv2d.h b/src/tim/vx/internal/include/ops/vsi_nn_op_conv2d.h index 282c988..55f882b 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_conv2d.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_conv2d.h @@ -30,6 +30,20 @@ extern "C" { #endif +typedef struct _vsi_nn_conv2d_param_deprecate +{ + uint32_t ksize[2]; + uint32_t stride[2]; + /* Pad left, right, top, bottom */ + uint32_t pad[4]; + /* Pad type default value shall be AUTO */ + vsi_nn_pad_e pad_type; + uint32_t weights; + uint32_t group; + uint32_t dilation[2]; + int32_t multiplier; +} vsi_nn_conv2d_param_deprecate; + typedef struct _vsi_nn_conv2d_param { uint32_t ksize[2]; @@ -42,6 +56,7 @@ typedef struct _vsi_nn_conv2d_param uint32_t group; uint32_t dilation[2]; int32_t multiplier; + vsi_nn_pad_mode_e pad_mode; } vsi_nn_conv2d_param; #ifdef __cplusplus diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_conv3d.h b/src/tim/vx/internal/include/ops/vsi_nn_op_conv3d.h index bf8bf2b..590eaa4 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_conv3d.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_conv3d.h @@ -47,6 +47,7 @@ typedef struct _vsi_nn_conv3d_param int32_t weights; int32_t multiplier; + vsi_nn_pad_mode_e pad_mode; } vsi_nn_conv3d_param; _compiler_assert(offsetof(vsi_nn_conv3d_param, local) == 0, \ vsi_nn_conv3d_h ); diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_deconv3d.h b/src/tim/vx/internal/include/ops/vsi_nn_op_deconv3d.h index 133267f..923fa7f 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_deconv3d.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_deconv3d.h @@ -43,6 +43,7 @@ typedef struct _vsi_nn_deconv3d_param uint32_t weights; uint32_t group; uint32_t output_padding[3]; + vsi_nn_pad_mode_e pad_mode; } vsi_nn_deconv3d_param; _compiler_assert(offsetof(vsi_nn_deconv3d_param, local) == 0, \ vsi_nn_deconv3d_h ); diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_depthwise_conv1d.h b/src/tim/vx/internal/include/ops/vsi_nn_op_depthwise_conv1d.h index 7f7f66f..f3d03a7 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_depthwise_conv1d.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_depthwise_conv1d.h @@ -36,6 +36,7 @@ typedef struct _vsi_nn_depthwise_conv1d_param uint32_t pad[2]; uint32_t dilation; int32_t multiplier; + vsi_nn_pad_mode_e pad_mode; } vsi_nn_depthwise_conv1d_param; __END_DECLS diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_grouped_conv1d.h b/src/tim/vx/internal/include/ops/vsi_nn_op_grouped_conv1d.h index 
fa571e9..d23c10b 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_grouped_conv1d.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_grouped_conv1d.h @@ -51,6 +51,7 @@ typedef struct _vsi_nn_grouped_conv1d_param uint32_t group; uint32_t dilation; int32_t multiplier; + vsi_nn_pad_mode_e pad_mode; } vsi_nn_grouped_conv1d_param; #ifdef __cplusplus diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_grouped_conv2d.h b/src/tim/vx/internal/include/ops/vsi_nn_op_grouped_conv2d.h index 59858c0..f78b8ea 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_grouped_conv2d.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_grouped_conv2d.h @@ -43,6 +43,7 @@ typedef struct _vsi_nn_grouped_conv2d_param uint32_t dilation[2]; int32_t multiplier; void* local; + vsi_nn_pad_mode_e pad_mode; } vsi_nn_grouped_conv2d_param; #ifdef __cplusplus diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_l2normalizescale.h b/src/tim/vx/internal/include/ops/vsi_nn_op_l2normalizescale.h index b15ee4e..e6fe704 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_l2normalizescale.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_l2normalizescale.h @@ -86,7 +86,7 @@ typedef struct _vsi_nn_l2normalizescale_lcl_data { vx_tensor local_tensor[_VSI_NN_L2NORMALIZESCALE_LOCAL_TENSOR_NUM]; uint32_t hash_idx; - vsi_bool execute_on_sw; + vsi_bool use_internal_node; } vsi_nn_l2normalizescale_lcl_data; typedef struct _vsi_nn_l2normalizescale_param diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_layernormalize.h b/src/tim/vx/internal/include/ops/vsi_nn_op_layernormalize.h index 91501bb..cef6647 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_layernormalize.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_layernormalize.h @@ -35,14 +35,20 @@ extern "C" { typedef struct _vsi_nn_layernorm_lcl_data { - vx_tensor local_tensor[_VSI_NN_LAYERNORM_LOCAL_TENSOR_NUM]; + vsi_bool use_internal_node; } vsi_nn_layernorm_lcl_data; typedef struct _vsi_nn_layernormalize_param { /* local data must be the first. */ - vsi_nn_layernorm_lcl_data local; + union + { + vx_tensor local_tensor[_VSI_NN_LAYERNORM_LOCAL_TENSOR_NUM]; + vsi_nn_layernorm_lcl_data *local; + }; + float eps; + int32_t axis; } vsi_nn_layernormalize_param; #ifdef __cplusplus diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_lppool.h b/src/tim/vx/internal/include/ops/vsi_nn_op_lppool.h new file mode 100644 index 0000000..84a8f95 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_lppool.h @@ -0,0 +1,46 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_LPPOOL_H +#define _VSI_NN_OP_LPPOOL_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_lppool_param { + vsi_nn_pad_e pad_type; + uint32_t ksize[2]; + int32_t p; + uint32_t pad[4]; + uint32_t stride[2]; +} vsi_nn_lppool_param; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pad.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pad.h index 7e7d5d1..91ef2c4 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_pad.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pad.h @@ -30,13 +30,6 @@ extern "C" { #endif -typedef enum { - VSI_NN_PAD_MODE_CONSTANT, - VSI_NN_PAD_MODE_REPLICATE, - VSI_NN_PAD_MODE_SYMMETRIC, - VSI_NN_PAD_MODE_REFLECT, -}vsi_nn_pad_mode_e; - typedef struct _vsi_nn_pad_param { const uint32_t * front_size; diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv422.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv422.h new file mode 100644 index 0000000..b516e60 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv422.h @@ -0,0 +1,81 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_PRE_PROCESS_YUV422_H +#define _VSI_NN_OP_PRE_PROCESS_YUV422_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define _VSI_NN_PRE_PROCESS_YUV422_LOCAL_TENSOR_NUM 2 + +typedef struct _vsi_nn_pre_process_yuv422_lcl_data +{ + int32_t scale_x; + int32_t scale_y; + vsi_bool enable_copy; + vsi_bool enable_perm; + vx_tensor local_tensor[_VSI_NN_PRE_PROCESS_YUV422_LOCAL_TENSOR_NUM]; +} vsi_nn_pre_process_yuv422_lcl_data; + +typedef struct _vsi_nn_pre_process_yuv422_param +{ + vsi_nn_pre_process_yuv422_lcl_data* local; + + vsi_nn_yuv_type yuv422_type; + + struct + { + uint32_t left; + uint32_t top; + uint32_t width; + uint32_t height; + } rect; + + struct + { + vsi_size_t *size; + uint32_t dim_num; + } output_attr; + + uint32_t * perm; + uint32_t dim_num; + + float r_mean; + float g_mean; + float b_mean; + float rgb_scale; + + vsi_bool reverse_channel; +} vsi_nn_pre_process_yuv422_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_rnn.h b/src/tim/vx/internal/include/ops/vsi_nn_op_rnn.h index 0083c78..3e50d0a 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_rnn.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_rnn.h @@ -37,11 +37,12 @@ enum RNNCELL_INPUT_INPUT = 0, RNNCELL_INPUT_WEIGHT_I = 1, RNNCELL_INPUT_WEIGHT_H = 2, - RNNCELL_INPUT_BIAS = 3, - RNNCELL_INPUT_H_STATE = 4, + RNNCELL_INPUT_BIAS_I = 3, + RNNCELL_INPUT_BIAS_H = 4, + RNNCELL_INPUT_H_STATE = 5, - RNNCELL_INPUT_AUX_INPUT = 5, - RNNCELL_INPUT_AUX_WEIGHT = 6, + RNNCELL_INPUT_AUX_INPUT = 6, + RNNCELL_INPUT_AUX_WEIGHT = 7, RNNCELL_INPUT_CNT, RNNCELL_OUTPUT_H_STATE = 0, diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_scatter_elements.h b/src/tim/vx/internal/include/ops/vsi_nn_op_scatter_elements.h new file mode 100644 index 0000000..c12fc85 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_scatter_elements.h @@ -0,0 +1,49 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_SCATTER_ELEMENTS_H +#define _VSI_NN_OP_SCATTER_ELEMENTS_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_scatter_elements_param +{ + struct _scatter_elements_local_data_t* local; + // Add parameters here + int32_t axis; + vsi_nn_reduction_type_e reduction; +} vsi_nn_scatter_elements_param; +_compiler_assert(offsetof(vsi_nn_scatter_elements_param, local) == 0, \ + vsi_nn_scatter_elements_h ); + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_unidirectional_sequence_rnn.h b/src/tim/vx/internal/include/ops/vsi_nn_op_unidirectional_sequence_rnn.h index 985fe22..bf87649 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_unidirectional_sequence_rnn.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_unidirectional_sequence_rnn.h @@ -37,11 +37,13 @@ enum RNN_INPUT_INPUT = 0, RNN_INPUT_WEIGHT_I = 1, RNN_INPUT_WEIGHT_H = 2, - RNN_INPUT_BIAS = 3, - RNN_INPUT_H_STATE = 4, + RNN_INPUT_BIAS_I = 3, + RNN_INPUT_BIAS_H = 4, + RNN_INPUT_H_STATE = 5, RNN_INPUT_CNT, - RNN_OUTPUT_OUTPUT = 0, + RNN_OUTPUT_H_STATE = 0, + RNN_OUTPUT_OUTPUT = 1, RNN_OUTPUT_CNT }; diff --git a/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h b/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h index 4e19fc0..7eaec28 100644 --- a/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h +++ b/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h @@ -253,11 +253,11 @@ static VSI_INLINE_API int32_t fp32_to_dfp type_get_range( type, &max_range, &min_range ); if( fl > 0 ) { - data = (int32_t)vsi_rint( in * (float)( (int64_t)1 << fl ) ); + data = (int32_t)vsi_rint( in * (double)( (int64_t)1 << fl ) ); } else { - data = (int32_t)vsi_rint( in * ( 1.0f / (float)( (int64_t)1 << -fl ) ) ); + data = (int32_t)vsi_rint( in * ( 1.0f / (double)( (int64_t)1 << -fl ) ) ); } data = vsi_nn_min( data, (int32_t)max_range ); data = vsi_nn_max( data, (int32_t)min_range ); diff --git a/src/tim/vx/internal/include/utils/vsi_nn_util.h b/src/tim/vx/internal/include/utils/vsi_nn_util.h index 77b3cb6..f939592 100644 --- a/src/tim/vx/internal/include/utils/vsi_nn_util.h +++ b/src/tim/vx/internal/include/utils/vsi_nn_util.h @@ -468,6 +468,16 @@ FILE* vsi_nn_fopen const char * mode ); +int32_t vsi_nn_get_vx_pad_mode + ( + vsi_nn_pad_mode_e mode + ); + +vsi_bool vsi_nn_is_3d_tensor + ( + vsi_nn_tensor_t * tensor + ); + #ifdef __cplusplus } #endif diff --git a/src/tim/vx/internal/include/vip/virtual_device.h b/src/tim/vx/internal/include/vip/virtual_device.h index a314a86..a91ef83 100644 --- a/src/tim/vx/internal/include/vip/virtual_device.h +++ b/src/tim/vx/internal/include/vip/virtual_device.h @@ -27,6 +27,8 @@ #include #include +#include "vsi_nn_pub.h" + struct _vsi_nn_graph; typedef struct _vsi_nn_graph vsi_nn_graph_t; @@ -38,13 +40,13 @@ using data_t = const void*; class IDevice { public: - IDevice(uint32_t id); - ~IDevice(); - uint32_t Id() const; - bool GraphSubmit(vsi_nn_graph_t* graph, func_t func, data_t data); - bool GraphRemove(const vsi_nn_graph_t* graph); - bool ThreadExit(); - void WaitThreadIdle(); + OVXLIB_API IDevice(uint32_t id); + OVXLIB_API ~IDevice(); + OVXLIB_API uint32_t Id() const; + OVXLIB_API bool GraphSubmit(vsi_nn_graph_t* graph, func_t func, data_t data); + OVXLIB_API bool GraphRemove(const vsi_nn_graph_t* graph); + OVXLIB_API bool ThreadExit(); + OVXLIB_API void WaitThreadIdle(); protected: Device* device_; @@ 
-52,4 +54,4 @@ class IDevice { } // namespace vip -#endif \ No newline at end of file +#endif diff --git a/src/tim/vx/internal/include/vsi_nn_context.h b/src/tim/vx/internal/include/vsi_nn_context.h index 95591ca..f5ace92 100644 --- a/src/tim/vx/internal/include/vsi_nn_context.h +++ b/src/tim/vx/internal/include/vsi_nn_context.h @@ -76,6 +76,7 @@ typedef struct _vsi_nn_runtime_option_t int32_t enable_opcheck; int32_t enable_concat_optimize; int32_t enable_asymi8_to_u8; + int32_t enable_dataconvert_optimize; } vsi_nn_runtime_option_t; /** diff --git a/src/tim/vx/internal/include/vsi_nn_graph.h b/src/tim/vx/internal/include/vsi_nn_graph.h index dda35b7..c9c0687 100644 --- a/src/tim/vx/internal/include/vsi_nn_graph.h +++ b/src/tim/vx/internal/include/vsi_nn_graph.h @@ -751,6 +751,20 @@ OVXLIB_API vsi_bool vsi_nn_IsGraphFastMode ( const vsi_nn_graph_t* graph ); + +OVXLIB_API vsi_status vsi_nn_CopyTensorViaGraphs + ( + vsi_nn_graph_t *src_graph, + vsi_nn_tensor_id_t src_tensor_id, + vsi_nn_graph_t *dst_graph, + vsi_nn_tensor_id_t dst_tensor_id + ); + +OVXLIB_API vsi_status vsi_nn_ExecuteGraphLoop + ( + vsi_nn_graph_t* graph, + vsi_nn_tensor_t *max_iteration_tensor + ); #ifdef __cplusplus } #endif diff --git a/src/tim/vx/internal/include/vsi_nn_node_type.h b/src/tim/vx/internal/include/vsi_nn_node_type.h index 5c170df..d41e0f0 100644 --- a/src/tim/vx/internal/include/vsi_nn_node_type.h +++ b/src/tim/vx/internal/include/vsi_nn_node_type.h @@ -196,6 +196,10 @@ #include "ops/vsi_nn_op_softsign.h" #include "ops/vsi_nn_op_cumsum.h" #include "ops/vsi_nn_op_mod.h" +#include "ops/vsi_nn_op_lppool.h" +#include "ops/vsi_nn_op_scatter_elements.h" +#include "ops/vsi_nn_op_pre_process_yuv422.h" +#include "ops/vsi_nn_op_bucketize.h" /* custom node head define define */ #include "custom/vsi_nn_custom_node_type.h" @@ -206,9 +210,10 @@ extern "C"{ /** Operation attributes */ typedef union _vsi_nn_nn_param { + vsi_nn_conv2d_param conv2d; struct { - vsi_nn_conv2d_param conv2d; + vsi_nn_conv2d_param_deprecate conv2d_deprecate; vsi_nn_pool_param pool; }; vsi_nn_fcl_param fcl; @@ -377,6 +382,10 @@ typedef union _vsi_nn_nn_param vsi_nn_softsign_param softsign; vsi_nn_cumsum_param cumsum; vsi_nn_mod_param mod; + vsi_nn_lppool_param lppool; + vsi_nn_scatter_elements_param scatter_elements; + vsi_nn_pre_process_yuv422_param pre_process_yuv422; + vsi_nn_bucketize_param bucketize; void* client_param; /* custom node data struct define */ diff --git a/src/tim/vx/internal/include/vsi_nn_pre_post_process.h b/src/tim/vx/internal/include/vsi_nn_pre_post_process.h index 5cc2a3e..5da4b82 100644 --- a/src/tim/vx/internal/include/vsi_nn_pre_post_process.h +++ b/src/tim/vx/internal/include/vsi_nn_pre_post_process.h @@ -85,6 +85,8 @@ typedef enum VSI_NN_SOURCE_FORMAT_IMAGE_YUV444, VSI_NN_SOURCE_FORMAT_IMAGE_NV12, VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR_SEP, + VSI_NN_SOURCE_FORMAT_IMAGE_YUYV422, + VSI_NN_SOURCE_FORMAT_IMAGE_UYVY422, } vsi_nn_preprocess_source_format_e; /** diff --git a/src/tim/vx/internal/include/vsi_nn_types.h b/src/tim/vx/internal/include/vsi_nn_types.h index 076f493..0a655c1 100644 --- a/src/tim/vx/internal/include/vsi_nn_types.h +++ b/src/tim/vx/internal/include/vsi_nn_types.h @@ -111,6 +111,22 @@ typedef enum VSI_NN_PAD_SAME } vsi_nn_pad_e; +/** reduce type enum */ +typedef enum +{ + VSI_NN_REDUCTION_TYPE_NONE, + VSI_NN_REDUCTION_TYPE_ADD, + VSI_NN_REDUCTION_TYPE_MUL +} vsi_nn_reduction_type_e; + +/** Pad mode enum */ +typedef enum { + VSI_NN_PAD_MODE_CONSTANT, + VSI_NN_PAD_MODE_REPLICATE, + 
VSI_NN_PAD_MODE_SYMMETRIC, + VSI_NN_PAD_MODE_REFLECT, +} vsi_nn_pad_mode_e; + /** * @deprecated Platform enum * @see vsi_nn_dim_fmt_e @@ -235,6 +251,12 @@ typedef enum _vsi_nn_con2d_lstm_dataformat CONV2D_LSTM_CHANNELS_FIRST } vsi_nn_con2d_lstm_dataformat; +typedef enum _vsi_nn_yuv_type +{ + VSI_NN_YUV_TYPE_YUYV422, + VSI_NN_YUV_TYPE_UYUV422 +}vsi_nn_yuv_type; + /** Deprecated */ typedef uint32_t vsi_nn_size_t; diff --git a/src/tim/vx/internal/include/vsi_nn_version.h b/src/tim/vx/internal/include/vsi_nn_version.h index 711c498..5079bfe 100644 --- a/src/tim/vx/internal/include/vsi_nn_version.h +++ b/src/tim/vx/internal/include/vsi_nn_version.h @@ -33,7 +33,7 @@ extern "C"{ #define VSI_NN_VERSION_MAJOR 1 #define VSI_NN_VERSION_MINOR 1 -#define VSI_NN_VERSION_PATCH 50 +#define VSI_NN_VERSION_PATCH 57 #define VSI_NN_VERSION \ (VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH) diff --git a/src/tim/vx/internal/src/kernel/cl/batchnorm_single_cl.c b/src/tim/vx/internal/src/kernel/cl/batchnorm_single_cl.c index 31a5223..c62f0b4 100644 --- a/src/tim/vx/internal/src/kernel/cl/batchnorm_single_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/batchnorm_single_cl.c @@ -56,7 +56,7 @@ __BEGIN_DECLS VSI_NN_GEN_BATCH_NORM_KERNEL_SOURCE_NAME }, #define HASH_BATCH_NORM_SH_KERNEL_2D_NAME( SRC_TYPE, DST_TYPE) \ - CVIVANTE_NAMESPACE("batch_norm_"#SRC_TYPE"to"#DST_TYPE"_2D") + CVIVANTE_NAMESPACE("cl.batch_norm_"#SRC_TYPE"to"#DST_TYPE"_2D") #define TENSOR_BATCH_NORM_KERNELS_2D( SRC_TYPE, OUT_TYPE) \ { HASH_BATCH_NORM_KEY( SRC_TYPE, OUT_TYPE, 1), \ diff --git a/src/tim/vx/internal/src/kernel/cl/bucketize_cl.c b/src/tim/vx/internal/src/kernel/cl/bucketize_cl.c new file mode 100644 index 0000000..e20cb1b --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/bucketize_cl.c @@ -0,0 +1,303 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +typedef enum +{ + INTERNAL_KERNEL_BUCKETIZE, +} _internal_kernel_e; + +#define STR(a) #a + +// Add kernel hashtable here +#define BUCKETIZE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, RIGHT, IMG_2D ) \ + (( IN0_DTYPE ) | ( IN1_DTYPE << 8 ) | ( OUT_DTYPE << 16 ) | (RIGHT << 24) | (IMG_2D << 25)) + +#define PACK_KERNEL_3D_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, RIGHT, IMG_2D ) \ + { BUCKETIZE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, RIGHT, IMG_2D ), \ + CVIVANTE_NAMESPACE("cl.bucketize_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)), \ + "bucketize" } +#define PACK_KERNEL_2D_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, RIGHT, IMG_2D ) \ + { BUCKETIZE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, RIGHT, IMG_2D ), \ + CVIVANTE_NAMESPACE("cl.bucketize_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_2D"), \ + "bucketize" } +#define PACK_KERNEL_RIGHT_3D_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, RIGHT, IMG_2D ) \ + { BUCKETIZE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, RIGHT, IMG_2D ), \ + CVIVANTE_NAMESPACE("cl.bucketize_right_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)), \ + "bucketize" } +#define PACK_KERNEL_RIGHT_2D_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, RIGHT, IMG_2D ) \ + { BUCKETIZE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, RIGHT, IMG_2D ), \ + CVIVANTE_NAMESPACE("cl.bucketize_right_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_2D"), \ + "bucketize" } + +#define PACK_KERNEL_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE ) \ + PACK_KERNEL_3D_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0, 0 ), \ + PACK_KERNEL_2D_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0, 1 ), \ + PACK_KERNEL_RIGHT_3D_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1, 0 ), \ + PACK_KERNEL_RIGHT_2D_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1, 1 ), + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _bucketize_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( F32, F32, I32 ) + PACK_KERNEL_MAP( I32, I32, I32 ) + PACK_KERNEL_MAP( U32, U32, I32 ) + PACK_KERNEL_MAP( BF16, BF16, I32 ) +}; + + +/* + * Kernel params + */ +static vx_param_description_t _bucketize_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _BUCKETIZE_PARAM_NUM _cnt_of_array( _bucketize_kernel_param_def ) +#define SCALAR_BOUNDARIES_VALUE (3) +#define SCALAR_SCALE0_VALUE (4) +#define SCALAR_TAIL0_VALUE (5) +#define SCALAR_SCALE1_VALUE (6) +#define SCALAR_TAIL1_VALUE (7) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_bucketize_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * output_attr = NULL; + vsi_size_array_t * out_shape = NULL; + + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + out_shape = output_attr->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + 
gpu_param.global_scale[2] = 1; + + gpu_param.dim = (out_shape->size < 3 || 1 == out_shape->data[2]) ? 2 : 3; + gpu_param.global_size[0] = gpu_align_p2( + (out_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (out_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1; + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(output_attr); +#undef SAFE_FREE_TENSOR_ATTR + return status; +} /* _bucketize_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t right, + vsi_bool is_img2d + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in0_dtype; + vsi_nn_kernel_dtype_e in1_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _bucketize_kernel_map; + size_t kernel_map_size = _cnt_of_array( _bucketize_kernel_map ); + vx_param_description_t * param_def = _bucketize_kernel_param_def; + vx_kernel_initialize_f initializer = _bucketize_initializer; + + uint32_t key; + uint32_t i; + + in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in1_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + +#define _PACK_SELECT_KEY( in0_dtype, in1_dtype ) \ + ( ( in0_dtype ) | ( in1_dtype << 8 )) + + switch (_PACK_SELECT_KEY(in0_dtype, in1_dtype)) + { + case _PACK_SELECT_KEY(F32, F32): + case _PACK_SELECT_KEY(F16, F16): + key = BUCKETIZE_HASH_KEY( F32, F32, out_dtype, right, is_img2d ); + break; + case _PACK_SELECT_KEY(I8, I8): + case _PACK_SELECT_KEY(I16, I16): + case _PACK_SELECT_KEY(I32, I32): + key = BUCKETIZE_HASH_KEY( I32, I32, out_dtype, right, is_img2d ); + break; + case _PACK_SELECT_KEY(U8, U8): + case _PACK_SELECT_KEY(U16, U16): + case _PACK_SELECT_KEY(U32, U32): + key = BUCKETIZE_HASH_KEY( U32, U32, out_dtype, right, is_img2d ); + break; + default: + key = BUCKETIZE_HASH_KEY( in0_dtype, in1_dtype, out_dtype, right, is_img2d ); + break; + } +#undef _PACK_SELECT_KEY + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _bucketize_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_BUCKETIZE_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + float input0_scale= vsi_nn_get_tensor_scale(inputs[0]); 
+ float input0_tail = -input0_scale * (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float input1_scale= vsi_nn_get_tensor_scale(inputs[1]); + float input1_tail = -input0_scale * (float)vsi_nn_get_tensor_zero_point(inputs[1]); + int32_t boundaries_size = (int32_t)inputs[1]->attr.size[0]; + vsi_bool image_2d = FALSE; + int32_t right = vsi_nn_kernel_param_get_int32( params, "right" ); + + if( !vsi_nn_kernel_gpu_check_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num ) || + boundaries_size >= GPU_TENSOR_MAX_WIDTH ) + { + return NULL; + } + + image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1); + + status = _query_kernel( kernel, inputs, outputs, right, image_2d ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _BUCKETIZE_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. */ + node_params[SCALAR_BOUNDARIES_VALUE] = vsi_nn_kernel_scalar_create( graph, I32, &boundaries_size ); + node_params[SCALAR_SCALE0_VALUE] = vsi_nn_kernel_scalar_create( graph, F32, &input0_scale ); + node_params[SCALAR_TAIL0_VALUE] = vsi_nn_kernel_scalar_create(graph, F32, &input0_tail ); + node_params[SCALAR_SCALE1_VALUE] = vsi_nn_kernel_scalar_create( graph, F32, &input1_scale ); + node_params[SCALAR_TAIL1_VALUE] = vsi_nn_kernel_scalar_create(graph, F32, &input1_tail ); + status = vsi_nn_kernel_node_pass_param( node, node_params, _BUCKETIZE_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_BOUNDARIES_VALUE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE0_VALUE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_TAIL0_VALUE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE1_VALUE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_TAIL1_VALUE] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( bucketize, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/gather_cl.c b/src/tim/vx/internal/src/kernel/cl/gather_cl.c index f04c62f..66eb842 100644 --- a/src/tim/vx/internal/src/kernel/cl/gather_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/gather_cl.c @@ -252,6 +252,16 @@ static vsi_status _query_kernel input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + if (input0_dtype == I8) + { + input0_dtype = I32; + } + + if (output_dtype == I8) + { + output_dtype = I32; + } + key = HASH_GATHER_KEY( input0_dtype, I32, output_dtype, 0, is_batch ); for ( i = 0; i < _cnt_of_array(gather_map); i ++ ) diff --git a/src/tim/vx/internal/src/kernel/cl/lppool_cl.c b/src/tim/vx/internal/src/kernel/cl/lppool_cl.c new file mode 100644 index 0000000..514bec0 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/lppool_cl.c @@ -0,0 +1,332 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be 
included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +typedef enum +{ + INTERNAL_KERNEL_LPPOOL, +} _internal_kernel_e; + +#define _LPPOOL_KERNEL_SOURCE_NAME "lppool" + +// Add kernel hashtable here +#define LPPOOL_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \ + (( IN_DTYPE << 8 ) | ( OUT_DTYPE )) +#define LPPOOL_KERNELS( IN_DTYPE, OUT_DTYPE ) \ + { LPPOOL_HASH_KEY( IN_DTYPE, OUT_DTYPE ), \ + CVIVANTE_NAMESPACE("cl.lppool_"#IN_DTYPE"to"#OUT_DTYPE), \ + _LPPOOL_KERNEL_SOURCE_NAME }, \ + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _lppool_kernel_map[] = +{ + // Register kernel here + LPPOOL_KERNELS( F32, F32 ) + LPPOOL_KERNELS( F32, U32 ) + LPPOOL_KERNELS( F32, I32 ) + LPPOOL_KERNELS( U32, U32 ) + LPPOOL_KERNELS( U32, F32 ) + LPPOOL_KERNELS( I32, I32 ) + LPPOOL_KERNELS( I32, F32 ) + LPPOOL_KERNELS( BF16, BF16 ) +}; + +/* + * Kernel params + */ +static vx_param_description_t _lppool_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _LPPOOL_PARAM_NUM _cnt_of_array( _lppool_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_lppool_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_status status = VSI_FAILURE; + vx_tensor output = (vx_tensor)param[1]; + vsi_nn_kernel_tensor_attr_t *output_attr = NULL; + vsi_size_array_t *output_shape = NULL; + + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); + CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); + + output_shape = output_attr->shape; + + 
gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.global_size[0] = (output_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0]; + gpu_param.global_size[1] = (output_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + gpu_param.global_size[2] = (output_shape->data[2] + gpu_param.global_scale[2] - 1) + / gpu_param.global_scale[2]; + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (output_attr) + { + vsi_nn_kernel_tensor_attr_release(&output_attr); + } + + return status; +} /* _lppool_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _lppool_kernel_map; + size_t kernel_map_size = _cnt_of_array( _lppool_kernel_map ); + vx_param_description_t * param_def = _lppool_kernel_param_def; + vx_kernel_initialize_f initializer = _lppool_initializer; + + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + +#define _PACK_SELECT_KEY( in_dtype, out_dtype ) \ + (( in_dtype ) | (out_dtype << 8 )) + switch (_PACK_SELECT_KEY(in_dtype, out_dtype)) + { + case _PACK_SELECT_KEY(F32, F32): + case _PACK_SELECT_KEY(F16, F16): + case _PACK_SELECT_KEY(F32, F16): + case _PACK_SELECT_KEY(F16, F32): + key = LPPOOL_HASH_KEY( F32, F32); + break; + case _PACK_SELECT_KEY(F32, U8): + case _PACK_SELECT_KEY(F16, U8): + key = LPPOOL_HASH_KEY( F32, U32); + break; + case _PACK_SELECT_KEY(F32, I8): + case _PACK_SELECT_KEY(F32, I16): + case _PACK_SELECT_KEY(F16, I8): + case _PACK_SELECT_KEY(F16, I16): + key = LPPOOL_HASH_KEY( F32, I32); + break; + case _PACK_SELECT_KEY(U8, U8): + key = LPPOOL_HASH_KEY( U32, U32); + break; + case _PACK_SELECT_KEY(U8, F16): + case _PACK_SELECT_KEY(U8, F32): + key = LPPOOL_HASH_KEY( U32, F32); + break; + case _PACK_SELECT_KEY(I8, I8): + case _PACK_SELECT_KEY(I8, I16): + case _PACK_SELECT_KEY(I16, I8): + case _PACK_SELECT_KEY(I16, I16): + key = LPPOOL_HASH_KEY( I32, I32); + break; + case _PACK_SELECT_KEY(I8, F16): + case _PACK_SELECT_KEY(I8, F32): + case _PACK_SELECT_KEY(I16, F16): + case _PACK_SELECT_KEY(I16, F32): + key = LPPOOL_HASH_KEY( I32, F32); + break; + default: + key = LPPOOL_HASH_KEY( in_dtype, out_dtype); + break; + } +#undef _PACK_SELECT_KEY + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _lppool_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + 
const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_LPPOOL_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + int32_t ksize_x = vsi_nn_kernel_param_get_int32(params, "ksize_x"); + int32_t ksize_y = vsi_nn_kernel_param_get_int32(params, "ksize_y"); + int32_t stride_x = vsi_nn_kernel_param_get_int32(params, "stride_x"); + int32_t stride_y = vsi_nn_kernel_param_get_int32(params, "stride_y"); + int32_t pad_left = vsi_nn_kernel_param_get_int32(params, "pad_left"); + int32_t pad_top = vsi_nn_kernel_param_get_int32(params, "pad_top"); + int32_t p = vsi_nn_kernel_param_get_int32(params, "p"); + int32_t width = (int32_t)inputs[0]->attr.size[0]; + int32_t height = (int32_t)inputs[0]->attr.size[1]; + float outputScale = vsi_nn_get_tensor_scale(outputs[0]); + float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float inputScale = vsi_nn_get_tensor_scale(inputs[0]); + float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + + if ( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, + inputs[0]->attr.dim_num ) + || !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, + outputs[0]->attr.dim_num )) + { + return NULL; + } + + outputScale = 1.0f / outputScale; + inputTail = -(inputTail * inputScale); + + status = _query_kernel( kernel, inputs, outputs ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + uint32_t index = 2; + vsi_nn_kernel_node_pack_io( node_params, _LPPOOL_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &ksize_x ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &ksize_y ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_x ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_y ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_left ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_top ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &p ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &width ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &inputScale ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &inputTail ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &outputScale ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &outputTail ); + /* Pass parameters to node. 
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _LPPOOL_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[2] ); + vsi_nn_kernel_scalar_release( &node_params[3] ); + vsi_nn_kernel_scalar_release( &node_params[4] ); + vsi_nn_kernel_scalar_release( &node_params[5] ); + vsi_nn_kernel_scalar_release( &node_params[6] ); + vsi_nn_kernel_scalar_release( &node_params[7] ); + vsi_nn_kernel_scalar_release( &node_params[8] ); + vsi_nn_kernel_scalar_release( &node_params[9] ); + vsi_nn_kernel_scalar_release( &node_params[10] ); + vsi_nn_kernel_scalar_release( &node_params[11] ); + vsi_nn_kernel_scalar_release( &node_params[12] ); + vsi_nn_kernel_scalar_release( &node_params[13] ); + vsi_nn_kernel_scalar_release( &node_params[14] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( lppool, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/maximum_cl.c b/src/tim/vx/internal/src/kernel/cl/maximum_cl.c index c692265..c81289e 100644 --- a/src/tim/vx/internal/src/kernel/cl/maximum_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/maximum_cl.c @@ -250,7 +250,7 @@ static vsi_nn_kernel_node_t _setup float input1Scale = vsi_nn_get_tensor_scale(inputs[1]); float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]) * input1Scale; float outputScale = vsi_nn_get_tensor_scale(outputs[0]); - float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]) + 0.5f; + float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]); outputScale = vsi_abs(outputScale) < 1e-5 ? 0.0f : 1.0f / outputScale; diff --git a/src/tim/vx/internal/src/kernel/cl/minimum_cl.c b/src/tim/vx/internal/src/kernel/cl/minimum_cl.c index e5fe695..92a19a3 100644 --- a/src/tim/vx/internal/src/kernel/cl/minimum_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/minimum_cl.c @@ -250,7 +250,7 @@ static vsi_nn_kernel_node_t _setup float input1Scale = vsi_nn_get_tensor_scale(inputs[1]); float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]) * input1Scale; float outputScale = vsi_nn_get_tensor_scale(outputs[0]); - float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]) + 0.5f; + float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]); outputScale = vsi_abs(outputScale) < 1e-5 ? 
0.0f : 1.0f / outputScale; diff --git a/src/tim/vx/internal/src/kernel/cl/roi_align_cl.c b/src/tim/vx/internal/src/kernel/cl/roi_align_cl.c index c36851e..d82816c 100644 --- a/src/tim/vx/internal/src/kernel/cl/roi_align_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/roi_align_cl.c @@ -87,6 +87,7 @@ static vx_param_description_t _roi_align_kernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, }; #define _ROI_ALIGN_PARAM_NUM _cnt_of_array( _roi_align_kernel_param_def ) @@ -103,8 +104,9 @@ static vx_param_description_t _roi_align_kernel_param_def[] = #define SCALAR_SAMPLING_X_RATIO (14) #define SCALAR_SAMPLING_Y_RATIO (15) #define SCALAR_DEPTH (16) +#define SCALAR_FORMAT (17) -#define ROI_ALIGN_PARAM_NUM 17 +#define ROI_ALIGN_PARAM_NUM 18 #define ROI_ALIGN_QUANT_PARAM_NUM _cnt_of_array( _roi_align_kernel_param_def ) /* @@ -143,12 +145,8 @@ DEF_KERNEL_INITIALIZER(_roi_align_initializer) gpu_param.global_scale[2] = 1; gpu_param.dim = 3; - gpu_param.global_size[0] = gpu_align_p2( - (out_shape->data[0] + gpu_param.global_scale[0] - 1) - / gpu_param.global_scale[0], 4); - gpu_param.global_size[1] = ( - (out_shape->data[1] + gpu_param.global_scale[1] - 1) - / gpu_param.global_scale[1]); + gpu_param.global_size[0] = out_shape->data[0]; + gpu_param.global_size[1] = out_shape->data[1]; gpu_param.global_size[2] = rois_shape->data[1]; status = vsi_nn_kernel_gpu_config( node, &gpu_param ); @@ -213,7 +211,8 @@ static vsi_status _query_kernel kernel->info.numParams = (uint32_t)param_def_size; kernel->info.initialize = initializer; // Register code source - vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", kernel_map[i].source_name ); // Register binary source vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, @@ -259,8 +258,8 @@ static vsi_nn_kernel_node_t _setup float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]); float width_scale = roi_scale / width_ratio; float height_scale = roi_scale / height_ratio; - float in_width = (float)(inputs[0]->attr.size[0]); - float in_height = (float)(inputs[0]->attr.size[1]); + int32_t in_width = (int32_t)(inputs[0]->attr.size[0]); + int32_t in_height = (int32_t)(inputs[0]->attr.size[1]); float rcp_of_out_width = 1.0f / (float)(outputs[0]->attr.size[0]); float rcp_of_out_height = 1.0f / (float)(outputs[0]->attr.size[1]); float sampling_x_ratio = width_sample_num > 0 ? (float)width_sample_num : 0; @@ -294,6 +293,8 @@ static vsi_nn_kernel_node_t _setup if ( VSI_SUCCESS == status ) { + int32_t out_dtype = (int32_t)vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + int32_t dtype = out_dtype == F16 ? 1 : out_dtype == F32 ? 
2 : 0;
         size_t node_params_num = ROI_ALIGN_PARAM_NUM;
         node = vsi_nn_kernel_create_node( graph, kernel );
@@ -309,13 +310,14 @@ static vsi_nn_kernel_node_t _setup
             node_params[SCALAR_OUTPUT_ZP] = vsi_nn_kernel_scalar_create( graph, F32, &output_zp );
             node_params[SCALAR_SPATIAL_X_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &width_scale );
             node_params[SCALAR_SPATIAL_Y_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &height_scale );
-            node_params[SCALAR_INPUT_WIDTH] = vsi_nn_kernel_scalar_create( graph, F32, &in_width );
-            node_params[SCALAR_INPUT_HEIGHT] = vsi_nn_kernel_scalar_create( graph, F32, &in_height );
+            node_params[SCALAR_INPUT_WIDTH] = vsi_nn_kernel_scalar_create( graph, I32, &in_width );
+            node_params[SCALAR_INPUT_HEIGHT] = vsi_nn_kernel_scalar_create( graph, I32, &in_height );
             node_params[SCALAR_RCP_OF_OUTPUT_WIDTH] = vsi_nn_kernel_scalar_create( graph, F32, &rcp_of_out_width );
             node_params[SCALAR_RCP_OF_OUTPUT_HEIGHT] = vsi_nn_kernel_scalar_create( graph, F32, &rcp_of_out_height );
             node_params[SCALAR_SAMPLING_X_RATIO] = vsi_nn_kernel_scalar_create( graph, F32, &sampling_x_ratio );
             node_params[SCALAR_SAMPLING_Y_RATIO] = vsi_nn_kernel_scalar_create( graph, F32, &sampling_y_ratio );
             node_params[SCALAR_DEPTH] = vsi_nn_kernel_scalar_create( graph, I32, &depth );
+            node_params[SCALAR_FORMAT] = vsi_nn_kernel_scalar_create( graph, I32, &dtype );
             /* Pass parameters to node. */
             status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num );
@@ -332,6 +334,7 @@
             vsi_nn_kernel_scalar_release( &node_params[SCALAR_SAMPLING_X_RATIO] );
             vsi_nn_kernel_scalar_release( &node_params[SCALAR_SAMPLING_Y_RATIO] );
             vsi_nn_kernel_scalar_release( &node_params[SCALAR_DEPTH] );
+            vsi_nn_kernel_scalar_release( &node_params[SCALAR_FORMAT] );
         }
     }
diff --git a/src/tim/vx/internal/src/kernel/cl/scatter_elements_cl.c b/src/tim/vx/internal/src/kernel/cl/scatter_elements_cl.c
new file mode 100644
index 0000000..2be6a78
--- /dev/null
+++ b/src/tim/vx/internal/src/kernel/cl/scatter_elements_cl.c
@@ -0,0 +1,351 @@
+/****************************************************************************
+*
+* Copyright (c) 2020 Vivante Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +typedef enum +{ + INTERNAL_KERNEL_SCATTER_ELEMENTS, +} _internal_kernel_e; + +#define _KERNEL_SOURCE0 "scatter_elements" +#define _KERNEL_SOURCE1 "scatter_elements_add" +#define _KERNEL_SOURCE2 "scatter_elements_mul" + +#define STR(a) #a +// Add kernel hashtable here +#define SCATTER_ELEMENTS_HASH_KEY( IN0_DTYPE, IN2_DTYPE, OUT_DTYPE, AXIS, REDUCTION ) \ + (( IN0_DTYPE ) | ( IN2_DTYPE << 8 ) | ( OUT_DTYPE << 16 ) | ( AXIS << 24 ) | ( REDUCTION << 28 )) + +#define PACK_KERNEL_NONE_MAP( IN0_DTYPE, IN2_DTYPE, OUT_DTYPE, AXIS ) \ + { SCATTER_ELEMENTS_HASH_KEY( IN0_DTYPE, IN2_DTYPE, OUT_DTYPE, AXIS, VSI_NN_REDUCTION_TYPE_NONE ), \ + CVIVANTE_NAMESPACE("cl.scatter_elements_axis"STR(AXIS)"_"STR(IN0_DTYPE) \ + "_I32_"STR(IN2_DTYPE)"to"STR(OUT_DTYPE)), \ + _KERNEL_SOURCE0 } + +#define PACK_KERNEL_ADD_MAP( IN0_DTYPE, IN2_DTYPE, OUT_DTYPE, AXIS ) \ + { SCATTER_ELEMENTS_HASH_KEY( IN0_DTYPE, IN2_DTYPE, OUT_DTYPE, AXIS, VSI_NN_REDUCTION_TYPE_ADD ), \ + CVIVANTE_NAMESPACE("cl.scatter_elements_add_axis"STR(AXIS)"_"STR(IN0_DTYPE) \ + "_I32_"STR(IN2_DTYPE)"to"STR(OUT_DTYPE)), \ + _KERNEL_SOURCE1 } + +#define PACK_KERNEL_MUL_MAP( IN0_DTYPE, IN2_DTYPE, OUT_DTYPE, AXIS ) \ + { SCATTER_ELEMENTS_HASH_KEY( IN0_DTYPE, IN2_DTYPE, OUT_DTYPE, AXIS, VSI_NN_REDUCTION_TYPE_MUL ), \ + CVIVANTE_NAMESPACE("cl.scatter_elements_mul_axis"STR(AXIS)"_"STR(IN0_DTYPE) \ + "_I32_"STR(IN2_DTYPE)"to"STR(OUT_DTYPE)), \ + _KERNEL_SOURCE2 } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + + +#define PACK_KERNELS_MAP(type) \ + PACK_KERNEL_NONE_MAP( type, type, type, 0 ), \ + PACK_KERNEL_NONE_MAP( type, type, type, 1 ), \ + PACK_KERNEL_ADD_MAP( type, type, type, 0 ), \ + PACK_KERNEL_ADD_MAP( type, type, type, 1 ), \ + PACK_KERNEL_MUL_MAP( type, type, type, 0 ), \ + PACK_KERNEL_MUL_MAP( type, type, type, 1 ), \ + PACK_KERNEL_MUL_MAP( type, type, type, 2 ) + +static const _kernel_map_type _scatter_elements_kernel_map[] = +{ + // Register kernel here + PACK_KERNELS_MAP( I8 ), + PACK_KERNELS_MAP( U8 ), + PACK_KERNELS_MAP( I16 ), + PACK_KERNELS_MAP( F16 ), + PACK_KERNELS_MAP( I32 ), + PACK_KERNELS_MAP( F32 ), +}; + +/* + * Kernel params + */ +static vx_param_description_t _scatter_elements_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, 
VX_PARAMETER_STATE_REQUIRED}, +}; +#define _SCATTER_ELEMENTS_PARAM_NUM _cnt_of_array( _scatter_elements_kernel_param_def ) +#define SCALAR_INPUT_AXIS (4) +#define SCALAR_INPUT_REDUCTION (5) +#define SCALAR_REF_SCALE (6) +#define SCALAR_REF_TAIL (7) +#define SCALAR_UPDATE_SCALE (8) +#define SCALAR_UPDATE_TAIL (9) +#define SCALAR_OUTPUT_ZP (10) +#define SCALAR_INDICES_INNER (11) +#define SCALAR_INDICES_AXIS (12) +#define SCALAR_INDICES_OUTER (13) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_scatter_elements_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * output_attr = NULL; + vsi_size_array_t * out_shape = NULL; + + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + out_shape = output_attr->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + + gpu_param.dim = (out_shape->size < 3 || 1 == out_shape->data[2]) ? 2 : 3; + gpu_param.global_size[0] = out_shape->data[0]; + gpu_param.global_size[1] = out_shape->data[1]; + gpu_param.global_size[2] = out_shape->size > 2 ? out_shape->data[2] : 1; + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(output_attr); + return status; +} /* _scatter_elements_initializer() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t axis, + int32_t reduction + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in0_dtype; + vsi_nn_kernel_dtype_e in1_dtype; + vsi_nn_kernel_dtype_e in2_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _scatter_elements_kernel_map; + size_t kernel_map_size = _cnt_of_array( _scatter_elements_kernel_map ); + vx_param_description_t * param_def = _scatter_elements_kernel_param_def; + vx_kernel_initialize_f initializer = _scatter_elements_initializer; + + uint32_t key; + uint32_t i; + + in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + in2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (in1_dtype != I32) + { + return VSI_FAILURE; + } + + key = SCATTER_ELEMENTS_HASH_KEY( in0_dtype, in2_dtype, out_dtype, axis, reduction ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _scatter_elements_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + 
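+    /* If no entry of _scatter_elements_kernel_map matched the packed
+     * (in0_dtype, in2_dtype, out_dtype, axis, reduction) key (for example an
+     * unsupported dtype combination, or an axis the CL sources do not cover),
+     * status is still VSI_FAILURE here and _setup() will not create a node.
+     */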
return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_SCATTER_ELEMENTS_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_tensor_t* reshape_tensors[4] = { NULL }; + vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 0 } }; + uint32_t rank_in = 0; + int32_t axis = vsi_nn_kernel_param_get_int32(params, "axis"); + int32_t reduction = vsi_nn_kernel_param_get_int32(params, "reduction"); + int32_t new_axis0 = 0; + int32_t new_axis1 = 0; + int32_t inner_size = 0; + int32_t axis_size = 0; + int32_t outer_size = 0; + vsi_bool ret = FALSE; + float output_scale = vsi_nn_get_tensor_scale(outputs[0]); + float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float input0_scale = vsi_nn_get_tensor_scale(inputs[0]); + float input0_tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float input2_scale = vsi_nn_get_tensor_scale(inputs[2]); + float input2_tail = (float)vsi_nn_get_tensor_zero_point(inputs[2]); + +#define MAX_SHAPE_SIZE (0xFFFFFFFF) + ret = vsi_nn_kernel_optimize_scatter_elements_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, axis, + shapes[0], &rank_in, &new_axis0, MAX_SHAPE_SIZE); + ret &= vsi_nn_kernel_optimize_scatter_elements_shape( + inputs[1]->attr.size, inputs[1]->attr.dim_num, axis, + shapes[1], &rank_in, &new_axis1, MAX_SHAPE_SIZE); +#undef MAX_SHAPE_SIZE + + + if ( ret && new_axis0 == new_axis1 ) + { + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], shapes[0], rank_in ); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + inputs[1], shapes[1], rank_in ); + reshape_tensors[2] = vsi_nn_reshape_tensor( graph, + inputs[2], shapes[1], rank_in ); + reshape_tensors[3] = vsi_nn_reshape_tensor( graph, + outputs[0], shapes[0], rank_in ); + + inner_size = new_axis0 == 0 ? 1 : (int32_t)shapes[1][0]; + axis_size = new_axis0 == 0 ? (int32_t)shapes[1][0] : (int32_t)shapes[1][1]; + outer_size = new_axis0 == 0 ? (int32_t)shapes[1][1] : rank_in > 2 ? (int32_t)shapes[1][2] : 1; + } + else + { + return NULL; + } + + status = _query_kernel( kernel, inputs, outputs, axis, reduction ); + if ( VSI_SUCCESS == status) + { + input0_scale = input0_scale / output_scale; + input0_tail = - input0_tail * input0_scale; + input2_scale = input2_scale / output_scale; + input2_tail = - input2_tail * input2_scale; + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _SCATTER_ELEMENTS_PARAM_NUM, + reshape_tensors, input_num, &reshape_tensors[3], output_num ); + /* Pass parameters to node. 
*/ + node_params[SCALAR_INPUT_AXIS] = vsi_nn_kernel_scalar_create(graph, I32, &new_axis0 ); + node_params[SCALAR_INPUT_REDUCTION] = vsi_nn_kernel_scalar_create(graph, I32, &reduction ); + node_params[SCALAR_REF_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input0_scale ); + node_params[SCALAR_REF_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &input0_tail ); + node_params[SCALAR_UPDATE_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input2_scale ); + node_params[SCALAR_UPDATE_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &input2_tail ); + node_params[SCALAR_OUTPUT_ZP] = vsi_nn_kernel_scalar_create(graph, F32, &output_zp ); + node_params[SCALAR_INDICES_INNER] = vsi_nn_kernel_scalar_create(graph, I32, &inner_size ); + node_params[SCALAR_INDICES_AXIS] = vsi_nn_kernel_scalar_create(graph, I32, &axis_size ); + node_params[SCALAR_INDICES_OUTER] = vsi_nn_kernel_scalar_create(graph, I32, &outer_size ); + status = vsi_nn_kernel_node_pass_param( node, node_params, _SCATTER_ELEMENTS_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_AXIS] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_REDUCTION] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_REF_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_REF_TAIL] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_UPDATE_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_UPDATE_TAIL] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_ZP] ); + } + } + + vsi_safe_release_tensor( reshape_tensors[0] ); + vsi_safe_release_tensor( reshape_tensors[1] ); + vsi_safe_release_tensor( reshape_tensors[2] ); + vsi_safe_release_tensor( reshape_tensors[3] ); + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( scatter_elements, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/bucketize_cpu.c b/src/tim/vx/internal/src/kernel/cpu/bucketize_cpu.c new file mode 100644 index 0000000..b5bfbcb --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/bucketize_cpu.c @@ -0,0 +1,229 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +#define _INPUT_NUM (2) +#define _OUTPUT_NUM (1) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.bucketize") + + +/* + * Kernel params + */ +static vx_param_description_t _bucketize_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _BUCKETIZE_PARAM_NUM _cnt_of_array( _bucketize_kernel_param_def ) +#define SCALAR_RIGHT_VALUE (3) + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; + vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_elements[_OUTPUT_NUM] = {0}; + vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; + uint32_t i = 0, j = 0; + int32_t right = 0; + uint32_t boundaries_size = 0; + + /* prepare data */ + for(i = 0; i < _INPUT_NUM; i ++) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); + } + for(i = 0; i < _OUTPUT_NUM; i ++) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + out_bytes[i] = out_elements[i] * sizeof(float); + f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_bytes[i] ); + } + + vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_RIGHT_VALUE], &(right)); + + boundaries_size = (uint32_t)in_attr[1]->shape->data[0]; + + for (i = 0; i < out_elements[0]; i++) + { + float src0 = f32_in_buffer[0][i]; + float dst = 0; + + for (j = 0; j < boundaries_size; j++) + { + float src1 = f32_in_buffer[1][j]; + + if (right == 1) + { + dst += (src0 >= src1 ? 1.0f : 0.0f); + } + else + { + dst += (src0 > src1 ? 
1.0f : 0.0f); + } + } + + f32_out_buffer[0][i] = dst; + } + + /* save data */ + for(i = 0; i < _OUTPUT_NUM; i++) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + for (i = 0; i < _INPUT_NUM; i++) + { + if (f32_in_buffer[i]) + { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + if (in_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + for(i = 0; i < _OUTPUT_NUM; i++) + { + if (f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _bucketize_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _bucketize_kernel_param_def ); + status = VSI_SUCCESS; + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_BUCKETIZE_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + int32_t right = vsi_nn_kernel_param_get_int32( params, "right" ); + + status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _BUCKETIZE_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. */ + node_params[SCALAR_RIGHT_VALUE] = vsi_nn_kernel_scalar_create( graph, I32, &right ); + status = vsi_nn_kernel_node_pass_param( node, node_params, _BUCKETIZE_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_RIGHT_VALUE] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( bucketize, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/lppool_cpu.c b/src/tim/vx/internal/src/kernel/cpu/lppool_cpu.c new file mode 100644 index 0000000..0f66636 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/lppool_cpu.c @@ -0,0 +1,264 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.lppool") + + +/* + * Kernel params + */ +static vx_param_description_t _lppool_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _LPPOOL_PARAM_NUM _cnt_of_array( _lppool_kernel_param_def ) + + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_lppool_exec) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float * buffer[_INPUT_NUM + _OUTPUT_NUM] = { NULL }; + size_t out_elements = 0; + vsi_nn_kernel_tensor_attr_t * attr[_INPUT_NUM + _OUTPUT_NUM] = { NULL }; + int32_t ksize_x = 0, ksize_y = 0, stride_x = 0, stride_y = 0; + int32_t pad_left = 0, pad_right = 0, pad_top = 0, pad_bottom = 0; + int32_t p = 0; + int32_t i = 0; + input[0] = (vsi_nn_kernel_tensor_t)param[0]; + output[0] = (vsi_nn_kernel_tensor_t)param[1]; + attr[0] = vsi_nn_kernel_tensor_attr_create( input[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( output[0] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &ksize_x); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &ksize_y); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &pad_left); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &pad_right); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &pad_top); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &pad_bottom); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], 
&stride_x); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[9], &stride_y); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &p); + CHECK_STATUS_FAIL_GOTO(status, final ); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( input[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); + + buffer[1] = (float *)malloc( out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final ); + memset( buffer[1], 0, out_elements * sizeof(float) ); + + { + int32_t batch = (int32_t)attr[1]->shape->data[2]; + int32_t height_o = (int32_t)attr[1]->shape->data[1]; + int32_t width_o = (int32_t)attr[1]->shape->data[0]; + int32_t height = (int32_t)attr[0]->shape->data[1]; + int32_t width = (int32_t)attr[0]->shape->data[0]; + int32_t b = 0, j = 0; + int32_t output_base = 0; + int32_t input_base = 0; + float data = 0; + for (b = 0; b < batch; b++) + { + output_base = b * height_o * width_o; + input_base = b * height * width; + for (j = 0; j < height_o; j++) + { + for (i = 0; i < width_o; i++) + { + int32_t hstart = j * stride_y - pad_top; + int32_t wstart = i * stride_x - pad_left; + int32_t hend = vsi_nn_min(hstart + ksize_y, height); + int32_t wend = vsi_nn_min(wstart + ksize_x, width); + int32_t pool_index = output_base + j * width_o + i; + int32_t h = 0, w = 0; + float sum_of_pow = 0; + float out_data = 0; + hstart = vsi_nn_max(hstart, 0); + wstart = vsi_nn_max(wstart, 0); + + for (h = hstart; h < hend; ++ h) + { + for (w = wstart; w < wend; ++ w) + { + int32_t index = input_base + h * width + w; + data = buffer[0][index]; + sum_of_pow += (float)pow(fabs(data),p); + } + } + out_data = (float)pow(sum_of_pow, 1.0f / p); + buffer[1][pool_index] = out_data; + } + } + } + + } + status = vsi_nn_kernel_tensor_write_from_float( output[0], attr[1], + buffer[1], out_elements ); +final: + for ( i = 0; i < _INPUT_NUM + _OUTPUT_NUM; i ++ ) + { + vsi_nn_safe_free( buffer[i] ); + if (attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &attr[i] ); + } + } + + return status; +} /* _lppool_exec() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _lppool_exec; + kernel->info.parameters = _lppool_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _lppool_kernel_param_def ); + status = VSI_SUCCESS; + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_LPPOOL_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + + int32_t ksize_x = vsi_nn_kernel_param_get_int32(params, "ksize_x"); + int32_t ksize_y = vsi_nn_kernel_param_get_int32(params, "ksize_y"); + int32_t stride_x = vsi_nn_kernel_param_get_int32(params, "stride_x"); + int32_t stride_y = vsi_nn_kernel_param_get_int32(params, "stride_y"); + int32_t pad_left = vsi_nn_kernel_param_get_int32(params, "pad_left"); + int32_t pad_right = vsi_nn_kernel_param_get_int32(params, "pad_right"); + int32_t pad_top = vsi_nn_kernel_param_get_int32(params, 
"pad_top"); + int32_t pad_bottom = vsi_nn_kernel_param_get_int32(params, "pad_bottom"); + int32_t p = vsi_nn_kernel_param_get_int32(params, "p"); + + status = _query_kernel( kernel, inputs, outputs ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + int32_t index = 2; + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _LPPOOL_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &ksize_x ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &ksize_y ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_left ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_right ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_top ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &pad_bottom ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_x ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &stride_y ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &p ); + + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _LPPOOL_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[2] ); + vsi_nn_kernel_scalar_release( &node_params[3] ); + vsi_nn_kernel_scalar_release( &node_params[4] ); + vsi_nn_kernel_scalar_release( &node_params[5] ); + vsi_nn_kernel_scalar_release( &node_params[6] ); + vsi_nn_kernel_scalar_release( &node_params[7] ); + vsi_nn_kernel_scalar_release( &node_params[8] ); + vsi_nn_kernel_scalar_release( &node_params[9] ); + vsi_nn_kernel_scalar_release( &node_params[10] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( lppool, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/maximum_cpu.c b/src/tim/vx/internal/src/kernel/cpu/maximum_cpu.c index e109349..183fedc 100644 --- a/src/tim/vx/internal/src/kernel/cpu/maximum_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/maximum_cpu.c @@ -56,14 +56,26 @@ static vsi_ssize_t _expand_offset vsi_size_t i; vsi_ssize_t offset = 0; - for( i = 0; i < rank && index; i ++ ) + for ( i = 0; i < rank && index; i ++ ) { - if( shape[i] == out_shape[i] ) + if (strides[0] == 0) + { + if (i == 0) + { + offset += (index % out_shape[0]); + } + else + { + offset += (vsi_ssize_t)strides[i] * 2 * ( index % out_shape[i] ); + } + } + else if ( shape[i] == out_shape[i] ) { offset += (vsi_ssize_t)strides[i] * ( index % out_shape[i] ); } index /= out_shape[i]; } + return offset; } diff --git a/src/tim/vx/internal/src/kernel/cpu/minimum_cpu.c b/src/tim/vx/internal/src/kernel/cpu/minimum_cpu.c index 61d94c6..7cb6630 100644 --- a/src/tim/vx/internal/src/kernel/cpu/minimum_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/minimum_cpu.c @@ -52,14 +52,26 @@ static vsi_ssize_t _expand_offset vsi_size_t i; vsi_ssize_t offset = 0; - for( i = 0; i < rank && index; i ++ ) + for ( i = 0; i < rank && index; i ++ ) { - if( shape[i] == out_shape[i] ) + if (strides[0] == 0) + { + if (i == 0) + { + offset += (index % out_shape[0]); + } + else + { + offset += (vsi_ssize_t)strides[i] * 2 * ( index % out_shape[i] ); + } + } + else if ( shape[i] == out_shape[i] ) { offset += (vsi_ssize_t)strides[i] * ( index % out_shape[i] ); } index /= out_shape[i]; } + return offset; } diff --git a/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv422_cpu.c 
b/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv422_cpu.c new file mode 100644 index 0000000..189ef8f --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv422_cpu.c @@ -0,0 +1,405 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +#define _CPU_ARG_NUM (11) +#define _CPU_INPUT_NUM (1) +#define _CPU_OUTPUT_NUM (1) +#define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) +#define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.pre_process_yuv422_sw") + +#define DESCALE(x) (((x) + (1<<19)) >> 20) + +static vx_param_description_t kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; + + +DEF_KERNEL_EXECUTOR(_pre_process_yuv422_exec) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; + float * buffer[_CPU_IO_NUM] = { NULL }; + float * outBuffer = NULL; + size_t out_elements = 0; + vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; + uint32_t i = 0; + int32_t xRatio = 0, yRatio = 0, xOffset = 0, yOffset = 0; + float rMean = 0, gMean = 0, bMean = 0, var = 0; + int32_t order = 0, trans = 0, yuv422_type = 0; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; + + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[1] ); + + i = 2; + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &xRatio); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &yRatio); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &xOffset); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &yOffset); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &rMean); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &gMean); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &bMean); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[i++], &var); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &order); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &trans); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = 
vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[i++], &yuv422_type); + CHECK_STATUS_FAIL_GOTO(status, final ); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final ); + + buffer[1] = (float *)malloc( out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create output buffer fail.", final ); + memset( buffer[1], 0, out_elements * sizeof(float) ); + + if(trans) + { + outBuffer = (float *)malloc( out_elements * sizeof(float) ); + CHECK_PTR_FAIL_GOTO( outBuffer, "Create output buffer fail.", final ); + memset( outBuffer, 0, out_elements * sizeof(float) ); + } + + { + int32_t dx, dy, dz; + int32_t src_width = (int32_t)attr[0]->shape->data[0]; + int32_t dst_width = (int32_t)(trans ? attr[1]->shape->data[1] : attr[1]->shape->data[0]); + int32_t dst_height = (int32_t)(trans ? attr[1]->shape->data[1] : attr[1]->shape->data[1]); + int32_t stride = (int32_t)(dst_width * dst_height); + int32_t rOffset = 0; + int32_t gOffset = 1 * stride; + int32_t bOffset = 2 * stride; + float D0, D1, E0, E1; + float R0, G0, B0, R1, G1, B1; + float min = 0; + float max = 255; + float* src_y_slice = NULL; + + uint32_t roi_width = (xRatio * dst_width) >> 15; + uint32_t roi_height = (yRatio * dst_height) >> 15; + uint32_t xrIntFloat_16 = (roi_width << 16) / dst_width + 1; + uint32_t yrIntFloat_16 = (roi_height << 16) / dst_height + 1; + uint32_t srcy = 0, srcx = 0; + + if(attr[1]->dtype == I8) + { + min = -128; + max = 127; + } + else if(attr[1]->dtype == I16 || attr[1]->dtype == F16) + { + min = -32768; + max = 32767; + } + + if(order) + { + rOffset = 2 * stride; + bOffset = 0; + } + + for ( dz = 0; dz < 1; dz ++) + { + for ( dy = 0; dy < (int32_t)dst_height; dy++) + { + srcy = (((uint32_t)dy * yrIntFloat_16) >> 16) + yOffset; + src_y_slice = buffer[0] + (srcy) * src_width; + for ( dx = 0; dx < (int32_t)dst_width; dx += 2) + { + int32_t output_index = 0; + int32_t dstR_idx = 0, dstG_idx = 0, dstB_idx = 0; + float tmpY0 = 0.0f; + float tmpY1 = 0.0f; + float tmpU0 = 0.0f; + float tmpU1 = 0.0f; + float tmpV0 = 0.0f; + float tmpV1 = 0.0f; + + srcx = ((((uint32_t)dx * xrIntFloat_16) >> 16) + xOffset) * 2; + + if (xrIntFloat_16 >> 16 == 1) + { + if (yuv422_type == 1) + { + tmpY0 = src_y_slice[srcx + 1]; + tmpU0 = src_y_slice[srcx]; + tmpY1 = src_y_slice[srcx + 3]; + tmpV0 = src_y_slice[srcx + 2]; + tmpU1 = tmpU0; + tmpV1 = tmpV0; + } + else + { + tmpY0 = src_y_slice[srcx]; + tmpU0 = src_y_slice[srcx + 1]; + tmpY1 = src_y_slice[srcx + 2]; + tmpV0 = src_y_slice[srcx + 3]; + tmpU1 = tmpU0; + tmpV1 = tmpV0; + } + } + else + { + if (yuv422_type == 1) + { + tmpY0 = src_y_slice[srcx + 1]; + tmpU0 = src_y_slice[(srcx / 4) * 4]; + tmpV0 = src_y_slice[(srcx / 4) * 4 + 2]; + srcx = (((uint32_t)(dx + 1) * xrIntFloat_16) >> 16) + xOffset; + srcx = srcx * 2; + tmpY1 = src_y_slice[srcx + 1]; + tmpU1 = src_y_slice[(srcx / 4) * 4]; + tmpV1 = src_y_slice[(srcx / 4) * 4 + 2]; + } + else + { + tmpY0 = src_y_slice[srcx]; + tmpU0 = src_y_slice[(srcx / 4) * 4 + 1]; + tmpV0 = src_y_slice[(srcx / 4) * 4 + 3]; + srcx = (((uint32_t)(dx + 1) * xrIntFloat_16) >> 16) + xOffset; + srcx = srcx * 2; + tmpY1 = src_y_slice[srcx]; + tmpU1 = src_y_slice[(srcx / 4) * 4 + 1]; + tmpV1 = src_y_slice[(srcx / 4) * 4 + 3]; + } + } + + D0 = (tmpU0 - 128); + E0 = (tmpV0 - 128); + D1 = (tmpU1 - 128); + E1 = (tmpV1 - 128); + + B0 = (float)vsi_clamp((tmpY0 + (1.7790 * D0)), min, max); + G0 = (float)vsi_clamp((tmpY0 - 0.3455 * D0 - 0.7169 * 
E0), min, max); + R0 = (float)vsi_clamp((tmpY0 + 1.4065 * E0), min, max); + + B1 = (float)vsi_clamp((tmpY1 + (1.7790 * D1)), min, max); + G1 = (float)vsi_clamp((tmpY1 - 0.3455 * D1 - 0.7169 * E1), min, max); + R1 = (float)vsi_clamp((tmpY1 + 1.4065 * E1), min, max); + + output_index = dx + dy * dst_width; + + dstR_idx = output_index + rOffset; + dstG_idx = output_index + gOffset; + dstB_idx = output_index + bOffset; + + buffer[1][dstB_idx] = (B0 - bMean) * var; + buffer[1][dstG_idx] = (G0 - gMean) * var; + buffer[1][dstR_idx] = (R0 - rMean) * var; + + dstR_idx += 1; + dstG_idx += 1; + dstB_idx += 1; + + buffer[1][dstB_idx] = (B1 - bMean) * var; + buffer[1][dstG_idx] = (G1 - gMean) * var; + buffer[1][dstR_idx] = (R1 - rMean) * var; + } + } + } + } + + if(trans) + { + vsi_size_t shape[] = {attr[1]->shape->data[0], attr[1]->shape->data[1], attr[1]->shape->data[2], 1}; + vsi_size_t perm[] = {1, 2, 0, 3}; + vsi_nn_Transpose((uint8_t*)outBuffer, (uint8_t*)buffer[1], + shape, (uint32_t)attr[1]->shape->size, perm, VSI_NN_TYPE_FLOAT32); + + status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], + outBuffer, out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + else + { + status = vsi_nn_kernel_tensor_write_from_float( tensors[1], attr[1], + buffer[1], out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + if(outBuffer) + { + free(outBuffer); + } + for( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if( buffer[i] ) + { + free( buffer[i] ); + } + if(attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } + } + return status; +} /* _pre_process_yuv422_exec() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _pre_process_yuv422_exec; + kernel->info.parameters = kernel_param_def; + kernel->info.numParams = _cnt_of_array( kernel_param_def ); + status = VSI_SUCCESS; + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_CPU_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + status = _query_kernel( kernel, inputs, outputs); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + uint32_t index = 2; + int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" ); + int32_t scale_y = vsi_nn_kernel_param_get_int32( params, "scale_y" ); + int32_t left = vsi_nn_kernel_param_get_int32( params, "left" ); + int32_t top = vsi_nn_kernel_param_get_int32( params, "top" ); + float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" ); + float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" ); + float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" ); + float rgb_scale = vsi_nn_kernel_param_get_float32( params, "rgb_scale" ); + int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); + int32_t trans = vsi_nn_kernel_param_get_int32( params, "enable_perm" ); + int32_t yuv422_type = vsi_nn_kernel_param_get_int32( params, "yuv422_type" ); + + /* Set inputs and outputs */ + 
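+            /* node_params layout once packed: [0] input tensor, [1] output
+             * tensor, followed by the eleven scalars created below in the
+             * order scale_x, scale_y, left, top, r_mean, g_mean, b_mean,
+             * rgb_scale, reverse, enable_perm, yuv422_type (indices 2..12,
+             * matching kernel_param_def). */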
vsi_nn_kernel_node_pack_io( node_params, _CPU_PARAM_NUM, + inputs, _CPU_INPUT_NUM, outputs, _CPU_OUTPUT_NUM ); + + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_y ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &top ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &rgb_scale ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &trans ); + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &yuv422_type ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _CPU_PARAM_NUM ); + CHECK_STATUS( status ); + vsi_nn_kernel_scalar_release( &node_params[2] ); + vsi_nn_kernel_scalar_release( &node_params[3] ); + vsi_nn_kernel_scalar_release( &node_params[4] ); + vsi_nn_kernel_scalar_release( &node_params[5] ); + vsi_nn_kernel_scalar_release( &node_params[6] ); + vsi_nn_kernel_scalar_release( &node_params[7] ); + vsi_nn_kernel_scalar_release( &node_params[8] ); + vsi_nn_kernel_scalar_release( &node_params[9] ); + vsi_nn_kernel_scalar_release( &node_params[10] ); + vsi_nn_kernel_scalar_release( &node_params[11] ); + } + else + { + status = VSI_FAILURE; + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( pre_process_yuv422, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/roi_align_cpu.c b/src/tim/vx/internal/src/kernel/cpu/roi_align_cpu.c index 82e9c1a..071e5e7 100644 --- a/src/tim/vx/internal/src/kernel/cpu/roi_align_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/roi_align_cpu.c @@ -73,7 +73,7 @@ static float _compute_region_coordinate(int32_t p, float bin_size, float roi_anc { const float region_start = p * bin_size + roi_anchor; - return vsi_nn_clamp(region_start, 0.0f, max_value - 1); + return region_start; } static float _roi_align_1x1(float *input_ptr, @@ -88,53 +88,64 @@ static float _roi_align_1x1(float *input_ptr, int32_t grid_size_y, float region_end_y) { - if ((region_end_x <= region_start_x) || (region_end_y <= region_start_y)) + float avg = 0; + int32_t iy = 0; + int32_t ix = 0; + // Iterate through the aligned pooling region + for (iy = 0; iy < grid_size_y; ++iy) { - return 0; - } - else - { - float avg = 0; - int32_t iy = 0; - int32_t ix = 0; - // Iterate through the aligned pooling region - for (iy = 0; iy < grid_size_y; ++iy) + for (ix = 0; ix < grid_size_x; ++ix) { - for (ix = 0; ix < grid_size_x; ++ix) - { - // Align the window in the middle of every bin - float y = region_start_y + ((float)iy + 0.5f) * bin_size_y / (float)(grid_size_y); - float x = region_start_x + ((float)ix + 0.5f) * bin_size_x / (float)(grid_size_x); + // Align the window in the middle of every bin + float y = region_start_y + + ((float)iy + 0.5f) * bin_size_y / (float)(grid_size_y); + float x = region_start_x + + ((float)ix + 0.5f) * bin_size_x / (float)(grid_size_x); - // Interpolation in the [0,0] [0,1] [1,0] [1,1] square - const int32_t y_low = (int32_t)y; - const int32_t x_low = (int32_t)x; - const int32_t y_high = vsi_nn_min(y_low + 1, height - 1); - const int32_t x_high = 
vsi_nn_min(x_low + 1, width - 1); + // Interpolation in the [0,0] [0,1] [1,0] [1,1] square + const int32_t y_low = vsi_nn_min((int32_t)y, height - 1); + const int32_t x_low = vsi_nn_min((int32_t)x, width - 1); + const int32_t y_high = vsi_nn_min(y_low + 1, height - 1); + const int32_t x_high = vsi_nn_min(x_low + 1, width - 1); - const float ly = y - y_low; - const float lx = x - x_low; - const float hy = 1.0f - ly; - const float hx = 1.0f - lx; + float ly = y - y_low; + float lx = x - x_low; + float hy = 1.0f - ly; + float hx = 1.0f - lx; - const float w1 = hy * hx; - const float w2 = hy * lx; - const float w3 = ly * hx; - const float w4 = ly * lx; + float w1 = hy * hx; + float w2 = hy * lx; + float w3 = ly * hx; + float w4 = ly * lx; - const float data1 = *(input_ptr + y_low * width + x_low); - const float data2 = *(input_ptr + y_low * width + x_high); - const float data3 = *(input_ptr + y_high * width + x_low); - const float data4 = *(input_ptr + y_high * width + x_high); + const float data1 = *(input_ptr + y_low * width + x_low); + const float data2 = *(input_ptr + y_low * width + x_high); + const float data3 = *(input_ptr + y_high * width + x_low); + const float data4 = *(input_ptr + y_high * width + x_high); - avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; - } + /* onnx: inverse elements are out of feature map boundary */ + if (x > width || x < -1 || y > height || y < -1) continue; + + x = x_low >= width - 1 ? x_low : x; + y = y_low >= height - 1 ? y_low : y; + + ly = y - y_low; + lx = x - x_low; + hy = 1.0f - ly; + hx = 1.0f - lx; + + w1 = hy * hx; + w2 = hy * lx; + w3 = ly * hx; + w4 = ly * lx; + + avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; } - - avg /= grid_size_x * grid_size_y; - - return avg; } + + avg /= grid_size_x * grid_size_y; + + return avg; } DEF_KERNEL_EXECUTOR(_compute) diff --git a/src/tim/vx/internal/src/kernel/cpu/scatter_elements_cpu.c b/src/tim/vx/internal/src/kernel/cpu/scatter_elements_cpu.c new file mode 100644 index 0000000..b3cfbbc --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/scatter_elements_cpu.c @@ -0,0 +1,258 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+*
+*****************************************************************************/
+
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include "vsi_nn_types.h"
+#include "vsi_nn_tensor.h"
+#include "vsi_nn_graph.h"
+#include "vsi_nn_log.h"
+#include "vsi_nn_error.h"
+#include "vsi_nn_prv.h"
+#include "vsi_nn_tensor_util.h"
+#include "utils/vsi_nn_util.h"
+#include "kernel/vsi_nn_kernel.h"
+
+__BEGIN_DECLS
+
+/*
+ * Define kernel meta.
+ */
+#define _ARG_NUM            (2)
+#define _INPUT_NUM          (3)
+#define _OUTPUT_NUM         (1)
+#define _CPU_IO_NUM         (_INPUT_NUM + _OUTPUT_NUM)
+#define _CPU_PARAM_NUM      (_ARG_NUM + _CPU_IO_NUM)
+#define _KERNEL_NAME        CVIVANTE_NAMESPACE("cpu.scatter_elements")
+
+
+/*
+ * Kernel params
+ */
+static vx_param_description_t _scatter_elements_kernel_param_def[] =
+{
+    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
+    // Add kernel parameters here
+};
+#define _SCATTER_ELEMENTS_PARAM_NUM  _cnt_of_array( _scatter_elements_kernel_param_def )
+
+
+/*
+ * Kernel function
+ */
+DEF_KERNEL_EXECUTOR(_compute)
+    (
+    vsi_nn_kernel_node_t node,
+    const vsi_nn_kernel_node_param_t * param,
+    size_t param_size
+    )
+{
+    vsi_status status = VSI_FAILURE;
+    vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL };
+    float * buffer[3] = { NULL };
+    int32_t* buffer_idx = NULL;
+    size_t out_elements = 0;
+    vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL };
+    vsi_size_t a = 0;
+    vsi_size_t o = 0;
+    vsi_size_t i = 0;
+    vsi_size_t outer_size[2] = {1, 1};
+    vsi_size_t inner_size[2] = {1, 1};
+    vsi_size_t axis_size[2] = {1, 1};
+    int32_t axis = 0;
+    int32_t reduction = 0;
+
+    tensors[0] = (vsi_nn_kernel_tensor_t)param[0];
+    tensors[1] = (vsi_nn_kernel_tensor_t)param[1];
+    tensors[2] = (vsi_nn_kernel_tensor_t)param[2];
+    tensors[3] = (vsi_nn_kernel_tensor_t)param[3];
+
+    attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] );
+    CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final );
+    attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] );
+    CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final );
+    attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] );
+    CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final );
+    attr[3] = vsi_nn_kernel_tensor_attr_create( tensors[3] );
+    CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", final );
+    out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[3] );
+
+    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &axis);
+    CHECK_STATUS_FAIL_GOTO(status, final );
+    status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &reduction);
+    CHECK_STATUS_FAIL_GOTO(status, final );
+
+    buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE );
+    CHECK_PTR_FAIL_GOTO( buffer[0], "Create input0 buffer fail.", final );
+
+    buffer_idx = (int32_t*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], FALSE );
+    CHECK_PTR_FAIL_GOTO( buffer_idx, "Create input1 buffer fail.", final );
+
+    buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[2], attr[2], TRUE );
+    CHECK_PTR_FAIL_GOTO( buffer[1], "Create input2 buffer fail.", final );
+
+    buffer[2] = (float *)malloc( out_elements * sizeof(float) );
+    CHECK_PTR_FAIL_GOTO( buffer[2], "Create output buffer 
fail.", final ); + memcpy( buffer[2], buffer[0], out_elements * sizeof(float) ); + + axis_size[0] = attr[0]->shape->data[axis]; + axis_size[1] = attr[1]->shape->data[axis]; + for (i = 0; i < (vsi_size_t)axis; ++i) + { + inner_size[0] *= attr[0]->shape->data[i]; + inner_size[1] *= attr[1]->shape->data[i]; + } + + for (i = axis + 1; i < attr[1]->shape->size; ++i) + { + outer_size[0] *= attr[0]->shape->data[i]; + outer_size[1] *= attr[1]->shape->data[i]; + } + + for (o = 0; o < outer_size[1]; o++) + { + for (a = 0; a < axis_size[1]; a++) + { + for (i = 0; i < inner_size[1]; i++) + { + vsi_ssize_t index = 0; + vsi_size_t index0 = (o * axis_size[1] + a) * inner_size[1] + i; + vsi_size_t index1 = 1; + + index = (vsi_ssize_t)buffer_idx[index0]; + index1 = (o * axis_size[0] + index) * inner_size[0] + i; + + switch (reduction) + { + case VSI_NN_REDUCTION_TYPE_NONE: + buffer[2][index1] = buffer[1][index0]; + break; + case VSI_NN_REDUCTION_TYPE_ADD: + buffer[2][index1] += buffer[1][index0]; + break; + case VSI_NN_REDUCTION_TYPE_MUL: + buffer[2][index1] *= buffer[1][index0]; + break; + default: + break; + } + + + } + } + } + + status = vsi_nn_kernel_tensor_write_from_float( tensors[3], attr[3], + buffer[2], out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); +final: + if ( buffer_idx ) + { + free( buffer_idx ); + } + for ( i = 0; i < 3; i ++ ) + { + if ( buffer[i] ) + { + free( buffer[i] ); + } + } + for ( i = 0; i < _CPU_IO_NUM; i ++ ) + { + if (attr[i]) { vsi_nn_kernel_tensor_attr_release( &attr[i] ); } + } + + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_SUCCESS; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _scatter_elements_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _scatter_elements_kernel_param_def ); + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_SCATTER_ELEMENTS_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); + int32_t reduction = vsi_nn_kernel_param_get_int32( params, "reduction" ); + + status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _SCATTER_ELEMENTS_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. 
*/
+            node_params[4] = vsi_nn_kernel_scalar_create( graph, I32, &axis );
+            node_params[5] = vsi_nn_kernel_scalar_create( graph, I32, &reduction );
+            status = vsi_nn_kernel_node_pass_param( node, node_params, _SCATTER_ELEMENTS_PARAM_NUM );
+            vsi_nn_kernel_scalar_release( &node_params[4] );
+            vsi_nn_kernel_scalar_release( &node_params[5] );
+        }
+    }
+    return node;
+} /* _setup() */
+
+__END_DECLS
+
+REGISTER_BACKEND_CPU( scatter_elements, _setup )
+
diff --git a/src/tim/vx/internal/src/kernel/evis/bucketize_evis.c b/src/tim/vx/internal/src/kernel/evis/bucketize_evis.c
new file mode 100644
index 0000000..d7074c3
--- /dev/null
+++ b/src/tim/vx/internal/src/kernel/evis/bucketize_evis.c
@@ -0,0 +1,323 @@
+/****************************************************************************
+*
+* Copyright (c) 2020 Vivante Corporation
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+* DEALINGS IN THE SOFTWARE.
+*
+*****************************************************************************/
+
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include "vsi_nn_types.h"
+#include "vsi_nn_tensor.h"
+#include "vsi_nn_graph.h"
+#include "vsi_nn_log.h"
+#include "vsi_nn_error.h"
+#include "vsi_nn_prv.h"
+#include "vsi_nn_tensor_util.h"
+#include "utils/vsi_nn_util.h"
+#include "kernel/vsi_nn_kernel.h"
+
+__BEGIN_DECLS
+
+/*
+ * Define kernel meta.
+ */ +typedef enum +{ + INTERNAL_KERNEL_BUCKETIZE, +} _internal_kernel_e; + +#define STR(a) #a + +// Add kernel hashtable here +#define BUCKETIZE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, RIGHT, IMG_2D ) \ + (( IN0_DTYPE ) | ( IN1_DTYPE << 8 ) | ( OUT_DTYPE << 16 ) | (RIGHT << 24) | (IMG_2D << 25)) + +#define PACK_KERNEL_2D_MAP( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, RIGHT, IMG_2D ) \ + { BUCKETIZE_HASH_KEY( IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, RIGHT, IMG_2D ), \ + CVIVANTE_NAMESPACE("evis.bucketize_right_"STR(IN0_DTYPE)"_"STR(IN1_DTYPE)"to"STR(OUT_DTYPE)"_2D"), \ + "bucketize" } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _bucketize_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_2D_MAP( F16, F16, I32, 1, 1 ), + PACK_KERNEL_2D_MAP( I16, I16, I32, 1, 1 ), + PACK_KERNEL_2D_MAP( U8, U8, I32, 1, 1 ), + PACK_KERNEL_2D_MAP( I8, I8, I32, 1, 1 ), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _bucketize_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _BUCKETIZE_PARAM_NUM _cnt_of_array( _bucketize_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_bucketize_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * input0_attr = NULL; + vsi_nn_kernel_tensor_attr_t * input1_attr = NULL; + vsi_size_array_t * input0_shape = NULL; + vsi_size_array_t * input1_shape = NULL; + + input0_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( input0_attr, "Create tensor attr buffer fail.", final ); + input1_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( input1_attr, "Create tensor attr buffer fail.", final ); + + input0_shape = input0_attr->shape; + input1_shape = input1_attr->shape; + + gpu_param.dim = 2; + gpu_param.global_scale[0] = 8; + gpu_param.global_scale[1] = 1; + gpu_param.global_size[0] = gpu_align_p2( + (input0_shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = ( + (input0_shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]); + + { + gpu_dp_inst_t uniDataConvert_0_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniDataConvert_1_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00050004, 0x00070006, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000000, 0x00000001, 0x00000000, + 0x00000001, 0x00000000, 0x00000001, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + int32_t boundaries_size = (int32_t)input1_shape->data[0]; + int32_t boundaries_size_x8 = (boundaries_size / 8) * 8; + + status = vsi_nn_kernel_gpu_add_param( node, "uniDataConvert_0_4x4", &uniDataConvert_0_4x4); + 
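+        /*
+         * The parameters pushed to the shader here mirror the values computed
+         * above: uniDataConvert_0/1_4x4 appear to widen the eight input lanes to
+         * 32-bit, boundaries_size_x8 is the boundary count rounded down to a
+         * multiple of eight (presumably so eight boundaries can be compared per
+         * iteration), and boundaries_size covers the remaining tail entries.
+         */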
status |= vsi_nn_kernel_gpu_add_param( node, "uniDataConvert_1_4x4", &uniDataConvert_1_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "boundaries_size_x8", &boundaries_size_x8); + status |= vsi_nn_kernel_gpu_add_param( node, "boundaries_size", &boundaries_size); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: +#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; } + SAFE_FREE_TENSOR_ATTR(input0_attr); + SAFE_FREE_TENSOR_ATTR(input1_attr); +#undef SAFE_FREE_TENSOR_ATTR + + return status; +} /* _bucketize_initializer() */ + +static vsi_bool _bucketize_support_types + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * input, + vsi_nn_tensor_t * boundaries, + int32_t right + ) +{ + vsi_size_t width = input->attr.size[0]; + vsi_size_t height = input->attr.size[1]; + vsi_size_t boundaries_size = boundaries->attr.size[0]; + vsi_bool image_2d = FALSE; + vsi_nn_kernel_dtype_e in_dtype = vsi_nn_kernel_map_dtype( input->attr.dtype.vx_type ); + + image_2d = (input->attr.dim_num == 2 || input->attr.size[2] == 1); + + if ( vsi_nn_is_same_type(input, boundaries) == FALSE || right == 0 || image_2d == FALSE ) + { + return FALSE; + } + + if (in_dtype == F16 && graph->ctx->config.evis.ver != VSI_NN_HW_EVIS_2) + { + return FALSE; + } + +#define MAX_16BITS_BOUNDARIES_SIZE (0xFFFF) + if ( (in_dtype == F16 || in_dtype == I16) && boundaries_size > MAX_16BITS_BOUNDARIES_SIZE ) + { + return FALSE; + } +#undef MAX_16BITS_BOUNDARIES_SIZE + +#define MAX_8BITS_BOUNDARIES_SIZE (0xFF) + if ( (in_dtype == I8 || in_dtype == U8) && boundaries_size > MAX_8BITS_BOUNDARIES_SIZE ) + { + return FALSE; + } +#undef MAX_8BITS_BOUNDARIES_SIZE + +#define INPUT_SIZE_ALIGN8 (8) + if ( width % INPUT_SIZE_ALIGN8 != 0 && height != 1 ) + { + return FALSE; + } +#undef INPUT_SIZE_ALIGN8 + + return TRUE; +} + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t right + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in0_dtype; + vsi_nn_kernel_dtype_e in1_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _bucketize_kernel_map; + size_t kernel_map_size = _cnt_of_array( _bucketize_kernel_map ); + vx_param_description_t * param_def = _bucketize_kernel_param_def; + vx_kernel_initialize_f initializer = _bucketize_initializer; + + uint32_t key; + uint32_t i; + + in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in1_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + key = BUCKETIZE_HASH_KEY( in0_dtype, in1_dtype, out_dtype, right, 1 ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _bucketize_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return 
status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_BUCKETIZE_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + int32_t right = vsi_nn_kernel_param_get_int32( params, "right" ); + + if( !vsi_nn_kernel_gpu_check_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num ) ) + { + return NULL; + } + + if ( _bucketize_support_types(graph, inputs[0], inputs[1], right) == FALSE ) + { + return NULL; + } + + status = _query_kernel( kernel, inputs, outputs, right ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _BUCKETIZE_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _BUCKETIZE_PARAM_NUM ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( bucketize, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/grucell_activation_z_h_evis.c b/src/tim/vx/internal/src/kernel/evis/grucell_activation_z_h_evis.c index 510069b..be2db5e 100644 --- a/src/tim/vx/internal/src/kernel/evis/grucell_activation_z_h_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/grucell_activation_z_h_evis.c @@ -158,7 +158,7 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_z_h_initializer) if (srcFixPointPos >= 0) output_scale *= (vx_float32)((int64_t)1 << srcFixPointPos); else if (srcFixPointPos < 0) - output_scale *= 1.0f / (vx_float32) ((int64_t)1 << -srcFixPointPos); + output_scale *= 1.0f / (vx_float32) ((int64_t)1 << - srcFixPointPos); } else if ( VSI_NN_KERNEL_QUANT_ASYMM == output_attr[0]->quant ) { diff --git a/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c b/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c index f641e10..af31e07 100644 --- a/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c @@ -47,7 +47,8 @@ __BEGIN_DECLS typedef enum { INTERNAL_KERNEL_SUMS, - INTERNAL_KERNEL_NORM, + INTERNAL_KERNEL_MEANS, + INTERNAL_KERNEL_NORMS, } _internal_kernel_e; #define KERNEL_SOURCE_0 "instance_normalization_0" @@ -61,6 +62,9 @@ typedef enum #define HASH_INSTANCENORM_SUMS_SH_KERNEL_2D_NAME(SRC0_TYPE) \ CVIVANTE_NAMESPACE("evis.instance_norm_sums_"#SRC0_TYPE"_2D") +#define HASH_INSTANCENORM_MEANS_SH_KERNEL_NAME() \ + CVIVANTE_NAMESPACE("evis.instance_norm_means") + #define HASH_INSTANCENORM_SCALE_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \ CVIVANTE_NAMESPACE("evis.instance_norm_"#SRC0_TYPE"_F32to"#DST_TYPE) @@ -68,8 +72,8 @@ typedef enum CVIVANTE_NAMESPACE("evis.instance_norm_"#SRC0_TYPE"_F32to"#DST_TYPE"_2D") // Add kernel hashtable here -#define HASH_INSTANCENORM_SUMS_KEY(_input0_type, _output_type, _reshape_flag) \ - ((_input0_type << 24) | (_output_type << 16) | (_reshape_flag << 8)) +#define HASH_INSTANCENORM_SUMS_KEY(_input0_type, _output_type, _img_2d) \ + ((_input0_type << 24) | (_output_type << 16) | (_img_2d << 8)) #define TENSOR_INSTANCENORM_SUMS_KERNELS_3D(IN0_TYPE, OUT_TYPE, SOURCE) \ { HASH_INSTANCENORM_SUMS_KEY(IN0_TYPE, OUT_TYPE, 0), \ @@ -81,6 +85,14 @@ typedef enum HASH_INSTANCENORM_SUMS_SH_KERNEL_2D_NAME(IN0_TYPE), \ SOURCE }, +#define 
HASH_INSTANCENORM_MEANS_KEY(ALPHA_TYPE, BETA_TYPE) \ + ((F32 << 24) | (ALPHA_TYPE << 16) | (BETA_TYPE << 8) | (F32)) + +#define TENSOR_INSTANCENORM_MEANS_KERNELS(ALPHA_TYPE, BETA_TYPE) \ + { HASH_INSTANCENORM_MEANS_KEY(ALPHA_TYPE, BETA_TYPE), \ + HASH_INSTANCENORM_MEANS_SH_KERNEL_NAME(), \ + KERNEL_SOURCE_0 }, + // normalization #define HASH_INSTANCENORM_KEY(_input0_type, _input1_type, _output_type, _reshape_flag) \ ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_reshape_flag << 4)) @@ -117,6 +129,13 @@ static const _kernel_map_type _instancenorm_sums_kernel_map[] = TENSOR_INSTANCENORM_SUMS_KERNELS_2D( BF16, F32, KERNEL_SOURCE_3 ) }; +static const _kernel_map_type _instancenorm_means_kernel_map[] = +{ + // Register kernel here + TENSOR_INSTANCENORM_MEANS_KERNELS( F32, F32 ) +}; + + static const _kernel_map_type _instancenorm_kernel_map[] = { // Register kernel here @@ -162,15 +181,36 @@ static vx_param_description_t _instancenorm_sums_kernel_param_def[] = }; #define _INSTANCENORM_SUMS_PARAM_NUM _cnt_of_array( _instancenorm_sums_kernel_param_def ) -static vx_param_description_t _instancenorm_kernel_param_def[] = +static vx_param_description_t _instancenorm_means_kernel_param_def[] = { {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, - {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _INSTANCENORM_MEANS_PARAM_NUM _cnt_of_array( _instancenorm_means_kernel_param_def ) +#define MEANS_EPS_SCL (4) +#define MEANS_INPUT_SCALE_SCL (5) +#define MEANS_INPUT_ZP_SCL (6) +#define MEANS_OUTPUT_SCALE_SCL (7) +#define MEANS_OUTPUT_ZP_SCL (8) +#define MEANS_INV_MULTIPLIER_SCL (9) +#define MEANS_GROUP_NUM_SCL (10) + +static vx_param_description_t _instancenorm_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, // Add kererl parameters here }; #define _INSTANCENORM_PARAM_NUM _cnt_of_array( _instancenorm_kernel_param_def ) @@ -195,7 +235,6 @@ DEF_KERNEL_INITIALIZER(_instancenorm_sums_initializer) vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL}; vsi_size_array_t * input_shape = NULL; - int32_t rs_flag = 0; int32_t width = 0; int32_t height = 0; int32_t chn = 0; @@ -212,7 +251,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_sums_initializer) attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); - status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &rs_flag); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &height); CHECK_STATUS_FAIL_GOTO(status, OnError ); input_shape = attr[0]->shape; @@ -221,12 +260,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_sums_initializer) input_zp = (float)attr[0]->zero_point; width = 
(int32_t)(input_shape->data[0]); - height = (int32_t)(input_shape->data[1]); chn = (int32_t)(attr[1]->shape->data[1]); - if (rs_flag) - { - height = height / chn; - } work_item_pixels = (float)height * 16; @@ -333,7 +367,6 @@ DEF_KERNEL_INITIALIZER(_instancenorm_sums_initializer) } status = vsi_nn_kernel_gpu_add_param(node, "width", &width); - status |= vsi_nn_kernel_gpu_add_param(node, "height", &height); CHECK_STATUS_FAIL_GOTO(status, OnError ); OnError: @@ -351,6 +384,55 @@ OnError: return status; } +DEF_KERNEL_INITIALIZER(_instancenorm_means_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 2, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL}; + vsi_size_array_t * input_shape = NULL; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); + + input_shape = attr[0]->shape; + + shaderParam.global_scale[0] = 1; + shaderParam.global_scale[1] = 1; + + shaderParam.global_size[0] = 1; + shaderParam.global_size[1] = input_shape->data[1]; + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + attr[1] = NULL; + } + + return status; +} + DEF_KERNEL_INITIALIZER(_instancenorm_initializer) ( vsi_nn_kernel_node_t node, @@ -366,52 +448,26 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) {0, 0, 0}, // localWorkSize: local group size in thread {0, 0, 0}}; // globalWorkSize: image size in thread - vsi_nn_kernel_tensor_attr_t* attr[4] = {NULL, NULL}; + vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL, NULL}; vsi_size_array_t * input_shape = NULL; - float input_scale = 1; - float output_scale = 1; - float input_zp = 0; - float output_zp = 0; - float inv_multiplier = 0; - vx_uint32 group_num = 0; - vx_int32 height = 0, width = 0, chn = 0; - int32_t rs_flag = 0; + vx_int32 width = 0, chn = 0; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); - attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); - attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); - CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError ); - attr[3] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[4] ); - CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", OnError ); - - status = 
vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &rs_flag); - CHECK_STATUS_FAIL_GOTO(status, OnError ); input_shape = attr[0]->shape; - input_scale = attr[0]->scale; - input_zp = (float)attr[0]->zero_point; - output_scale = 1.0f / attr[3]->scale; - output_zp = (float)attr[3]->zero_point; width = (int32_t)(input_shape->data[0]); - height = (int32_t)(input_shape->data[1]); - chn = (int32_t)(attr[2]->shape->data[1]); - if (rs_flag) - { - height = height / chn; - } - - inv_multiplier = (float)(1.0 / (width * height)); - - group_num = (width + 255) / 256; + chn = (int32_t)(attr[1]->shape->data[1]); shaderParam.global_scale[0] = 16; if (attr[0]->dtype == I16 || attr[0]->dtype == F16 || attr[0]->dtype == BF16) { shaderParam.global_scale[0] = 8; - group_num = (width + 127) / 128; } shaderParam.global_scale[1] = 1; @@ -521,12 +577,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) #define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE ) \ (IN0_TYPE | (OUT_TYPE << 16)) - pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[3]->dtype ); - - status = vsi_nn_kernel_gpu_add_param(node, "height", &height); - status |= vsi_nn_kernel_gpu_add_param(node, "inv_multiplier", &inv_multiplier); - status |= vsi_nn_kernel_gpu_add_param(node, "group_num", &group_num); - CHECK_STATUS_FAIL_GOTO(status, OnError ); + pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[2]->dtype ); switch( pack_key ) { @@ -535,7 +586,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) case _PACK_SELECT_KEY( U8, U8 ): case _PACK_SELECT_KEY( I8, I8 ): { - if (attr[3]->dtype == F16) + if (attr[2]->dtype == F16) { status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractHalf8_2x8); @@ -544,11 +595,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) { status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractInteger_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "output_scale", &output_scale); - status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); } - status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &input_scale); - status |= vsi_nn_kernel_gpu_add_param(node, "input_zp", &input_zp); status |= vsi_nn_kernel_gpu_add_param(node, "uniDataToFP32_0_4x4", &uniDataToFP32_0_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniDataToFP32_1_4x4", @@ -567,7 +614,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) case _PACK_SELECT_KEY( F16, U8 ): case _PACK_SELECT_KEY( F16, I8 ): { - if (attr[3]->dtype == F16) + if (attr[2]->dtype == F16) { status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractHalf8_2x8); @@ -577,14 +624,10 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractInteger_2x8); } - status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &input_scale); - status |= vsi_nn_kernel_gpu_add_param(node, "input_zp", &input_zp); status |= vsi_nn_kernel_gpu_add_param(node, "uniDataToFP32_0_4x4", &uniDataToFP32_0_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniDataToFP32_1_4x4", &uniDataToFP32_1_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "output_scale", &output_scale); - status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; @@ -612,21 +655,18 @@ OnError: vsi_nn_kernel_tensor_attr_release( &attr[0] ); attr[0] = NULL; } + if (attr[1]) { vsi_nn_kernel_tensor_attr_release( &attr[1] ); attr[1] = NULL; } + if (attr[2]) { vsi_nn_kernel_tensor_attr_release( &attr[2] ); attr[2] = NULL; } - if 
(attr[3]) - { - vsi_nn_kernel_tensor_attr_release( &attr[3] ); - attr[3] = NULL; - } return status; } @@ -637,7 +677,9 @@ OnError: static vsi_status _query_kernel ( vsi_nn_kernel_t * kernel, - const uint32_t hashkey, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool img_2d, _internal_kernel_e kernel_id /* Add extra params */ ) @@ -649,6 +691,18 @@ static vsi_status _query_kernel size_t kernel_map_size = 0; size_t param_size = 0; uint32_t i = 0; + uint32_t hashkey = 0; + vsi_nn_kernel_dtype_e in0_dtype = U8; + vsi_nn_kernel_dtype_e in1_dtype = F16; + vsi_nn_kernel_dtype_e in2_dtype = F16; + vsi_nn_kernel_dtype_e out_dtype = U8; + + in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in1_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); + in2_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + in1_dtype = in1_dtype == F16 ? F32 : in1_dtype; + in2_dtype = in2_dtype == F16 ? F32 : in2_dtype; switch ( kernel_id ) { @@ -658,13 +712,23 @@ static vsi_status _query_kernel kernel_map_size = _cnt_of_array( _instancenorm_sums_kernel_map ); param_def = _instancenorm_sums_kernel_param_def; param_size = _INSTANCENORM_SUMS_PARAM_NUM; + hashkey = HASH_INSTANCENORM_SUMS_KEY( in0_dtype, F32, img_2d ); break; - case INTERNAL_KERNEL_NORM: + case INTERNAL_KERNEL_MEANS: + initializer = _instancenorm_means_initializer; + kernel_map = _instancenorm_means_kernel_map; + kernel_map_size = _cnt_of_array( _instancenorm_means_kernel_map ); + param_def = _instancenorm_means_kernel_param_def; + param_size = _INSTANCENORM_MEANS_PARAM_NUM; + hashkey = HASH_INSTANCENORM_MEANS_KEY( in1_dtype, in2_dtype ); + break; + case INTERNAL_KERNEL_NORMS: initializer = _instancenorm_initializer; kernel_map = _instancenorm_kernel_map; kernel_map_size = _cnt_of_array( _instancenorm_kernel_map ); param_def = _instancenorm_kernel_param_def; param_size = _INSTANCENORM_PARAM_NUM; + hashkey = HASH_INSTANCENORM_KEY( in0_dtype, F32, out_dtype, img_2d ); break; default: VSI_ASSERT( FALSE ); @@ -709,23 +773,21 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_t * kernel ) { -#define INTERNAL_KERNEL_SIZE (1) -#define MEAN_VARI_INDEX (0) +#define INTERNAL_KERNEL_SIZE (2) +#define SUMS_INDEX (0) +#define MEANS_INDEX (1) vsi_status status = VSI_FAILURE; vsi_nn_kernel_node_param_t sums_node_params[_INSTANCENORM_SUMS_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_param_t means_node_params[_INSTANCENORM_MEANS_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_param_t node_params[_INSTANCENORM_PARAM_NUM] = { NULL }; - vsi_nn_kernel_node_t tmp_node = NULL; - vsi_nn_kernel_node_t node = NULL; - vsi_nn_kernel_dtype_e in0_dtype = U8; - vsi_nn_kernel_dtype_e in1_dtype = F16; - vsi_nn_kernel_dtype_e out_dtype = U8; + vsi_nn_kernel_node_t sums_node = NULL; + vsi_nn_kernel_node_t means_node = NULL; + vsi_nn_kernel_node_t norms_node = NULL; vsi_nn_tensor_attr_t attr; vsi_nn_kernel_t * ikernels[INTERNAL_KERNEL_SIZE] = { NULL }; vsi_nn_tensor_t * tensors[INTERNAL_KERNEL_SIZE] = { NULL }; vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL, rs_gamma = NULL, rs_beta = NULL; vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {0}; - uint32_t hashkeys[INTERNAL_KERNEL_SIZE] = { 0 }; - uint32_t hashkey = 0; int32_t i = 0; int32_t axis[VSI_NN_MAX_DIM_NUM] = {0, 1}; int32_t axis_num = 2; @@ -735,35 +797,47 @@ static vsi_nn_kernel_node_t _setup uint32_t rank = outputs[0]->attr.dim_num; vsi_nn_tensor_t *reshape_tensor[2] = 
{NULL}; float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); + float input_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]); + float in_time_out_scale = vsi_nn_get_tensor_scale(inputs[0]) * output_scale; + float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float inv_multiplier = 1.0f / (float)(inputs[0]->attr.size[0] * inputs[0]->attr.size[1]); + int32_t height = 0; + int32_t group_num = 0; int32_t reshape_flg = 0; vsi_size_t batch = 1; vsi_bool ret = FALSE; - ret = vsi_nn_kernel_optimize_tensor_shape( - inputs[0]->attr.size, inputs[0]->attr.dim_num, - axis, axis_num, new_shape, &rank, new_axis, &axis_size); - if ( ret == FALSE || axis_size > 2 ) - { - return NULL; - } + memcpy(new_shape, inputs[0]->attr.size, sizeof(inputs[0]->attr.size)); - for (i = 3; i < (int32_t)inputs[0]->attr.dim_num; i++) + if (new_shape[0] >= GPU_TENSOR_MAX_WIDTH || new_shape[1] >= GPU_TENSOR_MAX_WIDTH) { - batch *= inputs[0]->attr.size[i]; - } - - if (axis_size == 1) - { - for (i = rank; i > 1; i--) + ret = vsi_nn_kernel_optimize_tensor_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, + axis, axis_num, new_shape, &rank, new_axis, &axis_size); + if ( ret == FALSE || axis_size > 2 ) { - new_shape[i] = new_shape[i - 1]; + return NULL; } - new_shape[1] = 1; - rank ++; + + for (i = 3; i < (int32_t)inputs[0]->attr.dim_num; i++) + { + batch *= inputs[0]->attr.size[i]; + } + + if (axis_size == 1) + { + for (i = rank; i > 1; i--) + { + new_shape[i] = new_shape[i - 1]; + } + new_shape[1] = 1; + rank ++; + } + new_shape[2] = rank == 2 ? 1 : new_shape[2] / batch; + new_shape[3] = batch; + rank = 4; } - new_shape[2] = rank == 2 ? 1 : new_shape[2] / batch; - new_shape[3] = batch; - rank = 4; reshape_tensor[0] = vsi_nn_reshape_tensor( graph, inputs[0], new_shape, rank ); @@ -786,24 +860,7 @@ static vsi_nn_kernel_node_t _setup ikernels[i]->unique_id = kernel->unique_id; } - in0_dtype = vsi_nn_kernel_map_dtype( reshape_tensor[0]->attr.dtype.vx_type ); - in1_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); - out_dtype = vsi_nn_kernel_map_dtype( reshape_tensor[1]->attr.dtype.vx_type ); - in1_dtype = in1_dtype == F16 ? F32 : in1_dtype; - - hashkeys[MEAN_VARI_INDEX]= HASH_INSTANCENORM_SUMS_KEY( in0_dtype, F32, reshape_flg ); - hashkey = HASH_INSTANCENORM_KEY( in0_dtype, in1_dtype, out_dtype, reshape_flg ); - - status = _query_kernel( ikernels[MEAN_VARI_INDEX], hashkeys[MEAN_VARI_INDEX], INTERNAL_KERNEL_SUMS ); - if ( VSI_SUCCESS != status ) - { - goto final; - } - status = _query_kernel( kernel, hashkey, INTERNAL_KERNEL_NORM ); - if ( VSI_SUCCESS != status ) - { - goto final; - } + height = (int32_t)new_shape[1]; if (reshape_flg) { @@ -816,6 +873,8 @@ static vsi_nn_kernel_node_t _setup } else if (new_shape[0] < new_shape[1]) { + height = (int32_t)new_shape[0]; + shape[0] = new_shape[1]; shape[1] = new_shape[0]; shape[2] = new_shape[2]; @@ -835,78 +894,121 @@ static vsi_nn_kernel_node_t _setup attr.is_const = FALSE; attr.vtl = TRUE; attr.size[0] = ((shape[0] + 255) / 256) * 4; + group_num = gpu_align_np2_safe((int32_t)shape[0], 256) / 256; if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16 || inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16 || inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16) { + group_num = gpu_align_np2_safe((int32_t)shape[0], 128) / 128; attr.size[0] = ((shape[0] + 127) / 128) * 4; } attr.size[1] = inputs[0]->attr.dim_num > 2 ? 
inputs[0]->attr.size[2] : 1; attr.size[2] = 1; attr.size[3] = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1; attr.dim_num = 4; - tensors[MEAN_VARI_INDEX] = vsi_nn_CreateTensor( graph, &attr ); + tensors[SUMS_INDEX] = vsi_nn_CreateTensor( graph, &attr ); + attr.size[0] = 4; + tensors[MEANS_INDEX] = vsi_nn_CreateTensor( graph, &attr ); shape[0] = 1; shape[1] = rank > 2 ? new_shape[2] : 1; rs_beta = vsi_nn_kernel_tensor_reshape( inputs[1]->t, shape, 2 ); rs_gamma = vsi_nn_kernel_tensor_reshape( inputs[2]->t, shape, 2 ); - // Mean Vari + /* x0 = sum(x) and x1 = sum(x * x) */ + status = _query_kernel( ikernels[SUMS_INDEX], inputs, outputs, reshape_flg, INTERNAL_KERNEL_SUMS ); + if ( VSI_SUCCESS != status ) { - tmp_node = vsi_nn_kernel_create_node( graph, ikernels[MEAN_VARI_INDEX] ); - if (tmp_node) + goto final; + } + + sums_node = vsi_nn_kernel_create_node( graph, ikernels[SUMS_INDEX] ); + if (sums_node) + { + uint32_t index = 0; + + + sums_node_params[index++] = rs_input; + vsi_nn_kernel_node_pack_io( &sums_node_params[index], + _INSTANCENORM_SUMS_PARAM_NUM, NULL, 0, tensors, 1 ); + index = 2; + sums_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); + sums_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height ); + + status = vsi_nn_kernel_node_pass_param( sums_node, sums_node_params, + _INSTANCENORM_SUMS_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &sums_node_params[2] ); + vsi_nn_kernel_scalar_release( &sums_node_params[3] ); { - uint32_t index = 0; + // Set default border mode. + vx_border_t border; + border.mode = VX_BORDER_CONSTANT; - sums_node_params[index++] = rs_input; - vsi_nn_kernel_node_pack_io( &sums_node_params[index], - _INSTANCENORM_SUMS_PARAM_NUM, NULL, 0, tensors, 1 ); - index = 2; - sums_node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); - sums_node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reshape_flg ); + vsi_nn_Float32ToDtype(0, (uint8_t*)&border.constant_value.U32, &inputs[0]->attr.dtype); - status = vsi_nn_kernel_node_pass_param( tmp_node, sums_node_params, - _INSTANCENORM_SUMS_PARAM_NUM ); + status = vxSetNodeAttribute( (vx_node)sums_node, VX_NODE_BORDER, &border, sizeof(border) ); CHECK_STATUS(status); - vsi_nn_kernel_scalar_release( &sums_node_params[2] ); - vsi_nn_kernel_scalar_release( &sums_node_params[3] ); - { - // Set default border mode. 
- vx_border_t border; - border.mode = VX_BORDER_CONSTANT; - - vsi_nn_Float32ToDtype(0, (uint8_t*)&border.constant_value.U32, &inputs[0]->attr.dtype); - - status = vxSetNodeAttribute( (vx_node)tmp_node, VX_NODE_BORDER, &border, sizeof(border) ); - CHECK_STATUS(status); - } } } - // Nomalization + /* a = input_scale * output_scale * alpha * mean + b = (beta - scale * mean) * output_scale + output_zp - input * alpha */ + status = _query_kernel( ikernels[MEANS_INDEX], inputs, outputs, reshape_flg, INTERNAL_KERNEL_MEANS ); + if ( VSI_SUCCESS != status ) { - node = vsi_nn_kernel_create_node( graph, kernel ); - if (node) - { - uint32_t index = 0; - node_params[index++] = rs_input; - node_params[index++] = rs_beta; - node_params[index++] = rs_gamma; - node_params[index++] = (vsi_nn_kernel_node_param_t)tensors[MEAN_VARI_INDEX]->t; - node_params[index++] = rs_output; - node_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); - node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reshape_flg ); - - status = vsi_nn_kernel_node_pass_param( node, node_params, - _INSTANCENORM_PARAM_NUM ); - CHECK_STATUS(status); - vsi_nn_kernel_scalar_release( &node_params[5] ); - vsi_nn_kernel_scalar_release( &node_params[6] ); - } + goto final; + } + + means_node = vsi_nn_kernel_create_node( graph, ikernels[MEANS_INDEX] ); + if (means_node) + { + means_node_params[0] = tensors[SUMS_INDEX]->t; + means_node_params[1] = rs_beta; + means_node_params[2] = rs_gamma; + means_node_params[3] = tensors[MEANS_INDEX]->t; + + means_node_params[MEANS_EPS_SCL] = vsi_nn_kernel_scalar_create( graph, F32, &eps ); + means_node_params[MEANS_INPUT_SCALE_SCL] = vsi_nn_kernel_scalar_create( graph, F32, &in_time_out_scale ); + means_node_params[MEANS_INPUT_ZP_SCL] = vsi_nn_kernel_scalar_create( graph, F32, &input_zp ); + means_node_params[MEANS_OUTPUT_SCALE_SCL] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale ); + means_node_params[MEANS_OUTPUT_ZP_SCL] = vsi_nn_kernel_scalar_create( graph, F32, &output_zp ); + means_node_params[MEANS_INV_MULTIPLIER_SCL] = vsi_nn_kernel_scalar_create( graph, F32, &inv_multiplier ); + means_node_params[MEANS_GROUP_NUM_SCL] = vsi_nn_kernel_scalar_create( graph, I32, &group_num ); + + status = vsi_nn_kernel_node_pass_param( means_node, means_node_params, + _INSTANCENORM_MEANS_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &means_node_params[MEANS_EPS_SCL] ); + vsi_nn_kernel_scalar_release( &means_node_params[MEANS_INPUT_SCALE_SCL] ); + vsi_nn_kernel_scalar_release( &means_node_params[MEANS_INPUT_ZP_SCL] ); + vsi_nn_kernel_scalar_release( &means_node_params[MEANS_OUTPUT_SCALE_SCL] ); + vsi_nn_kernel_scalar_release( &means_node_params[MEANS_OUTPUT_ZP_SCL] ); + vsi_nn_kernel_scalar_release( &means_node_params[MEANS_INV_MULTIPLIER_SCL] ); + vsi_nn_kernel_scalar_release( &means_node_params[MEANS_GROUP_NUM_SCL] ); + } + + /* dst = x * a + b */ + status = _query_kernel( kernel, inputs, outputs, reshape_flg, INTERNAL_KERNEL_NORMS ); + if ( VSI_SUCCESS != status ) + { + goto final; + } + norms_node = vsi_nn_kernel_create_node( graph, kernel ); + if (norms_node) + { + uint32_t index = 0; + node_params[index++] = rs_input; + node_params[index++] = tensors[MEANS_INDEX]->t; + node_params[index++] = rs_output; + node_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &height ); + + status = vsi_nn_kernel_node_pass_param( norms_node, node_params, + _INSTANCENORM_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &node_params[3] ); } - /* Pass 
parameters to node. */ final: vsi_safe_release_tensor(reshape_tensor[0]); vsi_safe_release_tensor(reshape_tensor[1]); @@ -934,8 +1036,10 @@ final: } vsi_safe_release_tensor(tensors[i]); } - if (tmp_node) {vsi_nn_kernel_node_release( &tmp_node );} - return node; + if (sums_node) {vsi_nn_kernel_node_release( &sums_node );} + if (means_node) {vsi_nn_kernel_node_release( &means_node );} + + return norms_node; } /* _setup() */ __END_DECLS diff --git a/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c b/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c index 5825491..8157779 100644 --- a/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c @@ -121,6 +121,7 @@ static const struct { TENSOR_MATRIX_MUL_TRANSB_KERNELS(F16, U8, U8, KERNEL_SOURCE_4) TENSOR_MATRIX_MUL_TRANSB_KERNELS(U8, U8, F16, KERNEL_SOURCE_5) TENSOR_MATRIX_MUL_TRANSB_KERNELS(U8, U8, U8, KERNEL_SOURCE_5) + TENSOR_MATRIX_MUL_TRANSB_KERNELS(I16, I16, I16, KERNEL_SOURCE_13) TENSOR_MATRIX_MUL_TRANSB_KERNELS(BF16,BF16,BF16, KERNEL_SOURCE_15) TENSOR_MATRIX_MUL_TRANSA_KERNELS(U8, U8, U8, KERNEL_SOURCE_7) TENSOR_MATRIX_MUL_TRANSA_KERNELS(I8, I8, I8, KERNEL_SOURCE_7) @@ -622,11 +623,33 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer) 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniI16MulI16SumtoI32_16x1 = {{ + 0xaaaa5555, // TCfg + 0x00000000, // ASelt + 0x76543210, 0x76543210, // ABin + 0xaaaa5555, // BSelt + 0x76543210, 0x00000000, // BBin + 0x00000300, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00020001, 0x00040003, 0x00060005, 0x00080007 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniI16MulI16SumtoI32B_16x1 = {{ + 0x0002aaab, // TCfg + 0x00015554, // ASelt + 0x65432100, 0x00000007, // ABin + 0x0002aaa8, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002300, // AccumType, ConstantType, and PostShift + 0x00010000, 0x00030002, 0x00050004, 0x00070006, + 0x00000008, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + float scaleIn0divOut = src0Scale / dstScale; float scaleIn1divOut = src1Scale / dstScale; float inScaleMul = src0Scale * src1Scale; float reScaleOut = 1 / dstScale; float inScaledivOut = inScaleMul / dstScale; + float inout_beta = src0ZP * src1ZP * 8 * inScaledivOut + dstZP; uint32_t multiplierA = (M0 << 16) | M0; uint32_t multiplierB = (M1 << 16) | M1; uint32_t multiplierZpA = (src0ZP << 16) | src0ZP; @@ -647,6 +670,14 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer) uniGemmFp16U8MulZptoFp32_4x4.data[i] = multiplierZpB; uniGemmFp16I16MulZptoFp32_4x4.data[i] = multiplierZpB; } + for( i = 8; i < 12; i++) + { + uniI16MulI16SumtoI32B_16x1.data[i] = multiplierZpA; + } + for( i = 12; i < 16; i++) + { + uniI16MulI16SumtoI32_16x1.data[i] = multiplierZpB; + } switch( pack_key ) { @@ -746,6 +777,8 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer) break; case _PACK_SELECT_KEY( I16, I16, I16, 0, 0, 0 ): case _PACK_SELECT_KEY( I16, I16, I16, 0, 0, 1 ): + case _PACK_SELECT_KEY( I16, I16, I16, 0, 1, 0 ): + case _PACK_SELECT_KEY( I16, I16, I16, 0, 1, 1 ): { status = vsi_nn_kernel_gpu_add_param( node, "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8 ); @@ -753,10 +786,16 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer) "uniConvertUint8SubZpToFp32_4x4", &uniConvertUint8SubZpToFp32_4x4 ); status |= vsi_nn_kernel_gpu_add_param( node, "uniConvertUint8SubZpToFp32B_4x4", &uniConvertUint8SubZpToFp32B_4x4 ); + status |= vsi_nn_kernel_gpu_add_param( 
node, + "uniI16MulI16SumtoI32_16x1", &uniI16MulI16SumtoI32_16x1 ); + status |= vsi_nn_kernel_gpu_add_param( node, + "uniI16MulI16SumtoI32B_16x1", &uniI16MulI16SumtoI32B_16x1 ); status |= vsi_nn_kernel_gpu_add_param( node, "input0_ZP", &src0ZP ); status |= vsi_nn_kernel_gpu_add_param( node, "input1_ZP", &src1ZP ); status |= vsi_nn_kernel_gpu_add_param( node, "output_ZP", &dstZP ); status |= vsi_nn_kernel_gpu_add_param( node, "outputScale", &reScaleOut ); + status |= vsi_nn_kernel_gpu_add_param( node, "inout_scale", &inScaledivOut ); + status |= vsi_nn_kernel_gpu_add_param( node, "inout_beta", &inout_beta ); } break; case _PACK_SELECT_KEY( F16, U8, F16, 0, 0, 0 ): diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_evis.c index e92b248..2c529ce 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_evis.c @@ -43,14 +43,18 @@ __BEGIN_DECLS #define VX_KERNEL_NAME_PRE_PROCESS_NV12_SCALE_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_nv12_scale_U8toU8") #define VX_KERNEL_NAME_PRE_PROCESS_NV12_SCALE_U8TOI8 CVIVANTE_NAMESPACE("evis.pre_process_nv12_scale_U8toI8") #define VX_KERNEL_NAME_PRE_PROCESS_NV12_COPY_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_nv12_copy_U8toU8") +#define VX_KERNEL_NAME_PRE_PROCESS_NV12_COPY_U8TOI8 CVIVANTE_NAMESPACE("evis.pre_process_nv12_copy_U8toI8") +#define VX_KERNEL_NAME_PRE_PROCESS_NV12_COPY_U8TOI16 CVIVANTE_NAMESPACE("evis.pre_process_nv12_copy_U8toI16") +#define VX_KERNEL_NAME_PRE_PROCESS_NV12_COPY_U8TOF16 CVIVANTE_NAMESPACE("evis.pre_process_nv12_copy_U8toF16") // greater than a quarter #define VX_KERNEL_NAME_PRE_PROCESS_NV12_SCALE_U8TOU8_GQ CVIVANTE_NAMESPACE("evis.pre_process_nv12_scale_U8toU8_gq") +#define VX_KERNEL_NAME_PRE_PROCESS_NV12_SCALE_U8TOI8_GQ CVIVANTE_NAMESPACE("evis.pre_process_nv12_scale_U8toU8_gq") #define VX_KERNEL_NAME_PRE_PROCESS_NV12_SCALE_U8TOF16_GQ CVIVANTE_NAMESPACE("evis.pre_process_nv12_scale_U8toF16_gq") +#define VX_KERNEL_NAME_PRE_PROCESS_NV12_SCALE_U8TOI16_GQ CVIVANTE_NAMESPACE("evis.pre_process_nv12_scale_U8toI16_gq") -#define KERNEL_SOURCE_1 "pre_process_nv12_scale_8bits", +#define KERNEL_SOURCE_1 "pre_process_nv12_copy", #define KERNEL_SOURCE_2 "pre_process_nv12_scale", -#define KERNEL_SOURCE_4 "pre_process_nv12_scale_mix" typedef enum { @@ -78,13 +82,18 @@ static const struct { const char* source_name; } pre_process_nv12_map[] = { - TENSOR_PRE_PROCESS_NV12_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_1) - TENSOR_PRE_PROCESS_NV12_KERNELS(U8, I8, SCALE, KERNEL_SOURCE_1) TENSOR_PRE_PROCESS_NV12_KERNELS(U8, U8, COPY, KERNEL_SOURCE_1) + TENSOR_PRE_PROCESS_NV12_KERNELS(U8, I8, COPY, KERNEL_SOURCE_1) + TENSOR_PRE_PROCESS_NV12_KERNELS(U8, I16, COPY, KERNEL_SOURCE_1) + TENSOR_PRE_PROCESS_NV12_KERNELS(U8, F16, COPY, KERNEL_SOURCE_1) + TENSOR_PRE_PROCESS_NV12_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_NV12_KERNELS(U8, I8, SCALE, KERNEL_SOURCE_2) TENSOR_PRE_PROCESS_NV12_KERNELS(U8, F16, SCALE, KERNEL_SOURCE_2) TENSOR_PRE_PROCESS_NV12_KERNELS(U8, I16, SCALE, KERNEL_SOURCE_2) - TENSOR_PRE_PROCESS_NV12_GQ_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_4) - TENSOR_PRE_PROCESS_NV12_GQ_KERNELS(U8, F16, SCALE, KERNEL_SOURCE_4) + TENSOR_PRE_PROCESS_NV12_GQ_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_NV12_GQ_KERNELS(U8, F16, SCALE, KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_NV12_GQ_KERNELS(U8, I8, SCALE, KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_NV12_GQ_KERNELS(U8, I16, SCALE, KERNEL_SOURCE_2) }; 
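+/*
+ * Selection note, as read from the table above: each entry pairs a U8 input
+ * with one of the U8/I8/I16/F16 outputs and a COPY or SCALE convert type, and
+ * maps it to the shader name plus the .vx source that provides it.  The *_GQ
+ * scale variants are the "greater than a quarter" kernels chosen when the
+ * resize ratio is at least 0.25 (see the resize >= 0.25 branch below).
+ */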
static vx_param_description_t vxPreProcessNv12Kernel_param_def[] = @@ -120,8 +129,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer) {0, 0, 0}, // localWorkSize: local group size in thread {0, 0, 0}}; // globalWorkSize: image size in thread - int32_t dstZP = 0; - float dstScale = 1; + float output_zp = 0; + float output_scale = 1; int32_t reorder = 0; int32_t order1 = 2; uint32_t width = 0; @@ -148,6 +157,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); out_shape = attr[0]->shape; + output_scale = 1.0f / attr[0]->scale; + output_zp = (float)attr[0]->zero_point; width = (uint32_t)(out_shape->data[0]); height = (uint32_t)(out_shape->data[1]); @@ -157,33 +168,10 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer) order1 = 0; } - if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - dstScale = 1.0f / attr[0]->asymm.scale; - dstZP = attr[0]->asymm.zero_point; - } - else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) - { - if (attr[0]->dfp.fl > 0) - { - dstScale = (vx_float32)((int64_t)1 << attr[0]->dfp.fl); - } - else - { - dstScale = (1.0f / (vx_float32)((int64_t)1 << -attr[0]->dfp.fl)); - } - dstZP = 0; - } - else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE) - { - dstScale = 1; - dstZP = 0; - } - - outputScaleVar = dstScale * var; - bMeanScaleVarZp = (float)dstZP - bMean * outputScaleVar; - gMeanScaleVarZp = (float)dstZP - gMean * outputScaleVar; - rMeanScaleVarZp = (float)dstZP - rMean * outputScaleVar; + outputScaleVar = output_scale * var; + bMeanScaleVarZp = output_zp - bMean * outputScaleVar; + gMeanScaleVarZp = output_zp - gMean * outputScaleVar; + rMeanScaleVarZp = output_zp - rMean * outputScaleVar; shaderParam.global_scale[0] = 4; shaderParam.global_scale[1] = 1; @@ -249,18 +237,46 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer) 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertHalftoFp16_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; status = vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toB_4x4", &uniConvertNV12toB_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toG_4x4", &uniConvertNV12toG_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertNV12toR_4x4", &uniConvertNV12toR_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder); status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8); status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractUVtoCharSub128_2x8", &uniExtractUVtoCharSub128_2x8); status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar", &outputScaleVar); status |= vsi_nn_kernel_gpu_add_param(node, "bMeanScaleVarZp", &bMeanScaleVarZp); status |= vsi_nn_kernel_gpu_add_param(node, "gMeanScaleVarZp", &gMeanScaleVarZp); status |= vsi_nn_kernel_gpu_add_param(node, "rMeanScaleVarZp", &rMeanScaleVarZp); + switch( attr[0]->dtype ) + { + case U8: + case I8: + case I16: + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniConvertInt32toUint8_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case F16: + { + 
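+            /*
+             * Both arms of this switch bind the same shader symbol,
+             * uniExtract8Data_2x8: the integer cases above reuse
+             * uniConvertInt32toUint8_2x8, while this F16 case supplies the
+             * half-precision variant just below, so one copy program can
+             * presumably serve every supported output type.
+             */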
status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniConvertHalftoFp16_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + default: + break; + } CHECK_STATUS_FAIL_GOTO(status, OnError ); } @@ -288,8 +304,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer) {0, 0, 0}, // localWorkSize: local group size in thread {0, 0, 0}}; // globalWorkSize: image size in thread - int32_t dstZP = 0; - float dstScale = 1; + float output_zp = 0; + float output_scale = 1; int32_t reorder = 0; int32_t order1 = 2; uint32_t width = 0; @@ -330,8 +346,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); out_shape = attr[1]->shape; - dstZP = attr[1]->asymm.zero_point; - dstScale = attr[1]->asymm.scale; + output_scale = 1.0f / attr[1]->scale; + output_zp = (float)attr[1]->zero_point; width = (uint32_t)(out_shape->data[0]); height = (uint32_t)(out_shape->data[1]); @@ -347,32 +363,10 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer) xrIntFloat_16 = (uint32_t)((roi_width << 16) / width + 1); yrIntFloat_16 = (uint32_t)((roi_height << 16) / height + 1); - if (attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - dstScale = 1.0f / dstScale; - } - else if (attr[1]->quant == VSI_NN_KERNEL_QUANT_DFP) - { - if (attr[1]->dfp.fl > 0) - { - dstScale = (vx_float32)((int64_t)1 << attr[1]->dfp.fl); - } - else - { - dstScale = (1.0f / (vx_float32)((int64_t)1 << -attr[1]->dfp.fl)); - } - dstZP = 0; - } - else if (attr[1]->quant == VSI_NN_KERNEL_QUANT_NONE) - { - dstScale = 1; - dstZP = 0; - } - - outputScaleVar = dstScale * var; - bMeanScaleVarZp = (float)dstZP - bMean * outputScaleVar; - gMeanScaleVarZp = (float)dstZP - gMean * outputScaleVar; - rMeanScaleVarZp = (float)dstZP - rMean * outputScaleVar; + outputScaleVar = output_scale * var; + bMeanScaleVarZp = output_zp - bMean * outputScaleVar; + gMeanScaleVarZp = output_zp - gMean * outputScaleVar; + rMeanScaleVarZp = output_zp - rMean * outputScaleVar; shaderParam.global_scale[0] = 4; shaderParam.global_scale[1] = 1; @@ -482,7 +476,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "gMeanScaleVarZp", &gMeanScaleVarZp); status |= vsi_nn_kernel_gpu_add_param(node, "rMeanScaleVarZp", &rMeanScaleVarZp); - if (resize >= 0.25 && (attr[1]->dtype == U8 || attr[1]->dtype == F16)) + if (resize >= 0.25) { status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateYShift_2x8", &uniCalculateYShift_2x8); status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateUVShift_2x8", &uniCalculateUVShift_2x8); @@ -499,13 +493,13 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer) case I8: case I16: { - status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8); + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniConvertInt32toUint8_2x8); CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; case F16: { - status = vsi_nn_kernel_gpu_add_param(node, "uniConvertHalftoFp16_2x8", &uniConvertHalftoFp16_2x8); + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniConvertHalftoFp16_2x8); CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; @@ -551,7 +545,7 @@ static vsi_status _query_kernel input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - if (enable_copy && output_dtype == U8) + if (enable_copy) { convert_type = COPY; } @@ -560,16 +554,16 @@ static vsi_status _query_kernel convert_type = SCALE; } - if 
(scaleVal >= 0.25 && (output_dtype == U8 || output_dtype == F16) && convert_type == SCALE) + if (scaleVal >= 0.25 && convert_type == SCALE) { optFlg = 1; } key = HASH_PRE_PROCESS_NV12_KEY( input0_dtype, output_dtype, convert_type, optFlg ); - for( i = 0; i < _cnt_of_array(pre_process_nv12_map); i ++ ) + for ( i = 0; i < _cnt_of_array(pre_process_nv12_map); i ++ ) { - if( pre_process_nv12_map[i].key == key ) + if ( pre_process_nv12_map[i].key == key ) { break; } @@ -580,7 +574,7 @@ static vsi_status _query_kernel kernel->info.parameters = vxPreProcessNv12Kernel_param_def; kernel->info.numParams = _cnt_of_array( vxPreProcessNv12Kernel_param_def ); - if(convert_type == COPY) + if (convert_type == COPY) { kernel->info.initialize = _pre_process_nv12_copy_initializer; } @@ -666,10 +660,8 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &tmp_params[12] ); } } - if(reshape_tensors[0]) - { - vsi_nn_ReleaseTensor(&reshape_tensors[0]); - } + vsi_safe_release_tensor(reshape_tensors[0]); + return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv420_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv420_evis.c index a51eab1..8e5f779 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv420_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv420_evis.c @@ -43,13 +43,13 @@ __BEGIN_DECLS #define VX_KERNEL_NAME_PRE_PROCESS_YUV420_SCALE_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_scale_U8toU8") #define VX_KERNEL_NAME_PRE_PROCESS_YUV420_SCALE_U8TOI8 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_scale_U8toI8") #define VX_KERNEL_NAME_PRE_PROCESS_YUV420_COPY_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_copy_U8toU8") +#define VX_KERNEL_NAME_PRE_PROCESS_YUV420_COPY_U8TOI8 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_copy_U8toI8") +#define VX_KERNEL_NAME_PRE_PROCESS_YUV420_COPY_U8TOI16 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_copy_U8toI16") #define VX_KERNEL_NAME_PRE_PROCESS_YUV420_COPY_U8TOF16 CVIVANTE_NAMESPACE("evis.pre_process_yuv420_copy_U8toF16") -#define KERNEL_SOURCE_1 "pre_process_yuv420_scale_u8", -#define KERNEL_SOURCE_2 "pre_process_yuv420_copy_u8", -#define KERNEL_SOURCE_3 "pre_process_yuv420_scale_fp16", -#define KERNEL_SOURCE_4 "pre_process_yuv420_scale_i16", -#define KERNEL_SOURCE_5 "pre_process_yuv420_scale_i8", +#define KERNEL_SOURCE_0 "pre_process_yuv420_copy", +#define KERNEL_SOURCE_1 "pre_process_yuv420_scale_0", +#define KERNEL_SOURCE_2 "pre_process_yuv420_scale_1", typedef enum { @@ -73,12 +73,14 @@ static const struct { const char* source_name; } pre_process_yuv420_map[] = { - TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, F16, SCALE, KERNEL_SOURCE_3) - TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, I16, SCALE, KERNEL_SOURCE_4) + TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, F16, SCALE, KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, I16, SCALE, KERNEL_SOURCE_2) TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_1) - TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, I8, SCALE, KERNEL_SOURCE_5) - TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, U8, COPY, KERNEL_SOURCE_2) - TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, F16, COPY, KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, I8, SCALE, KERNEL_SOURCE_1) + TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, U8, COPY, KERNEL_SOURCE_0) + TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, F16, COPY, KERNEL_SOURCE_0) + TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, I8, COPY, KERNEL_SOURCE_0) + TENSOR_PRE_PROCESS_YUV420_KERNELS(U8, I16, COPY, KERNEL_SOURCE_0) }; static vx_param_description_t 
vxPreProcessYuv420Kernel_param_def[] = @@ -115,13 +117,13 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_copy_initializer) {0, 0, 0}, // localWorkSize: local group size in thread {0, 0, 0}}; // globalWorkSize: image size in thread - int32_t dstZP = 0; - float dstScale = 1; - int32_t reorder = 0; - int32_t trans = 0; - int32_t order1 = 2; - uint32_t width = 0; - uint32_t height = 0; + float output_zp = 0; + float output_scale = 1; + int32_t reorder = 0; + int32_t trans = 0; + int32_t order1 = 2; + uint32_t width = 0; + uint32_t height = 0; vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; vsi_size_array_t * out_shape = NULL; @@ -149,23 +151,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_copy_initializer) width = width / 3; } - if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - dstScale = 1.0f / attr[0]->asymm.scale; - dstZP = attr[0]->asymm.zero_point; - } - else if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) - { - if (attr[0]->dfp.fl > 0) - { - dstScale = (float)((int64_t)1 << attr[0]->dfp.fl); - } - else - { - dstScale = (1.0f / (float)((int64_t)1 << -attr[0]->dfp.fl)); - } - dstZP = 0; - } + output_scale = 1.0f / attr[0]->scale; + output_zp = (float)attr[0]->zero_point; shaderParam.global_scale[0] = 16; shaderParam.global_scale[1] = 1; @@ -426,8 +413,10 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_copy_initializer) }, GPU_DP_TYPE_16 }; switch( attr[0]->dtype ) { + case I8: case U8: case F16: + case I16: { // R status |= vsi_nn_kernel_gpu_add_param(node, "uniCalculateTmpR1st_4x4", &uniCalculateTmpR1st_4x4); @@ -461,8 +450,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_copy_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "uniQuantU8toU8LoR_2x8", &uniQuantU8toU8LoR_2x8); status |= vsi_nn_kernel_gpu_add_param(node, "uniQuantU8toU8HiR_2x8", &uniQuantU8toU8HiR_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "zp", &dstZP); - status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &dstScale); + status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); + status |= vsi_nn_kernel_gpu_add_param(node, "output_scale", &output_scale); status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder); status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1); CHECK_STATUS_FAIL_GOTO(status, OnError ); @@ -497,8 +486,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_initializer) {0, 0, 0}, // localWorkSize: local group size in thread {0, 0, 0}}; // globalWorkSize: image size in thread - int32_t dstZP = 0; - float dstScale = 1; + float output_zp = 0; + float output_scale = 1; int32_t reorder = 0; int32_t order1 = 2; uint32_t width = 0; @@ -513,11 +502,11 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_initializer) status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[12], &reorder); CHECK_STATUS_FAIL_GOTO(status, OnError ); - out_shape = attr[0]->shape; - dstZP = attr[0]->asymm.zero_point; - dstScale = attr[0]->asymm.scale; - width = (uint32_t)(out_shape->data[0]); - height = (uint32_t)(out_shape->data[1]); + out_shape = attr[0]->shape; + output_zp = (float)attr[0]->zero_point; + output_scale = 1.0f / attr[0]->scale; + width = (uint32_t)(out_shape->data[0]); + height = (uint32_t)(out_shape->data[1]); if (reorder != 0) { @@ -525,28 +514,6 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_initializer) order1 = 0; } - if (attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP) - { - if (attr[0]->dfp.fl > 0) - { - dstScale = (vx_float32)((int64_t)1 << attr[0]->dfp.fl); - } - else - { - dstScale = (1.0f / (vx_float32)((int64_t)1 << -attr[0]->dfp.fl)); - } - dstZP = 0; - } - else 
if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) - { - dstScale = 1.0f / dstScale; - } - else if ( attr[0]->quant == VSI_NN_KERNEL_QUANT_NONE ) - { - dstScale = 1; - dstZP = 0; - } - shaderParam.global_scale[0] = 4; shaderParam.global_scale[1] = 1; shaderParam.global_scale[2] = 1; @@ -822,24 +789,20 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_initializer) switch( attr[0]->dtype ) { case U8: - { - status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &dstScale); - status |= vsi_nn_kernel_gpu_add_param(node, "zp", &dstZP); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - break; + case F16: case I8: case I16: { - status = vsi_nn_kernel_gpu_add_param(node, "uniConvertInt32toUint8_2x8", &uniConvertInt32toUint8_2x8); - status |= vsi_nn_kernel_gpu_add_param(node, "outputScale", &dstScale); - CHECK_STATUS_FAIL_GOTO(status, OnError ); - } - break; - case F16: - { - status = vsi_nn_kernel_gpu_add_param(node, "uniConvertHalftoFp16_2x8", &uniConvertHalftoFp16_2x8); + if (attr[0]->dtype == F16) + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniConvertHalftoFp16_2x8); + } + else + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniConvertInt32toUint8_2x8); + } + status |= vsi_nn_kernel_gpu_add_param(node, "output_scale", &output_scale); + status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); CHECK_STATUS_FAIL_GOTO(status, OnError ); } break; @@ -876,12 +839,14 @@ static vsi_status _query_kernel input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - if (enable_copy && (output_dtype == U8 || output_dtype == F16)) + if (enable_copy && (output_dtype == I8 || output_dtype == U8 || output_dtype == F16 || output_dtype == I16)) { convert_type = COPY; + enable_copy = TRUE; } else { + enable_copy = FALSE; convert_type = SCALE; } @@ -900,7 +865,7 @@ static vsi_status _query_kernel kernel->info.parameters = vxPreProcessYuv420Kernel_param_def; kernel->info.numParams = _cnt_of_array( vxPreProcessYuv420Kernel_param_def ); - if (enable_copy && (output_dtype == U8 || output_dtype == F16)) + if (enable_copy) { kernel->info.initialize = _pre_process_yuv420_copy_initializer; } diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv422_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv422_evis.c new file mode 100644 index 0000000..ca76dfe --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv422_evis.c @@ -0,0 +1,623 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_eltwise.h" + +__BEGIN_DECLS + +#define VX_KERNEL_NAME_PRE_PROCESS_YUV422_SCALE_U8TOF16 CVIVANTE_NAMESPACE("evis.pre_process_yuv422_scale_U8toF16") +#define VX_KERNEL_NAME_PRE_PROCESS_YUV422_SCALE_U8TOI16 CVIVANTE_NAMESPACE("evis.pre_process_yuv422_scale_U8toI16") +#define VX_KERNEL_NAME_PRE_PROCESS_YUV422_SCALE_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv422_scale_U8toU8") +#define VX_KERNEL_NAME_PRE_PROCESS_YUV422_SCALE_U8TOI8 CVIVANTE_NAMESPACE("evis.pre_process_yuv422_scale_U8toI8") +#define VX_KERNEL_NAME_PRE_PROCESS_YUV422_COPY_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_yuv422_copy_U8toU8") +#define VX_KERNEL_NAME_PRE_PROCESS_YUV422_COPY_U8TOI8 CVIVANTE_NAMESPACE("evis.pre_process_yuv422_copy_U8toI8") +#define VX_KERNEL_NAME_PRE_PROCESS_YUV422_COPY_U8TOI16 CVIVANTE_NAMESPACE("evis.pre_process_yuv422_copy_U8toI16") +#define VX_KERNEL_NAME_PRE_PROCESS_YUV422_COPY_U8TOF16 CVIVANTE_NAMESPACE("evis.pre_process_yuv422_copy_U8toF16") + +#define KERNEL_SOURCE_1 "pre_process_yuv422_copy", +#define KERNEL_SOURCE_2 "pre_process_yuv422_scale", + +typedef enum +{ + COPY = 0, + SCALE, + TRANS +} vsi_nn_kernel_convert_type_e; + + +// Add kernel hashtable here +#define HASH_PRE_PROCESS_YUV422_KEY(_input0_type, _output_type, _convert_type) \ + ((_input0_type << 24) | (_output_type << 16) | (_convert_type << 8)) + +#define TENSOR_PRE_PROCESS_YUV422_KERNELS(IN0_TYPE, OUT_TYPE, CONVERT_TYPE, SOURCE) \ + { HASH_PRE_PROCESS_YUV422_KEY(IN0_TYPE, OUT_TYPE, CONVERT_TYPE), \ + VX_KERNEL_NAME_PRE_PROCESS_YUV422_##CONVERT_TYPE##_##IN0_TYPE##TO##OUT_TYPE, \ + SOURCE }, + +static const struct { + uint32_t key; + char* function_name; + const char* source_name; + } pre_process_yuv422_map[] = +{ + TENSOR_PRE_PROCESS_YUV422_KERNELS(U8, U8, COPY, KERNEL_SOURCE_1) + TENSOR_PRE_PROCESS_YUV422_KERNELS(U8, I8, COPY, KERNEL_SOURCE_1) + TENSOR_PRE_PROCESS_YUV422_KERNELS(U8, I16, COPY, KERNEL_SOURCE_1) + TENSOR_PRE_PROCESS_YUV422_KERNELS(U8, F16, COPY, KERNEL_SOURCE_1) + TENSOR_PRE_PROCESS_YUV422_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_YUV422_KERNELS(U8, I8, SCALE, KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_YUV422_KERNELS(U8, F16, SCALE, KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_YUV422_KERNELS(U8, I16, SCALE, KERNEL_SOURCE_2) +}; + +static vx_param_description_t vxPreProcessyuv422Kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, 
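For reference, HASH_PRE_PROCESS_YUV422_KEY above packs the input dtype, output dtype and convert type into one 32-bit selector, and _query_kernel later scans pre_process_yuv422_map for a matching entry. A minimal sketch of that lookup (the enum values are stand-ins, not the real vsi_nn_kernel_dtype_e codes):

    #include <stdint.h>
    #include <stddef.h>

    /* Stand-in codes; the real values come from vsi_nn_kernel_dtype_e and the
     * convert-type enum above.                                                 */
    enum { EX_U8 = 2, EX_F16 = 5, EX_COPY = 0 };

    static uint32_t yuv422_key(uint32_t in_dtype, uint32_t out_dtype, uint32_t cvt)
    {
        return (in_dtype << 24) | (out_dtype << 16) | (cvt << 8);
    }

    static int find_kernel(const uint32_t *keys, size_t count, uint32_t key)
    {
        size_t i;
        for (i = 0; i < count; i++)
        {
            if (keys[i] == key)
            {
                return (int)i;     /* index into pre_process_yuv422_map         */
            }
        }
        return -1;                 /* no entry: _query_kernel reports failure   */
    }

    /* e.g. yuv422_key(EX_U8, EX_F16, EX_COPY) selects the U8-to-F16 copy kernel. */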
VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _EVIS_PRE_PROCESS_YUV422_PARAM_NUM _cnt_of_array(vxPreProcessyuv422Kernel_param_def) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_pre_process_yuv422_copy_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + float output_zp = 0; + float output_scale = 1; + int32_t reorder = 0; + int32_t order1 = 2; + uint32_t width = 0; + uint32_t height = 0; + float bMean = 0.0f, gMean= 0.0f, rMean = 0.0f, var = 0.0f; + float outputScaleVar = 0.0f; + float bMeanScaleVarZp = 0.0f, gMeanScaleVarZp = 0.0f, rMeanScaleVarZp = 0.0f; + + vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; + vsi_size_array_t * out_shape = NULL; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[6], &rMean); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[7], &gMean); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[8], &bMean); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[9], &var); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &reorder); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + out_shape = attr[0]->shape; + output_scale = 1.0f / attr[0]->scale; + output_zp = (float)attr[0]->zero_point; + width = (uint32_t)(out_shape->data[0]); + height = (uint32_t)(out_shape->data[1]); + + if (reorder != 0) + { + reorder = 2; + order1 = 0; + } + + outputScaleVar = output_scale * var; + bMeanScaleVarZp = output_zp - bMean * outputScaleVar; + gMeanScaleVarZp = output_zp - gMean * outputScaleVar; + rMeanScaleVarZp = output_zp - rMean * outputScaleVar; + + shaderParam.global_scale[0] = 4; + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0], 4); + shaderParam.global_size[1] = gpu_align_p2((height + shaderParam.global_scale[1] - 1) + / shaderParam.global_scale[1], 1); + shaderParam.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + { + gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x00003333, // TCfg + 0x00000000, // ASelt + 0x03020100, 0x00000000, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 
0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniConvertYUV422toB_4x4 = {{ + 0x05050505, // TCfg + 0x00000000, // ASelt + 0x00120010, 0x00560054, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3f1d3c00, 0x00000000, 0x3f1d3c00, 0x00000000, + 0x3f1d3c00, 0x00000000, 0x3f1d3c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertYUV422toG_4x4 = {{ + 0x29292929, // TCfg + 0x00000000, // ASelt + 0x03120310, 0x07560754, // ABin + 0x2a2a2a2a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x35873c00, 0x000039bc, 0x35873c00, 0x000039bc, + 0x35873c00, 0x000039bc, 0x35873c00, 0x000039bc // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertYUV422toR_4x4 = {{ + 0x05050505, // TCfg + 0x00000000, // ASelt + 0x00320030, 0x00760074, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000, + 0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractUVtoCharSub128_2x8 = {{ + 0x91919191, // TCfg + 0x40404040, // ASelt + 0x03020100, 0x07060504, // ABin + 0xa2a2a2a2, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000700, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00010001, 0x00000001, 0x00010001, + 0x00000001, 0x00010001, 0x00000001, 0x00010001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertHalftoFp16_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param(node, "uniConvertYUV422toB_4x4", &uniConvertYUV422toB_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertYUV422toG_4x4", &uniConvertYUV422toG_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertYUV422toR_4x4", &uniConvertYUV422toR_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder); + status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractUVtoCharSub128_2x8", &uniExtractUVtoCharSub128_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar", &outputScaleVar); + status |= vsi_nn_kernel_gpu_add_param(node, "bMeanScaleVarZp", &bMeanScaleVarZp); + status |= vsi_nn_kernel_gpu_add_param(node, "gMeanScaleVarZp", &gMeanScaleVarZp); + status |= vsi_nn_kernel_gpu_add_param(node, "rMeanScaleVarZp", &rMeanScaleVarZp); + switch( attr[0]->dtype ) + { + case U8: + case I8: + case I16: + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniConvertInt32toUint8_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case F16: + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniConvertHalftoFp16_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + default: + break; + } + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + return status; +} /* _pre_process_yuv422_copy_initializer() */ + + 
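The copy initializer above processes four output pixels per x-thread (global_scale = {4, 1, 1}) and rounds the dispatch up with gpu_align_p2. A worked example of that sizing, assuming gpu_align_p2 rounds its first argument up to the next multiple of the (power-of-two) alignment:

    #include <stdint.h>

    /* Assumed behaviour of gpu_align_p2: round val up to a multiple of align. */
    static uint32_t align_p2(uint32_t val, uint32_t align)
    {
        return (val + align - 1) & ~(align - 1);
    }

    /* width = 299, height = 299, global_scale = {4, 1}:
     *   global_size[0] = align_p2((299 + 3) / 4, 4) = align_p2(75, 4) = 76
     *   global_size[1] = align_p2(299 / 1, 1)       = 299
     * so each x-thread covers output columns [4*x, 4*x + 3] of one row.       */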
+DEF_KERNEL_INITIALIZER(_pre_process_yuv422_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + float output_zp = 0; + float output_scale = 1; + int32_t reorder = 0; + int32_t order1 = 2; + uint32_t width = 0; + uint32_t height = 0; + uint32_t roi_width = 0; + uint32_t roi_height = 0; + uint32_t xrIntFloat_16 = 0; + uint32_t yrIntFloat_16 = 0; + int32_t xRatio = 0; + int32_t yRatio = 0; + float bMean = 0.0f, gMean= 0.0f, rMean = 0.0f, var = 0.0f; + float outputScaleVar = 0.0f; + float bMeanScaleVarZp = 0.0f, gMeanScaleVarZp = 0.0f, rMeanScaleVarZp = 0.0f; + + vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; + vsi_size_array_t * out_shape = NULL; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &xRatio); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &yRatio); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[6], &rMean); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[7], &gMean); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[8], &bMean); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[9], &var); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &reorder); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + out_shape = attr[0]->shape; + output_scale = 1.0f / attr[0]->scale; + output_zp = (float)attr[0]->zero_point; + width = (uint32_t)(out_shape->data[0]); + height = (uint32_t)(out_shape->data[1]); + + if (reorder != 0) + { + reorder = 2; + order1 = 0; + } + + roi_width = (xRatio * width) >> 15; + roi_height = (yRatio * height) >> 15; + xrIntFloat_16 = (uint32_t)((roi_width << 16) / width + 1); + yrIntFloat_16 = (uint32_t)((roi_height << 16) / height + 1); + + outputScaleVar = output_scale * var; + bMeanScaleVarZp = output_zp - bMean * outputScaleVar; + gMeanScaleVarZp = output_zp - gMean * outputScaleVar; + rMeanScaleVarZp = output_zp - rMean * outputScaleVar; + + shaderParam.global_scale[0] = 4; + shaderParam.global_scale[1] = 1; + shaderParam.global_scale[2] = 1; + shaderParam.global_size[0] = gpu_align_p2((width + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0], 4); + shaderParam.global_size[1] = gpu_align_p2((height + shaderParam.global_scale[1] - 1) + / shaderParam.global_scale[1], 1); + shaderParam.global_size[2] = 1; + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + { + gpu_dp_inst_t uniConvertInt32toUint8_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // 
AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + + gpu_dp_inst_t uniConvertYUV422toB_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00110000, 0x00330022, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3f323c00, 0x00000000, 0x3f323c00, 0x00000000, + 0x3f323c00, 0x00000000, 0x3f323c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertYUV422toG_4x4 = {{ + 0x29292929, // TCfg + 0x14141414, // ASelt + 0x05110400, 0x07330622, // ABin + 0x2a2a2a2a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x35873c00, 0x000039bc, 0x35873c00, 0x000039bc, + 0x35873c00, 0x000039bc, 0x35873c00, 0x000039bc // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertYUV422toR_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00510040, 0x00730062, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000, + 0x3da03c00, 0x00000000, 0x3da03c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniExtractUVtoCharSub128_2x8 = {{ + 0x99999999, // TCfg + 0x44444444, // ASelt + 0x03020100, 0x07060504, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000400, // AccumType, ConstantType, and PostShift + 0x00010001, 0x00010001, 0x00010001, 0x00010001, + 0x00010001, 0x00010001, 0x00010001, 0x00010001 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniConvertHalftoFp16_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000600, // AccumType, ConstantType, and PostShift + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x00000001, 0x00000001, 0x00000001, 0x00000001 // Constant + }, GPU_DP_TYPE_16 }; + + status = vsi_nn_kernel_gpu_add_param(node, "uniConvertYUV422toB_4x4", &uniConvertYUV422toB_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertYUV422toG_4x4", &uniConvertYUV422toG_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertYUV422toR_4x4", &uniConvertYUV422toR_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "xrIntFloat_16", &xrIntFloat_16); + status |= vsi_nn_kernel_gpu_add_param(node, "yrIntFloat_16", &yrIntFloat_16); + status |= vsi_nn_kernel_gpu_add_param(node, "outputScaleVar", &outputScaleVar); + status |= vsi_nn_kernel_gpu_add_param(node, "bMeanScaleVarZp", &bMeanScaleVarZp); + status |= vsi_nn_kernel_gpu_add_param(node, "gMeanScaleVarZp", &gMeanScaleVarZp); + status |= vsi_nn_kernel_gpu_add_param(node, "rMeanScaleVarZp", &rMeanScaleVarZp); + status |= vsi_nn_kernel_gpu_add_param(node, "uniExtractUVtoCharSub128_2x8", &uniExtractUVtoCharSub128_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "rOrder", &reorder); + status |= vsi_nn_kernel_gpu_add_param(node, "bOrder", &order1); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + + switch( attr[0]->dtype ) + { + case U8: + case I8: + case I16: + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniConvertInt32toUint8_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + case F16: + { + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniConvertHalftoFp16_2x8); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + break; + default: + break; + } + } + +OnError: + 
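The scale-path initializer above converts the Q15 crop ratios into 16.16 fixed-point steps (xrIntFloat_16 / yrIntFloat_16), one source increment per output pixel. A small worked example of that arithmetic; how the shader consumes the step (a >> 16 per destination coordinate) is an assumption, not copied from the .vx source:

    #include <stdint.h>

    /* xRatio / yRatio arrive as Q15 ratios via the scalar parameters packed in _setup. */
    static uint32_t fixed_step_q16(int32_t ratio_q15, uint32_t dst_extent)
    {
        uint32_t roi_extent = (ratio_q15 * dst_extent) >> 15;
        return (roi_extent << 16) / dst_extent + 1;   /* 16.16 step per output pixel */
    }

    /* Example: dst width 224, xRatio = 16384 (0.5 in Q15)
     *   roi_width     = (16384 * 224) >> 15     = 112
     *   xrIntFloat_16 = (112 << 16) / 224 + 1   = 32769  (about 0.5 in 16.16)
     * so a source column would be roughly (dst_x * 32769) >> 16 plus the crop offset. */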
if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + return status; +} /* _pre_process_yuv422_initializer() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_tensor_t* const* const inputs, + vsi_nn_tensor_t* const* const outputs, + vsi_nn_kernel_t* kernel, + const vsi_nn_kernel_param_t * params, + int32_t scale_x + ) +{ + vsi_nn_kernel_dtype_e input0_dtype = U8; + vsi_nn_kernel_dtype_e output_dtype = U8; + vsi_nn_kernel_convert_type_e convert_type = SCALE; + vsi_status status = VSI_FAILURE; + uint32_t key = 0; + int i = 0; + vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" ); + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (enable_copy) + { + convert_type = COPY; + } + else + { + convert_type = SCALE; + } + + key = HASH_PRE_PROCESS_YUV422_KEY( input0_dtype, output_dtype, convert_type ); + + for ( i = 0; i < _cnt_of_array(pre_process_yuv422_map); i ++ ) + { + if ( pre_process_yuv422_map[i].key == key ) + { + break; + } + } + if ( i < _cnt_of_array(pre_process_yuv422_map) ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", pre_process_yuv422_map[i].function_name ); + kernel->info.parameters = vxPreProcessyuv422Kernel_param_def; + kernel->info.numParams = _cnt_of_array( vxPreProcessyuv422Kernel_param_def ); + + if (convert_type == COPY) + { + kernel->info.initialize = _pre_process_yuv422_copy_initializer; + } + else + { + kernel->info.initialize = _pre_process_yuv422_initializer; + } + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + pre_process_yuv422_map[i].source_name ); + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + pre_process_yuv422_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t tmp_params[_EVIS_PRE_PROCESS_YUV422_PARAM_NUM] = { NULL }; + vsi_nn_kernel_node_t node = NULL; + vsi_nn_tensor_t* reshape_tensors[1] = {NULL}; + int32_t trans = 0; + int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" ); + + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, + outputs[0]->attr.dim_num ) ) + { + return NULL; + } + + status = _query_kernel( inputs, outputs, kernel, params, scale_x ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + uint32_t index = 2; + int32_t scale_y = vsi_nn_kernel_param_get_int32( params, "scale_y" ); + int32_t left = vsi_nn_kernel_param_get_int32( params, "left" ); + int32_t top = vsi_nn_kernel_param_get_int32( params, "top" ); + float r_mean = vsi_nn_kernel_param_get_float32( params, "r_mean" ); + float g_mean = vsi_nn_kernel_param_get_float32( params, "g_mean" ); + float b_mean = vsi_nn_kernel_param_get_float32( params, "b_mean" ); + float rgb_scale = vsi_nn_kernel_param_get_float32( params, "rgb_scale" ); + int32_t reverse = vsi_nn_kernel_param_get_int32( params, "reverse" ); + int32_t yuv422_type = vsi_nn_kernel_param_get_int32( params, "yuv422_type" ); + + /* Pass parameters to node. 
*/ + vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_YUV422_PARAM_NUM, + inputs, 1, outputs, 1 ); + + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_x ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &scale_y ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &left ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &top ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &r_mean ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &g_mean ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &b_mean ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, F32, &rgb_scale ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &reverse ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &trans ); + tmp_params[index++] = vsi_nn_kernel_scalar_create( graph, I32, &yuv422_type ); + status = vsi_nn_kernel_node_pass_param( node, tmp_params, _EVIS_PRE_PROCESS_YUV422_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &tmp_params[2] ); + vsi_nn_kernel_scalar_release( &tmp_params[3] ); + vsi_nn_kernel_scalar_release( &tmp_params[4] ); + vsi_nn_kernel_scalar_release( &tmp_params[5] ); + vsi_nn_kernel_scalar_release( &tmp_params[6] ); + vsi_nn_kernel_scalar_release( &tmp_params[7] ); + vsi_nn_kernel_scalar_release( &tmp_params[8] ); + vsi_nn_kernel_scalar_release( &tmp_params[9] ); + vsi_nn_kernel_scalar_release( &tmp_params[10] ); + vsi_nn_kernel_scalar_release( &tmp_params[11] ); + vsi_nn_kernel_scalar_release( &tmp_params[12] ); + } + } + vsi_safe_release_tensor(reshape_tensors[0]); + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( pre_process_yuv422, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/evis/resize_1d_nearest_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_1d_nearest_evis.c index 7a3eeed..be1cd09 100644 --- a/src/tim/vx/internal/src/kernel/evis/resize_1d_nearest_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/resize_1d_nearest_evis.c @@ -361,7 +361,7 @@ DEF_KERNEL_INITIALIZER(_resize_1d_nearest_initializer) 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16}; - gpu_quantize_multiplier_16bit(input_scale * output_scale, &M0, &postShift); + gpu_quantize_multiplier_16bit((double)input_scale * (double)output_scale, &M0, &postShift); multAndoutZP[0] = (uint32_t)(M0); multAndoutZP[1] = (uint32_t)((outputZP << postShift) - inputZP * M0); diff --git a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c index 6896307..1e79cbf 100644 --- a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c @@ -1202,7 +1202,7 @@ DEF_KERNEL_INITIALIZER(_bilinear_align_corners_opt_initializer) if ((U8 == input_dtype) && (_is_same_quant(input_attr, output_attr))) { - is_8x_align_corners = (scale_factor[0] == scale_factor[1]) && (scale_factor[0] = 0.125f); + is_8x_align_corners = (scale_factor[0] == scale_factor[1]) && (scale_factor[0] == 0.125f); } if (is_8x_align_corners) @@ -1595,6 +1595,37 @@ OnError: return scale; } +static vsi_bool _is_image_width_lt16 + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t *input, + int32_t pad_left, + int32_t pad_right + ) +{ + vsi_nn_kernel_dtype_e in_dtype = vsi_nn_kernel_map_dtype( input->attr.dtype.vx_type ); + vsi_size_t width = input->attr.size[0]; + size_t bytes = 
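In the resize_1d_nearest hunk above, the scale product passed to gpu_quantize_multiplier_16bit is widened to double before being split into a 16-bit multiplier M0 and a post-shift; resize_nearest below gets the same fix. A hedged reference of the requantization those values and the packed multAndoutZP words implement (the helper mirrors the standard fixed-point form, not the library's internal code):

    #include <stdint.h>

    /* The combined rescale factor (input_scale * output_scale above) is
     * approximated as M0 / 2^postShift, so with
     *   multAndoutZP[0] = M0
     *   multAndoutZP[1] = (outputZP << postShift) - inputZP * M0
     * the kernel can requantize with a single multiply-add:                   */
    static int32_t requantize_ref(int32_t in_q, uint32_t M0, int32_t postShift,
                                  int32_t in_zp, int32_t out_zp)
    {
        int64_t acc = (int64_t)in_q * (int64_t)M0
                    + (((int64_t)out_zp << postShift) - (int64_t)in_zp * (int64_t)M0);
        return (int32_t)(acc >> postShift);   /* out_q ~= (in_q - in_zp) * factor + out_zp */
    }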
vsi_nn_kernel_dtype_get_bytes(in_dtype); + vsi_size_t max_cross_read_img_width = bytes == 1 ? 16 : 8; + + if (VSI_NN_HW_EVIS_2 == graph->ctx->config.evis.ver) + { + return FALSE; + } + + if (pad_left <= 0 || pad_right <= 0) + { + return FALSE; + } + + if (width + pad_left + pad_right > max_cross_read_img_width ) + { + return FALSE; + } + + return TRUE; +} + static vsi_nn_kernel_node_t _setup ( vsi_nn_graph_t * graph, @@ -1615,6 +1646,13 @@ static vsi_nn_kernel_node_t _setup vsi_bool is_evis2 = (vsi_bool)(graph->ctx->config.evis.ver == VSI_NN_HW_EVIS_2); vsi_bool is_run_opt_kernel = FALSE; vsi_nn_tensor_t* scale = NULL; + int32_t pad_left = half_pixel_centers ? 1 : 0; + int32_t pad_right = half_pixel_centers ? 1 : 0; + + if (_is_image_width_lt16(graph, inputs[0], pad_left, pad_right)) + { + return NULL; + } status = _query_kernel( kernel, inputs, outputs, is_same_type, is_evis2, align_corners, half_pixel_centers, &is_run_opt_kernel); diff --git a/src/tim/vx/internal/src/kernel/evis/resize_nearest_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_nearest_evis.c index 1b6d094..4d01893 100644 --- a/src/tim/vx/internal/src/kernel/evis/resize_nearest_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/resize_nearest_evis.c @@ -371,7 +371,7 @@ DEF_KERNEL_INITIALIZER(_resize_nearest_initializer) 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16}; - gpu_quantize_multiplier_16bit(input_scale * output_scale, &M0, &postShift); + gpu_quantize_multiplier_16bit((double)input_scale * (double)output_scale, &M0, &postShift); multAndoutZP[0] = (uint32_t)(M0); multAndoutZP[1] = (uint32_t)((outputZP << postShift) - inputZP * M0); diff --git a/src/tim/vx/internal/src/kernel/evis/select_evis.c b/src/tim/vx/internal/src/kernel/evis/select_evis.c index 897f106..fae6ad7 100644 --- a/src/tim/vx/internal/src/kernel/evis/select_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/select_evis.c @@ -34,7 +34,6 @@ #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "libnnext/vx_lib_nnext.h" __BEGIN_DECLS @@ -82,6 +81,15 @@ static const _kernel_map_type _select_kernel_map[] = PACK_KERNEL_MAP(I8, F16, I16, F16), PACK_KERNEL_MAP(I8, I16, F16, F16), PACK_KERNEL_MAP(I8, F16, F16, U8), + PACK_KERNEL_MAP(I8, U8, F16, U8), + PACK_KERNEL_MAP(I8, F16, U8, U8), + PACK_KERNEL_MAP(I8, I8, F16, I8), + PACK_KERNEL_MAP(I8, F16, I8, I8), + PACK_KERNEL_MAP(I8, I16, F16, I16), + PACK_KERNEL_MAP(I8, F16, I16, I16), + PACK_KERNEL_MAP(I8, I8, I8, F16), + PACK_KERNEL_MAP(I8, U8, U8, F16), + PACK_KERNEL_MAP(I8, I16, I16, F16), PACK_KERNEL_MAP_2D(I8, I8, I8, I8), PACK_KERNEL_MAP_2D(I8, U8, U8, U8), PACK_KERNEL_MAP_2D(I8, I16, I16, I16), @@ -93,6 +101,15 @@ static const _kernel_map_type _select_kernel_map[] = PACK_KERNEL_MAP_2D(I8, F16, I16, F16), PACK_KERNEL_MAP_2D(I8, I16, F16, F16), PACK_KERNEL_MAP_2D(I8, F16, F16, U8), + PACK_KERNEL_MAP_2D(I8, U8, F16, U8), + PACK_KERNEL_MAP_2D(I8, F16, U8, U8), + PACK_KERNEL_MAP_2D(I8, I8, F16, I8), + PACK_KERNEL_MAP_2D(I8, F16, I8, I8), + PACK_KERNEL_MAP_2D(I8, I16, F16, I16), + PACK_KERNEL_MAP_2D(I8, F16, I16, I16), + PACK_KERNEL_MAP_2D(I8, I8, I8, F16), + PACK_KERNEL_MAP_2D(I8, U8, U8, F16), + PACK_KERNEL_MAP_2D(I8, I16, I16, F16), }; /* @@ -248,16 +265,26 @@ DEF_KERNEL_INITIALIZER(_select_initializer) CHECK_STATUS_FAIL_GOTO(status, final ); } break; - case _PACK_SELECT_KEY( I8, I8, I8 ): - case _PACK_SELECT_KEY( I16, I16, I16 ): - case _PACK_SELECT_KEY( U8, U8, U8 ): - case _PACK_SELECT_KEY( I8, F16, F16 ): - case 
_PACK_SELECT_KEY( U8, F16, F16 ): - case _PACK_SELECT_KEY( I16, F16, F16 ): - case _PACK_SELECT_KEY( F16, U8, F16 ): - case _PACK_SELECT_KEY( F16, I8, F16 ): - case _PACK_SELECT_KEY( F16, I16, F16 ): - case _PACK_SELECT_KEY( F16, F16, U8 ): + case _PACK_SELECT_KEY( I8, I8, I8 ): + case _PACK_SELECT_KEY( I16, I16, I16 ): + case _PACK_SELECT_KEY( U8, U8, U8 ): + case _PACK_SELECT_KEY( I8, F16, F16 ): + case _PACK_SELECT_KEY( U8, F16, F16 ): + case _PACK_SELECT_KEY( I16, F16, F16 ): + case _PACK_SELECT_KEY( F16, U8, F16 ): + case _PACK_SELECT_KEY( F16, I8, F16 ): + case _PACK_SELECT_KEY( F16, I16, F16 ): + case _PACK_SELECT_KEY( F16, F16, U8 ): + case _PACK_SELECT_KEY( U8, F16, U8 ): + case _PACK_SELECT_KEY( F16, U8, U8 ): + case _PACK_SELECT_KEY( I8, F16, I8 ): + case _PACK_SELECT_KEY( F16, I8, I8 ): + case _PACK_SELECT_KEY( I16, F16, I16 ): + case _PACK_SELECT_KEY( F16, I16, I16 ): + case _PACK_SELECT_KEY( I8, I8, F16 ): + case _PACK_SELECT_KEY( I16, I16, F16 ): + case _PACK_SELECT_KEY( U8, U8, F16 ): + case _PACK_SELECT_KEY( BF16, BF16, BF16 ): { uint32_t multAndoutZP0[2] = {0}; uint32_t multAndoutZP1[2] = {0}; @@ -367,9 +394,12 @@ static vsi_status _query_kernel out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); cond_dtype = (BOOL8 == cond_dtype || U8 == cond_dtype) ? I8 : cond_dtype; - in0_dtype = (BOOL8 == in0_dtype) ? I8 : in0_dtype; - in1_dtype = (BOOL8 == in1_dtype) ? I8 : in1_dtype; - out_dtype = (BOOL8 == out_dtype) ? I8 : out_dtype; + in0_dtype = (BOOL8 == in0_dtype) ? I8 : in0_dtype; + in0_dtype = (BF16 == in0_dtype) ? I16 : in0_dtype; + in1_dtype = (BOOL8 == in1_dtype) ? I8 : in1_dtype; + in1_dtype = (BF16 == in1_dtype) ? I16 : in1_dtype; + out_dtype = (BOOL8 == out_dtype) ? I8 : out_dtype; + out_dtype = (BF16 == out_dtype) ? I16 : out_dtype; key = SELECT_HASH_KEY(cond_dtype, in0_dtype, in1_dtype, out_dtype, image_2d); @@ -415,7 +445,7 @@ static vsi_nn_kernel_node_t _setup vsi_bool image_2d = FALSE; vsi_nn_kernel_node_t node = NULL; - if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; @@ -424,10 +454,10 @@ static vsi_nn_kernel_node_t _setup image_2d = (outputs[0]->attr.dim_num == 2); status = _query_kernel( kernel, inputs, outputs, image_2d); - if( VSI_SUCCESS == status) + if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); - if( node ) + if ( node ) { /* Set inputs and outputs */ vsi_nn_kernel_node_pack_io( node_params, _SELECT_PARAM_NUM, diff --git a/src/tim/vx/internal/src/kernel/evis/tile_evis.c b/src/tim/vx/internal/src/kernel/evis/tile_evis.c index b9e46cd..50e43cf 100644 --- a/src/tim/vx/internal/src/kernel/evis/tile_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/tile_evis.c @@ -544,6 +544,8 @@ static vsi_nn_kernel_node_t _setup vsi_size_t depthOut = new_rank > 2 ? reshape_tensors[1]->attr.size[2] : 1; vsi_size_t batchIn = new_rank > 3 ? reshape_tensors[0]->attr.size[3] : 1; + shapes[1][2] = shapes[1][2] == 0 ? 1 : shapes[1][2]; + shapes[1][3] = shapes[1][3] == 0 ? 
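The select_evis changes above map BF16 operands onto the I16 kernels before the hash key is built (and add a BF16,BF16,BF16 case); element-wise select only routes bits, so a bfloat16 payload can presumably reuse the int16 path unchanged. A tiny illustration of that reinterpretation (sketch only):

    #include <stdint.h>

    /* Select never interprets the 16-bit payload: copy whichever pattern the
     * condition picks, whether it encodes int16, fp16 or bf16 values.         */
    static void select_u16_bits(const uint8_t *cond, const uint16_t *a,
                                const uint16_t *b, uint16_t *out, int n)
    {
        int i;
        for (i = 0; i < n; i++)
        {
            out[i] = cond[i] ? a[i] : b[i];
        }
    }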
1 : shapes[1][3]; vsi_nn_kernel_node_pack_io( node_params, _EVIS_PARAM_NUM, &reshape_tensors[0], 1, &reshape_tensors[1], 1 ); diff --git a/src/tim/vx/internal/src/kernel/sp/layer_norm_y_direction_sp.c b/src/tim/vx/internal/src/kernel/sp/layer_norm_y_direction_sp.c new file mode 100644 index 0000000..79d2e3a --- /dev/null +++ b/src/tim/vx/internal/src/kernel/sp/layer_norm_y_direction_sp.c @@ -0,0 +1,797 @@ +/**************************************************************************** +* +* Copyright (c) 2021 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_sp_unit_operation.h" +#include "kernel/vsi_nn_sp_lut.h" + +#if (VX_STREAM_PROCESSOR_SUPPORT) + +vsi_nn_spinst_t * vsi_nn_sp_moments_axis1_inst + ( + vx_context context, + int32_t fifo_depth, + int32_t max_vector_depth + ) +{ + vsi_status status = VSI_FAILURE; + const int32_t spInitInstsNum = fifo_depth == 1 ? 4 : 3; + const int32_t spLoopInstsNum = fifo_depth == 2 ? 4 : 3; + const int32_t spCompleteInstsNum = fifo_depth == 1 ? 
3 : 0; + const int32_t spInstsNum = spInitInstsNum + spLoopInstsNum + spCompleteInstsNum; + + vsi_nn_spinst_t *spinst = NULL; + vsi_nn_spinst_inst_param sp_insts_param[11]; + vsi_nn_spinst_attr_t attr; + + memset(sp_insts_param, 0, sizeof(vsi_nn_spinst_inst_param) * spInstsNum); + vsi_nn_init_spinst_attr(&attr); + + if (fifo_depth == 1) + { + /* init inst0: r3 = 0 */ + status = vsi_nn_sp_move_constant(&sp_insts_param[0], 0, VSI_NN_SP_SR3); + /* init inst1: r1 = 0 */ + status |= vsi_nn_sp_move_constant(&sp_insts_param[1], 0, VSI_NN_SP_SR1); + /* init inst2: r4 = 0 */ + status |= vsi_nn_sp_move_constant(&sp_insts_param[2], 0, VSI_NN_SP_SR4); + /* init inst3: nop */ + status |= vsi_nn_sp_nop(&sp_insts_param[3]); + CHECK_STATUS_FAIL_GOTO(status, final ); + + /* loop inst0: r5 = r1 * r1 || r1 = in */ + status = vsi_nn_sp_mul(&sp_insts_param[4], VSI_NN_SP_SR1, VSI_NN_SP_SR1, VSI_NN_SP_SR5); + status |= vsi_nn_sp_move(&sp_insts_param[4], VSI_NN_SP_SRIN, VSI_NN_SP_SR1); + /* loop inst1: r3 = r3 + r1 || out = r1 */ + status |= vsi_nn_sp_add(&sp_insts_param[5], VSI_NN_SP_SR3, VSI_NN_SP_SR1, VSI_NN_SP_SR3); + status |= vsi_nn_sp_move(&sp_insts_param[5], VSI_NN_SP_SR1, VSI_NN_SP_SROUT); + /* loop inst2: r5 = r5 + r4 */ + status |= vsi_nn_sp_add(&sp_insts_param[6], VSI_NN_SP_SR5, VSI_NN_SP_SR4, VSI_NN_SP_SR5); + CHECK_STATUS_FAIL_GOTO(status, final ); + + /* complete inst0: v11 = r3 */ + status = vsi_nn_sp_move(&sp_insts_param[7], VSI_NN_SP_SR3, VSI_NN_SP_VR11); + /* complete inst1: r3 = r3 + r1 || out = r1 */ + status |= vsi_nn_sp_nop(&sp_insts_param[8]); + /* complete inst2: v12 = r4 */ + status = vsi_nn_sp_move(&sp_insts_param[9], VSI_NN_SP_SR4, VSI_NN_SP_VR12); + CHECK_STATUS_FAIL_GOTO(status, final ); + + attr.flush_cycle_num = 8; + } + else if (fifo_depth == 2) + { + /* init inst0: r3 = 0 */ + status = vsi_nn_sp_move_constant(&sp_insts_param[0], 0, VSI_NN_SP_SR3); + /* init inst1: r2 = 1 */ + status |= vsi_nn_sp_move_constant(&sp_insts_param[1], 1, VSI_NN_SP_SR2); + /* init inst2: r4 = 0 */ + status |= vsi_nn_sp_move_constant(&sp_insts_param[2], 0, VSI_NN_SP_SR4); + CHECK_STATUS_FAIL_GOTO(status, final ); + + /* loop inst0: out = r2 * r1 || v11 = r1 + r3 | r1 = in */ + status = vsi_nn_sp_mul(&sp_insts_param[3], VSI_NN_SP_SR2, VSI_NN_SP_SR1, VSI_NN_SP_SROUT); + status |= vsi_nn_sp_add(&sp_insts_param[3], VSI_NN_SP_SR1, VSI_NN_SP_SR3, VSI_NN_SP_VR11); + status |= vsi_nn_sp_move(&sp_insts_param[3], VSI_NN_SP_SRIN, VSI_NN_SP_SR1); + /* loop inst1: v12 = r4 + r5 | r3 = v11 */ + status |= vsi_nn_sp_add(&sp_insts_param[4], VSI_NN_SP_SR4, VSI_NN_SP_SR5, VSI_NN_SP_VR12); + status |= vsi_nn_sp_move(&sp_insts_param[4], VSI_NN_SP_VR11, VSI_NN_SP_SR3); + /* loop inst2: r4 = v12 */ + status |= vsi_nn_sp_move(&sp_insts_param[5], VSI_NN_SP_VR12, VSI_NN_SP_SR4); + /* loop inst3: r5 = r1 * r1 */ + status |= vsi_nn_sp_mul(&sp_insts_param[6], VSI_NN_SP_SR1, VSI_NN_SP_SR1, VSI_NN_SP_SR5); + CHECK_STATUS_FAIL_GOTO(status, final ); + + attr.flush_cycle_num = 5; + + attr.ignored_leading_v11_rd = fifo_depth; + attr.ignored_leading_v12_rd = fifo_depth; + attr.ignored_leading_v11_wr = 1; + attr.ignored_leading_v12_wr = 1; + + attr.num_of_v11_rd_in_flush_cycle = 1; + attr.num_of_v12_rd_in_flush_cycle = 1; + attr.num_of_v11_wr_in_flush_cycle = 1; + attr.num_of_v12_wr_in_flush_cycle = 2; + } + else + { + /* init inst0: r3 = 0 */ + status = vsi_nn_sp_move_constant(&sp_insts_param[0], 0, VSI_NN_SP_SR3); + /* init inst1: r2 = 0 */ + status |= vsi_nn_sp_move_constant(&sp_insts_param[1], 0, VSI_NN_SP_SR2); + /* init inst2: r4 = 0 
*/ + status |= vsi_nn_sp_move_constant(&sp_insts_param[2], 0, VSI_NN_SP_SR4); + CHECK_STATUS_FAIL_GOTO(status, final ); + + /* loop inst0: r5 = r1 * r1 | out = r2 + r1 || r1 = in */ + status = vsi_nn_sp_mul(&sp_insts_param[3], VSI_NN_SP_SR1, VSI_NN_SP_SR1, VSI_NN_SP_SR5); + status |= vsi_nn_sp_add(&sp_insts_param[3], VSI_NN_SP_SR2, VSI_NN_SP_SR1, VSI_NN_SP_SROUT); + status |= vsi_nn_sp_move(&sp_insts_param[3], VSI_NN_SP_SRIN, VSI_NN_SP_SR1); + /* loop inst1: v11 = r1 + r3 | r3 = v11 */ + status |= vsi_nn_sp_add(&sp_insts_param[4], VSI_NN_SP_SR1, VSI_NN_SP_SR3, VSI_NN_SP_VR11); + status |= vsi_nn_sp_move(&sp_insts_param[4], VSI_NN_SP_VR11, VSI_NN_SP_SR3); + /* loop inst2: v12 = r4 + r5 | r4 = v12 */ + status |= vsi_nn_sp_add(&sp_insts_param[5], VSI_NN_SP_SR4, VSI_NN_SP_SR5, VSI_NN_SP_VR12); + status |= vsi_nn_sp_move(&sp_insts_param[5], VSI_NN_SP_VR12, VSI_NN_SP_SR4); + CHECK_STATUS_FAIL_GOTO(status, final ); + + attr.ignored_leading_v11_rd = fifo_depth; + attr.ignored_leading_v12_rd = fifo_depth; + attr.ignored_leading_v11_wr = 1; + attr.ignored_leading_v12_wr = 1; + + attr.num_of_v11_rd_in_flush_cycle = 1; + attr.num_of_v12_rd_in_flush_cycle = 1; + attr.num_of_v11_wr_in_flush_cycle = 2; + attr.num_of_v12_wr_in_flush_cycle = 2; + + attr.flush_cycle_num = 5; + } + + attr.input_tile_mapping = VSI_NN_SP_ATTR_INPUT_TILE_MAPPING_YZMERGE; + attr.input_setup = VSI_NN_SP_INPUT_SETUP_SINGLE_INPUT; + + attr.prog_init_instr_num = spInitInstsNum; + attr.prog_loop_instr_num = spLoopInstsNum; + attr.prog_complete_instr_num = spCompleteInstsNum; + attr.ignored_leading_outputs = 1; + attr.v11_reset_at_start = VSI_NN_SP_V_RESET_AT_START_RESET; + attr.v12_reset_at_start = VSI_NN_SP_V_RESET_AT_START_RESET; + + attr.split_axis = VSI_SP_ATTR_SPLIT_ON_AXIS_X; + attr.split_tilex_equal_imgx = TRUE; + attr.split_max_vector_depth = max_vector_depth; + + spinst = vsi_nn_create_spinst_by_context(context); + CHECK_PTR_FAIL_GOTO( spinst, "Create spInst fail.", final ); + status = vsi_nn_add_spinst_insts(spinst, sp_insts_param, spInstsNum); + status |= vsi_nn_set_spinst_attr(spinst, attr); + CHECK_STATUS_FAIL_GOTO(status, final ); + +final: + return spinst; +} + +DEF_SP_KERNEL_QUERY(moements_axis1_query) + ( + vsi_nn_kernel_node_t node + ) +{ + vsi_status status = VSI_FAILURE; + vx_size index = 0; + vx_size tile_size[2] = {0}; + vsi_nn_spinst_t *spinst = NULL; + int32_t fifo_depth = 0; + int32_t max_vector_depth = 0; + vx_context ctx = vxGetContext((vx_reference)node); + vx_hardware_caps_params_ext2_t hw_param; + + memset(&hw_param, 0, sizeof(vx_hardware_caps_params_ext2_t)); + status = vxQueryHardwareCaps(ctx, (vx_hardware_caps_params_t*)(&hw_param), sizeof(vx_hardware_caps_params_ext2_t)); + CHECK_STATUS_FAIL_GOTO( status, final ); + + status = vxQueryNode(node, VX_NODE_SWTILING_TILE_XY, tile_size, sizeof(tile_size)); + CHECK_STATUS_FAIL_GOTO( status, final ); + status = vxQueryNode(node, VX_NODE_SPINST_INDEX, &index, sizeof(index)); + CHECK_STATUS_FAIL_GOTO( status, final ); + + fifo_depth = (int32_t)ceil((float)tile_size[0] / (float)hw_param.streamProcessorExecCount); + max_vector_depth = hw_param.streamProcessorVectorSize; + + spinst = vsi_nn_sp_moments_axis1_inst(ctx, fifo_depth, max_vector_depth); + + status = vxSetParameterByIndex( node, (uint32_t)index, (vx_reference)spinst->sp ); + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + if (spinst) + { + vsi_nn_release_spinst(&spinst); + } + + return status; +} + +vsi_nn_kernel_node_t vsi_nn_sp_moments_axis1_node + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * 
input, + vsi_nn_tensor_t * output0, + vsi_nn_tensor_t * output1 + ) +{ + const uint32_t input_count = 1; + const uint32_t output_count = 2; + vx_tensor inputs_tensor[1] = {NULL}; + vx_tensor outputs_tensor[2] = {NULL}; + vx_node node = NULL; + int32_t max_vector_depth = graph->ctx->config.sp_vector_depth; + int32_t fifo_depth = 4; + + vsi_nn_spinst_t *spinst = NULL; + + spinst = vsi_nn_sp_moments_axis1_inst(graph->ctx->c, fifo_depth, max_vector_depth); + + inputs_tensor[0] = input->t; + outputs_tensor[0] = output0->t; + outputs_tensor[1] = output1->t; + node = vxStreamProcessorNode( + graph->g, + inputs_tensor, + input_count, + outputs_tensor, + output_count, + spinst->sp, + NULL); + + if (node) + { + vxAssignNodeQueryCallback(node, moements_axis1_query); + } + + if (spinst) + { + vsi_nn_release_spinst(&spinst); + } + + return (vsi_nn_kernel_node_t)node; +} + +vsi_nn_kernel_node_t vsi_nn_sp_ln_means_axis1_node + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * input, + vsi_nn_tensor_t * output, + float inv_m, + float const_a, + float s, + float eps, + float output_scale + ) +{ + const int32_t spInitInstsNum = 2; + const int32_t spLoopInstsNum = 5; + const int32_t spInstsNum = spInitInstsNum + spLoopInstsNum; + + const uint32_t input_count = 1; + const uint32_t output_count = 1; + vx_tensor inputs_tensor[1] = {NULL}; + vx_tensor outputs_tensor[1] = {NULL}; + vx_node node = NULL; + int32_t max_vector_depth = graph->ctx->config.sp_vector_depth; + + vsi_nn_spinst_t *spinst = NULL; + vsi_nn_spinst_inst_param sp_insts_param[7]; + vsi_nn_spinst_attr_t attr; + vsi_nn_sp_lut_params sp_lut_params; + vx_lut_params_s vx_lut_params; + + vsi_status status = VSI_FAILURE; + + memset(sp_insts_param, 0, sizeof(vsi_nn_spinst_inst_param) * spInstsNum); + vsi_nn_init_spinst_attr(&attr); + memset(&sp_lut_params, 0, sizeof(vsi_nn_sp_lut_params)); + memset(&vx_lut_params, 0, sizeof(vx_lut_params_s)); + + /* init inst0: r2 = const_a */ + status = vsi_nn_sp_move_constant(&sp_insts_param[0], const_a, VSI_NN_SP_SR2); + /* init inst1: r3 = inv_m */ + status = vsi_nn_sp_move_constant(&sp_insts_param[1], inv_m, VSI_NN_SP_SR3); + /* loop inst0: r4 = v11 * v11 || r6 = r4 + r5 || r5 = v11*/ + status = vsi_nn_sp_mul(&sp_insts_param[2], VSI_NN_SP_VR11, VSI_NN_SP_VR11, VSI_NN_SP_SR4); + status |= vsi_nn_sp_add(&sp_insts_param[2], VSI_NN_SP_SR4, VSI_NN_SP_SR5, VSI_NN_SP_SR6); + status |= vsi_nn_sp_move(&sp_insts_param[2], VSI_NN_SP_VR11, VSI_NN_SP_SR5); + /* loop inst1: r1 = pwlMul() || r7 = pwlAdd() */ + status |= vsi_nn_sp_mul(&sp_insts_param[3], VSI_NN_SP_PWLMUL, VSI_NN_SP_PWLMUL, VSI_NN_SP_SR1); + status |= vsi_nn_sp_sub(&sp_insts_param[3], VSI_NN_SP_PWLADD, VSI_NN_SP_PWLADD, VSI_NN_SP_SR7); + /* loop inst2: r5 = r2 * v12 || v12 = r8 + r7 */ + status |= vsi_nn_sp_mul(&sp_insts_param[4], VSI_NN_SP_SR2, VSI_NN_SP_VR12, VSI_NN_SP_SR5); + status |= vsi_nn_sp_add(&sp_insts_param[4], VSI_NN_SP_SR8, VSI_NN_SP_SR7, VSI_NN_SP_VR12); + /* loop inst3: r1 = setup(r6) || v11 = r3 * r5 || r7 = r1 */ + status |= vsi_nn_sp_pwl_setup0(&sp_insts_param[5], VSI_NN_SP_SR6, VSI_NN_SP_SR1); + status |= vsi_nn_sp_mul(&sp_insts_param[5], VSI_NN_SP_SR3, VSI_NN_SP_SR5, VSI_NN_SP_VR11); + status |= vsi_nn_sp_move(&sp_insts_param[5], VSI_NN_SP_SR1, VSI_NN_SP_SR7); + /* loop inst3: r8 = r1 * r7 */ + status |= vsi_nn_sp_mul(&sp_insts_param[6], VSI_NN_SP_SR1, VSI_NN_SP_SR7, VSI_NN_SP_SR8); + CHECK_STATUS_FAIL_GOTO(status, final ); + + attr.input_tile_mapping = VSI_NN_SP_ATTR_INPUT_TILE_MAPPING_YZMERGE; + + attr.input_setup = VSI_NN_SP_INPUT_SETUP_V11; + 
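vsi_nn_sp_moments_axis1_inst above accumulates the running sum in v11 and the running sum of squares in v12, and vsi_nn_sp_ln_means_axis1_node combines them with inv_m and (further down) a VSI_NN_SP_ACT_LINEAR_RSQRT lookup table. A plain-C reference of the per-row statistics these stream-processor programs are assumed to implement for layer normalization (names and the exact epsilon placement are illustrative):

    #include <math.h>
    #include <stddef.h>

    /* One normalization row of length m: keep sum (v11) and sum of squares (v12),
     * then derive mean and 1/sqrt(var + eps) with inv_m = 1.0f / m.             */
    static void ln_row_stats(const float *x, size_t m, float eps,
                             float *mean_out, float *inv_std_out)
    {
        size_t i;
        float sum = 0.0f, sum_sq = 0.0f;
        float inv_m, mean, var;

        for (i = 0; i < m; i++)
        {
            sum    += x[i];          /* v11 accumulation */
            sum_sq += x[i] * x[i];   /* v12 accumulation */
        }

        inv_m = 1.0f / (float)m;
        mean  = sum * inv_m;
        var   = sum_sq * inv_m - mean * mean;   /* E[x^2] - E[x]^2 */
        *mean_out    = mean;
        *inv_std_out = 1.0f / sqrtf(var + eps);
    }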
attr.prog_init_instr_num = spInitInstsNum; + attr.prog_loop_instr_num = spLoopInstsNum; + attr.ignored_leading_outputs = 0; + attr.ignored_leading_v11_wr = 0; + attr.ignored_leading_v12_wr = 3; + attr.ignored_leading_v11_rd = 0; + attr.flush_cycle_num = 17; + + attr.num_of_v11_rd_in_flush_cycle = 0; + attr.num_of_v12_rd_in_flush_cycle = 1; + attr.num_of_v11_wr_in_flush_cycle = 1; + attr.num_of_v12_wr_in_flush_cycle = 4; + + attr.split_axis = VSI_SP_ATTR_SPLIT_ON_AXIS_X; + attr.split_tilex_equal_imgx = TRUE; + attr.split_max_vector_depth = max_vector_depth; + + spinst = vsi_nn_create_spinst(graph); + CHECK_PTR_FAIL_GOTO( spinst, "Create spInst fail.", final ); + status = vsi_nn_add_spinst_insts(spinst, sp_insts_param, spInstsNum); + status |= vsi_nn_set_spinst_attr(spinst, attr); + CHECK_STATUS_FAIL_GOTO(status, final ); + + inputs_tensor[0] = input->t; + outputs_tensor[0] = output->t; + + vx_lut_params.lut_function = VX_NN_ACTIVATION_CUSTOM; + vx_lut_params.in_lut = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, VSI_NN_SP_LUT_MAX_SIZE); + vx_lut_params.out_lut = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, VSI_NN_SP_LUT_MAX_SIZE); + + sp_lut_params.act_type = VSI_NN_SP_ACT_LINEAR_RSQRT; + sp_lut_params.params[0] = s; + sp_lut_params.params[1] = eps; + sp_lut_params.params[2] = output_scale; + vsi_nn_sp_lut(vx_lut_params.in_lut, vx_lut_params.out_lut, &sp_lut_params); + + node = vxStreamProcessorNode( + graph->g, + inputs_tensor, + input_count, + outputs_tensor, + output_count, + spinst->sp, + &vx_lut_params); + +final: + if (spinst) + { + vsi_nn_release_spinst(&spinst); + } + + if (vx_lut_params.in_lut) + { + vxReleaseLUT(&vx_lut_params.in_lut); + vx_lut_params.in_lut = NULL; + } + if (vx_lut_params.out_lut) + { + vxReleaseLUT(&vx_lut_params.out_lut); + vx_lut_params.out_lut = NULL; + } + + return (vsi_nn_kernel_node_t)node; +} + +vsi_nn_spinst_t * vsi_nn_sp_layer_norm_axis1_inst + ( + vx_context context, + int32_t fifo_depth, + int32_t max_vector_depth + ) +{ + vsi_status status = VSI_FAILURE; + const int32_t spInitInstsNum = 0; + const int32_t spLoopInstsNum = fifo_depth > 3 ? 
2 : 5; + const int32_t spInstsNum = spInitInstsNum + spLoopInstsNum; + + vsi_nn_spinst_t *spinst = NULL; + vsi_nn_spinst_inst_param sp_insts_param[5]; + vsi_nn_spinst_attr_t attr; + + memset(sp_insts_param, 0, sizeof(vsi_nn_spinst_inst_param) * spInstsNum); + vsi_nn_init_spinst_attr(&attr); + + if (fifo_depth > 3) + { + /* loop inst0: out = in - v11 || v11 = v11 */ + status = vsi_nn_sp_sub(&sp_insts_param[0], VSI_NN_SP_SRIN, VSI_NN_SP_VR11, VSI_NN_SP_SR1); + status |= vsi_nn_sp_move(&sp_insts_param[0], VSI_NN_SP_VR11, VSI_NN_SP_VR11); + /* loop inst1: out = r1 * v12 | v12 = v12 */ + status |= vsi_nn_sp_mul(&sp_insts_param[1], VSI_NN_SP_SR1, VSI_NN_SP_VR12, VSI_NN_SP_SROUT); + status |= vsi_nn_sp_move(&sp_insts_param[1], VSI_NN_SP_VR12, VSI_NN_SP_VR12); + CHECK_STATUS_FAIL_GOTO(status, final ); + + attr.flush_cycle_num = 3; + attr.ignored_leading_v12_rd = 1; + attr.ignored_leading_v12_wr = 1; + + attr.num_of_v11_rd_in_flush_cycle = 0; + attr.num_of_v12_rd_in_flush_cycle = 2; + attr.num_of_v11_wr_in_flush_cycle = 0; + attr.num_of_v12_wr_in_flush_cycle = 2; + } + else + { + /* loop inst0: out = in - v11 || v11 = v11 */ + status = vsi_nn_sp_sub(&sp_insts_param[0], VSI_NN_SP_SRIN, VSI_NN_SP_VR11, VSI_NN_SP_SR1); + status |= vsi_nn_sp_move(&sp_insts_param[0], VSI_NN_SP_VR11, VSI_NN_SP_VR11); + /* loop inst1: nop */ + status |= vsi_nn_sp_nop(&sp_insts_param[1]); + /* loop inst2: nop */ + status |= vsi_nn_sp_nop(&sp_insts_param[2]); + /* loop inst3: out = r1 * v12 | v12 = v12 */ + status |= vsi_nn_sp_mul(&sp_insts_param[3], VSI_NN_SP_SR1, VSI_NN_SP_VR12, VSI_NN_SP_SROUT); + status |= vsi_nn_sp_move(&sp_insts_param[3], VSI_NN_SP_VR12, VSI_NN_SP_VR12); + /* loop inst4: nop */ + status |= vsi_nn_sp_nop(&sp_insts_param[4]); + CHECK_STATUS_FAIL_GOTO(status, final ); + + attr.flush_cycle_num = 4; + attr.ignored_leading_v12_rd = 0; + attr.ignored_leading_v12_wr = 0; + + attr.num_of_v11_rd_in_flush_cycle = 0; + attr.num_of_v12_rd_in_flush_cycle = 1; + attr.num_of_v11_wr_in_flush_cycle = 0; + attr.num_of_v12_wr_in_flush_cycle = 1; + } + + attr.input_tile_mapping = VSI_NN_SP_ATTR_INPUT_TILE_MAPPING_YZMERGE; + attr.input_setup = VSI_NN_SP_INPUT_SETUP_SINGLE_INPUT; + + attr.prog_init_instr_num = spInitInstsNum; + attr.prog_loop_instr_num = spLoopInstsNum; + attr.ignored_leading_outputs = 0; + attr.ignored_leading_v11_rd = 0; + attr.ignored_leading_v11_wr = 0; + + attr.split_axis = VSI_SP_ATTR_SPLIT_ON_AXIS_X; + attr.split_tilex_equal_imgx = TRUE; + attr.split_max_vector_depth = max_vector_depth; + + spinst = vsi_nn_create_spinst_by_context(context); + CHECK_PTR_FAIL_GOTO( spinst, "Create spInst fail.", final ); + status = vsi_nn_add_spinst_insts(spinst, sp_insts_param, spInstsNum); + status |= vsi_nn_set_spinst_attr(spinst, attr); + CHECK_STATUS_FAIL_GOTO(status, final ); + +final: + return spinst; +} + +DEF_SP_KERNEL_QUERY(layer_norm_axis1_query) + ( + vsi_nn_kernel_node_t node + ) +{ + vsi_status status = VSI_FAILURE; + vx_size index = 0; + vx_size tile_size[2] = {0}; + vsi_nn_spinst_t *spinst = NULL; + int32_t fifo_depth = 0; + int32_t max_vector_depth = 0; + vx_context ctx = vxGetContext((vx_reference)node); + vx_hardware_caps_params_ext2_t hw_param; + + memset(&hw_param, 0, sizeof(vx_hardware_caps_params_ext2_t)); + status = vxQueryHardwareCaps(ctx, (vx_hardware_caps_params_t*)(&hw_param), sizeof(vx_hardware_caps_params_ext2_t)); + CHECK_STATUS_FAIL_GOTO( status, final ); + + status = vxQueryNode(node, VX_NODE_SWTILING_TILE_XY, tile_size, sizeof(tile_size)); + CHECK_STATUS_FAIL_GOTO( status, final ); + 
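+    /* Re-derive the FIFO depth from the tile size the driver actually chose
+     * and rebuild the SP program with the matching loop variant; e.g. a
+     * 57-wide tile on hardware with 16 SP execution lanes gives
+     * ceil(57 / 16) = 4, which is > 3 and selects the two-instruction loop.
+     */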
status = vxQueryNode(node, VX_NODE_SPINST_INDEX, &index, sizeof(index)); + CHECK_STATUS_FAIL_GOTO( status, final ); + + fifo_depth = (int32_t)ceil((float)tile_size[0] / (float)hw_param.streamProcessorExecCount); + max_vector_depth = hw_param.streamProcessorVectorSize; + + spinst = vsi_nn_sp_layer_norm_axis1_inst(ctx, fifo_depth, max_vector_depth); + + status = vxSetParameterByIndex( node, (uint32_t)index, (vx_reference)spinst->sp ); + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + if (spinst) + { + vsi_nn_release_spinst(&spinst); + } + + return status; +} + +vsi_nn_kernel_node_t vsi_nn_sp_layer_norm_axis1_node + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * input0, + vsi_nn_tensor_t * input1, + vsi_nn_tensor_t * output + ) +{ + const uint32_t input_count = 2; + const uint32_t output_count = 1; + vx_tensor inputs_tensor[2] = {NULL}; + vx_tensor outputs_tensor[1] = {NULL}; + vx_node node = NULL; + int32_t max_vector_depth = graph->ctx->config.sp_vector_depth; + int32_t fifo_depth = 4; + vsi_nn_spinst_t *spinst = NULL; + + spinst = vsi_nn_sp_layer_norm_axis1_inst(graph->ctx->c, fifo_depth, max_vector_depth); + + inputs_tensor[0] = input0->t; + inputs_tensor[1] = input1->t; + outputs_tensor[0] = output->t; + node = vxStreamProcessorNode( + graph->g, + inputs_tensor, + input_count, + outputs_tensor, + output_count, + spinst->sp, + NULL); + + if (node) + { + vxAssignNodeQueryCallback(node, layer_norm_axis1_query); + } + + if (spinst) + { + vsi_nn_release_spinst(&spinst); + } + + return (vsi_nn_kernel_node_t)node; +} + +vsi_nn_kernel_node_t vsi_nn_sp_load_weight_bias_node + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * weight, + vsi_nn_tensor_t * bias, + vsi_nn_tensor_t * dummy_output + ) +{ + const int32_t spLoopInstsNum = 2; + const int32_t spInstsNum = spLoopInstsNum; + + const uint32_t input_count = 2; + const uint32_t output_count = 1; + vx_tensor inputs_tensor[2] = {NULL}; + vx_tensor outputs_tensor[2] = {NULL}; + vx_node node = NULL; + int32_t max_vector_depth = graph->ctx->config.sp_vector_depth / + graph->ctx->config.sp_exec_count; + + vsi_nn_spinst_t *spinst = NULL; + vsi_nn_spinst_inst_param sp_insts_param[2]; + vsi_nn_spinst_attr_t attr; + + vsi_status status = VSI_FAILURE; + + memset(sp_insts_param, 0, sizeof(vsi_nn_spinst_inst_param) * spInstsNum); + vsi_nn_init_spinst_attr(&attr); + + /* loop inst0: v11 = in*/ + status = vsi_nn_sp_move(&sp_insts_param[0], VSI_NN_SP_SRIN, VSI_NN_SP_VR11); + /* loop inst0: v12 = in*/ + status |= vsi_nn_sp_move(&sp_insts_param[1], VSI_NN_SP_SRIN, VSI_NN_SP_VR12); + CHECK_STATUS_FAIL_GOTO(status, final ); + + attr.input_tile_mapping = VSI_NN_SP_ATTR_INPUT_TILE_MAPPING_YZMERGE; + attr.input_setup = VSI_NN_SP_INPUT_SETUP_INTERLEAVE_TWO_INPUT; + + attr.prog_loop_instr_num = spLoopInstsNum; + attr.ignored_leading_outputs = 0; + attr.flush_cycle_num = 0; + attr.ignored_leading_v11_rd = 0; + attr.ignored_leading_v11_wr = 0; + attr.ignored_leading_v12_rd = 0; + attr.ignored_leading_v12_wr = 0; + attr.v11_reset_at_start = VSI_NN_SP_V_RESET_AT_START_RESET; + attr.v12_reset_at_start = VSI_NN_SP_V_RESET_AT_START_RESET; + attr.ch0_post_redistribute = VSI_NN_SP_CH_POST_REDISTRIBUTE_VECTOR_GATHER; + attr.ch1_post_redistribute = VSI_NN_SP_CH_POST_REDISTRIBUTE_VECTOR_GATHER; + + attr.split_axis = VSI_SP_ATTR_SPLIT_ON_AXIS_YZ; + attr.split_max_vector_depth = max_vector_depth; + + spinst = vsi_nn_create_spinst(graph); + CHECK_PTR_FAIL_GOTO( spinst, "Create spInst fail.", final ); + status = vsi_nn_add_spinst_insts(spinst, sp_insts_param, spInstsNum); + status 
|= vsi_nn_set_spinst_attr(spinst, attr); + CHECK_STATUS_FAIL_GOTO(status, final ); + + inputs_tensor[0] = weight->t; + inputs_tensor[1] = bias->t; + outputs_tensor[0] = dummy_output->t; + + node = vxStreamProcessorNode( + graph->g, + inputs_tensor, + input_count, + outputs_tensor, + output_count, + spinst->sp, + NULL); + +final: + if (spinst) + { + vsi_nn_release_spinst(&spinst); + } + + return (vsi_nn_kernel_node_t)node; +} + +vsi_nn_kernel_node_t vsi_nn_sp_in_times_v11_plus_v12_node + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * input, + vsi_nn_tensor_t * dummy_tensor, + vsi_nn_tensor_t * output + ) +{ + const int32_t spLoopInstsNum = 1; + const int32_t spInstsNum = spLoopInstsNum; + + const uint32_t input_count = 2; + const uint32_t output_count = 1; + vx_tensor inputs_tensor[3] = {NULL}; + vx_tensor outputs_tensor[1] = {NULL}; + vx_node node = NULL; + int32_t max_vector_depth = graph->ctx->config.sp_vector_depth / + graph->ctx->config.sp_exec_count; + + vsi_nn_spinst_t *spinst = NULL; + vsi_nn_spinst_inst_param sp_insts_param[1]; + vsi_nn_spinst_attr_t attr; + + vsi_status status = VSI_FAILURE; + + memset(sp_insts_param, 0, sizeof(vsi_nn_spinst_inst_param) * spInstsNum); + vsi_nn_init_spinst_attr(&attr); + + /* loop inst0: r1 = in * v11 || out = r1 + v12 */ + status = vsi_nn_sp_mul(&sp_insts_param[0], VSI_NN_SP_SRIN, VSI_NN_SP_VR11, VSI_NN_SP_SR1); + status |= vsi_nn_sp_add(&sp_insts_param[0], VSI_NN_SP_SR1, VSI_NN_SP_VR12, VSI_NN_SP_SROUT); + CHECK_STATUS_FAIL_GOTO(status, final ); + + attr.input_tile_mapping = VSI_NN_SP_ATTR_INPUT_TILE_MAPPING_YZMERGE; + + attr.input_setup = VSI_NN_SP_INPUT_SETUP_SINGLE_INPUT; + attr.prog_loop_instr_num = spLoopInstsNum; + attr.ignored_leading_outputs = 3; + attr.ignored_leading_v11_rd = 0; + attr.ignored_leading_v12_rd = 3; + attr.flush_cycle_num = 3; + attr.v11_push_pop_config = VSI_NN_SP_PUSH_POP_EVERY_ROW; + attr.v12_push_pop_config = VSI_NN_SP_PUSH_POP_EVERY_ROW; + + attr.num_of_v11_rd_in_flush_cycle = 0; + attr.num_of_v12_rd_in_flush_cycle = 3; + + attr.split_axis = VSI_SP_ATTR_SPLIT_ON_AXIS_YZ; + attr.split_max_vector_depth = max_vector_depth; + + spinst = vsi_nn_create_spinst(graph); + CHECK_PTR_FAIL_GOTO( spinst, "Create spInst fail.", final ); + status = vsi_nn_add_spinst_insts(spinst, sp_insts_param, spInstsNum); + status |= vsi_nn_set_spinst_attr(spinst, attr); + CHECK_STATUS_FAIL_GOTO(status, final ); + + inputs_tensor[0] = input->t; + inputs_tensor[1] = dummy_tensor->t; + outputs_tensor[0] = output->t; + node = vxStreamProcessorNode( + graph->g, + inputs_tensor, + input_count, + outputs_tensor, + output_count, + spinst->sp, + NULL); + +final: + if (spinst) + { + vsi_nn_release_spinst(&spinst); + } + + return (vsi_nn_kernel_node_t)node; +} + +/* +** This program requires sum operation in the Y dimension. +** Instead of using the SUM Engine, the sum needs to be performed +** by Stream Processor instructions. 
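+** The graph built below chains five SP nodes through dummy tensors: a
+** moments node that accumulates the running sum and sum of squares into
+** the v11/v12 FIFOs, a means node that converts them via the
+** LINEAR_RSQRT LUT into the mean and a scaled 1 / sqrt(variance + eps)
+** term, a normalization node computing (x - mean) * inv_std, a preload
+** node that interleaves the scale and bias tensors into v11/v12, and a
+** final x * v11 + v12 node that writes the result.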
+*/ +vsi_nn_kernel_node_t layer_norm_y_direction + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + const vsi_nn_kernel_param_t * params + ) +{ + vsi_nn_kernel_node_t node = NULL; + vsi_nn_tensor_attr_t attr; + vsi_nn_tensor_t * dummy_tensor[3] = {NULL}; + vsi_nn_tensor_t * output_tensor[2] = {NULL}; + int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); + float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]); + float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); + float inv_m = 1.0f / (float)(outputs[0]->attr.size[0]); + float s = inv_m * inv_m; + float const_a = (float)(outputs[0]->attr.size[0]); + + memcpy( &attr, &outputs[0]->attr, sizeof(vsi_nn_tensor_attr_t) ); + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; + attr.is_const = FALSE; + attr.vtl = TRUE; + attr.is_dummy = TRUE; + attr.size[axis] = 1; + dummy_tensor[0] = vsi_nn_CreateTensor( graph, &attr ); + CHECK_PTR_FAIL_GOTO( dummy_tensor[0], "Create dummy_tensor fail.", final ); + dummy_tensor[1] = vsi_nn_CreateTensor( graph, &attr ); + CHECK_PTR_FAIL_GOTO( dummy_tensor[1], "Create dummy_tensor fail.", final ); + memcpy( &attr.size, &inputs[2]->attr.size, sizeof(inputs[2]->attr.size) ); + attr.dim_num = inputs[2]->attr.dim_num; + dummy_tensor[2] = vsi_nn_CreateTensor( graph, &attr ); + CHECK_PTR_FAIL_GOTO( dummy_tensor[2], "Create dummy_tensor fail.", final ); + + memcpy( &attr, &outputs[0]->attr, sizeof(vsi_nn_tensor_attr_t) ); + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; + attr.is_const = FALSE; + attr.vtl = TRUE; + output_tensor[0] = vsi_nn_CreateTensor( graph, &attr ); + CHECK_PTR_FAIL_GOTO( output_tensor[0], "Create tensor fail.", final ); + output_tensor[1] = vsi_nn_CreateTensor( graph, &attr ); + CHECK_PTR_FAIL_GOTO( output_tensor[1], "Create tensor fail.", final ); + + node = vsi_nn_sp_moments_axis1_node(graph, inputs[0], output_tensor[0], dummy_tensor[0]); + CHECK_PTR_FAIL_GOTO( node, "Create sp_moments_axis1 fail.", final ); + node = vsi_nn_sp_ln_means_axis1_node(graph, dummy_tensor[0], dummy_tensor[1], + inv_m, const_a, s, eps, output_scale); + CHECK_PTR_FAIL_GOTO( node, "Create ln_y_dirction_means fail.", final ); + node = vsi_nn_sp_layer_norm_axis1_node(graph, output_tensor[0], dummy_tensor[1], output_tensor[1]); + CHECK_PTR_FAIL_GOTO( node, "Create layer_norm_axis1 fail.", final ); + + node = vsi_nn_sp_load_weight_bias_node(graph, inputs[2], inputs[1], dummy_tensor[2]); + CHECK_PTR_FAIL_GOTO( node, "Create mov_weight_bias fail.", final ); + node = vsi_nn_sp_in_times_v11_plus_v12_node(graph, output_tensor[1], dummy_tensor[2], outputs[0]); + CHECK_PTR_FAIL_GOTO( node, "Create in_times_v11_plus_v12 fail.", final ); + +final: + vsi_safe_release_tensor(dummy_tensor[0]); + vsi_safe_release_tensor(dummy_tensor[1]); + vsi_safe_release_tensor(dummy_tensor[2]); + vsi_safe_release_tensor(output_tensor[0]); + vsi_safe_release_tensor(output_tensor[1]); + + return node; +} /* layer_norm_y_direction() */ + + +#endif diff --git a/src/tim/vx/internal/src/kernel/sp/softmax_z_direction_sp.c b/src/tim/vx/internal/src/kernel/sp/softmax_z_direction_sp.c new file mode 100644 index 0000000..cb550c2 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/sp/softmax_z_direction_sp.c @@ -0,0 +1,938 @@ +/**************************************************************************** +* +* Copyright (c) 2021 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person 
obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_sp_unit_operation.h" +#include "kernel/vsi_nn_sp_lut.h" + +#if (VX_STREAM_PROCESSOR_SUPPORT) + +vsi_nn_spinst_t * vsi_nn_sp_max_axis2_inst + ( + vx_context context, + int32_t fifo_depth, + int32_t max_vector_depth + ) +{ + vsi_status status = VSI_FAILURE; + const int32_t spInitInstsNum = 4; + const int32_t spLoopInstsNum = fifo_depth > 4 ? 3 : 11; + const int32_t spInstsNum = spInitInstsNum + spLoopInstsNum; + uint32_t f32_min = 0xff800000; + float clampMin = *(float*)&f32_min; + vsi_nn_spinst_t *spinst = NULL; + vsi_nn_spinst_inst_param sp_insts_param[15]; + vsi_nn_spinst_attr_t attr; + + memset(sp_insts_param, 0, sizeof(vsi_nn_spinst_inst_param) * spInstsNum); + vsi_nn_init_spinst_attr(&attr); + + /* init inst0: r2 = -INF */ + status = vsi_nn_sp_move_constant(&sp_insts_param[0], clampMin, VSI_NN_SP_SR2); + /* init inst1: r10 = 0 */ + status |= vsi_nn_sp_move_constant(&sp_insts_param[1], 0, VSI_NN_SP_SR10); + /* init inst2: r4 = 1 */ + status |= vsi_nn_sp_move_constant(&sp_insts_param[2], 1, VSI_NN_SP_SR4); + /* init inst3: nop */ + status |= vsi_nn_sp_nop(&sp_insts_param[3]); + CHECK_STATUS_FAIL_GOTO(status, final); + + if (fifo_depth > 4) + { + /* loop inst0: r1 = clamp(r3 * in, r6, r7) | r2 = v11 + r10 | r9 = r2 */ + status = vsi_nn_sp_mul_clamp(&sp_insts_param[4], VSI_NN_SP_SR3, VSI_NN_SP_SRIN, VSI_NN_SP_SR1); + status |= vsi_nn_sp_add(&sp_insts_param[4], VSI_NN_SP_VR11, VSI_NN_SP_SR10, VSI_NN_SP_SR2); + status |= vsi_nn_sp_move(&sp_insts_param[4], VSI_NN_SP_SR2, VSI_NN_SP_SR9); + /* loop inst1: r8 = r1 * r4 | r5 = r1 - r2 | v11 = r5 ? 
r8 : r9 */ + status |= vsi_nn_sp_mul(&sp_insts_param[5], VSI_NN_SP_SR1, VSI_NN_SP_SR3, VSI_NN_SP_SR8); + status |= vsi_nn_sp_sub(&sp_insts_param[5], VSI_NN_SP_SR1, VSI_NN_SP_SR2, VSI_NN_SP_SR5); + status |= vsi_nn_sp_move_sel0(&sp_insts_param[5], VSI_NN_SP_SR5, VSI_NN_SP_SR8, VSI_NN_SP_VR11); + /* loop inst2: out = r1 */ + status |= vsi_nn_sp_move(&sp_insts_param[6], VSI_NN_SP_SR1, VSI_NN_SP_SROUT); + CHECK_STATUS_FAIL_GOTO(status, final ); + + attr.flush_cycle_num = 7; + + attr.ignored_leading_outputs = 1; + attr.ignored_leading_v11_rd = fifo_depth; + attr.ignored_leading_v11_wr = 2; + + attr.num_of_v11_rd_in_flush_cycle = 0; + attr.num_of_v11_wr_in_flush_cycle = 3; + } + else + { + /* loop inst0: r1 = clamp(r3 * in, r6, r7) | r2 = v11 + r10 | r9 = r2 */ + status = vsi_nn_sp_mul_clamp(&sp_insts_param[4], VSI_NN_SP_SR3, VSI_NN_SP_SRIN, VSI_NN_SP_SR1); + status |= vsi_nn_sp_add(&sp_insts_param[4], VSI_NN_SP_VR11, VSI_NN_SP_SR10, VSI_NN_SP_SR2); + /* loop inst1: nop */ + status |= vsi_nn_sp_nop(&sp_insts_param[5]); + /* loop inst2: nop */ + status |= vsi_nn_sp_nop(&sp_insts_param[6]); + /* loop inst3: r8 = r1 * r4 | r5 = r1 - r2 | v11 = r5 ? r8 : r9 */ + status |= vsi_nn_sp_mul(&sp_insts_param[7], VSI_NN_SP_SR1, VSI_NN_SP_SR3, VSI_NN_SP_SR8); + status |= vsi_nn_sp_sub(&sp_insts_param[7], VSI_NN_SP_SR1, VSI_NN_SP_SR2, VSI_NN_SP_SR5); + status |= vsi_nn_sp_move(&sp_insts_param[7], VSI_NN_SP_SR2, VSI_NN_SP_SR9); + /* loop inst4: out = r1 */ + status |= vsi_nn_sp_move(&sp_insts_param[8], VSI_NN_SP_SR1, VSI_NN_SP_SROUT); + /* loop inst5: nop */ + status |= vsi_nn_sp_nop(&sp_insts_param[9]); + /* loop inst6: nop */ + status |= vsi_nn_sp_move_sel0(&sp_insts_param[10], VSI_NN_SP_SR5, VSI_NN_SP_SR8, VSI_NN_SP_VR11); + /* loop inst7: nop */ + status |= vsi_nn_sp_nop(&sp_insts_param[11]); + /* loop inst8: nop */ + status |= vsi_nn_sp_nop(&sp_insts_param[12]); + /* loop inst9: nop */ + status |= vsi_nn_sp_nop(&sp_insts_param[13]); + /* loop inst10: nop */ + status |= vsi_nn_sp_nop(&sp_insts_param[14]); + CHECK_STATUS_FAIL_GOTO(status, final ); + + attr.ignored_leading_outputs = 0; + attr.ignored_leading_v11_rd = fifo_depth; + attr.ignored_leading_v11_wr = 0; + + attr.num_of_v11_rd_in_flush_cycle = 0; + attr.num_of_v11_wr_in_flush_cycle = 1; + + attr.flush_cycle_num = 10; + } + + attr.input_tile_mapping = VSI_NN_SP_ATTR_INPUT_TILE_MAPPING_XYMERGE; + attr.input_setup = VSI_NN_SP_INPUT_SETUP_SINGLE_INPUT; + + attr.prog_init_instr_num = spInitInstsNum; + attr.prog_loop_instr_num = spLoopInstsNum; + attr.v11_reset_at_start = VSI_NN_SP_V_RESET_AT_START_RESET; + + attr.split_axis = VSI_SP_ATTR_SPLIT_ON_AXIS_XY; + attr.split_tilex_equal_imgx = TRUE; + attr.split_max_vector_depth = max_vector_depth; + + spinst = vsi_nn_create_spinst_by_context(context); + CHECK_PTR_FAIL_GOTO( spinst, "Create spInst fail.", final ); + status = vsi_nn_add_spinst_insts(spinst, sp_insts_param, spInstsNum); + status |= vsi_nn_set_spinst_attr(spinst, attr); + CHECK_STATUS_FAIL_GOTO(status, final ); + +final: + return spinst; +} + +DEF_SP_KERNEL_QUERY(max_axis2_query) + ( + vsi_nn_kernel_node_t node + ) +{ + vsi_status status = VSI_FAILURE; + vx_size index = 0; + vx_size tile_size[2] = {0}; + vsi_nn_spinst_t *spinst = NULL; + int32_t fifo_depth = 0; + int32_t max_vector_depth = 0; + vx_context ctx = vxGetContext((vx_reference)node); + vx_hardware_caps_params_ext2_t hw_param; + + memset(&hw_param, 0, sizeof(vx_hardware_caps_params_ext2_t)); + status = vxQueryHardwareCaps(ctx, (vx_hardware_caps_params_t*)(&hw_param), 
sizeof(vx_hardware_caps_params_ext2_t)); + CHECK_STATUS_FAIL_GOTO( status, final ); + + status = vxQueryNode(node, VX_NODE_SWTILING_TILE_XY, tile_size, sizeof(tile_size)); + CHECK_STATUS_FAIL_GOTO( status, final ); + status = vxQueryNode(node, VX_NODE_SPINST_INDEX, &index, sizeof(index)); + CHECK_STATUS_FAIL_GOTO( status, final ); + + fifo_depth = (int32_t)ceil((float)(tile_size[0] * tile_size[1]) / (float)hw_param.streamProcessorExecCount); + max_vector_depth = hw_param.streamProcessorVectorSize; + + spinst = vsi_nn_sp_max_axis2_inst(ctx, fifo_depth, max_vector_depth); + + status = vxSetParameterByIndex( node, (uint32_t)index, (vx_reference)spinst->sp ); + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + if (spinst) + { + vsi_nn_release_spinst(&spinst); + } + + return status; +} + +vsi_nn_kernel_node_t vsi_nn_sp_max_axis2_node + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * input, + vsi_nn_tensor_t * output0, + vsi_nn_tensor_t * output1 + ) +{ + const int32_t spInitInstsNum = 4; + const int32_t spLoopInstsNum = 3; + const int32_t spInstsNum = spInitInstsNum + spLoopInstsNum; + + const uint32_t input_count = 1; + const uint32_t output_count = 2; + vx_tensor inputs_tensor[1] = {NULL}; + vx_tensor outputs_tensor[2] = {NULL}; + vx_node node = NULL; + + vsi_nn_spinst_t *spinst = NULL; + vsi_nn_spinst_inst_param sp_insts_param[7]; + vsi_nn_spinst_attr_t attr; + + vsi_status status = VSI_FAILURE; + int32_t max_vector_depth = graph->ctx->config.sp_vector_depth; + uint32_t f32_min = 0xff800000; + float flt_min = *(float*)&f32_min; + float input_scale = vsi_nn_get_tensor_scale(input); + float clamp_min = 0; + float clamp_max = 0; + + memset(sp_insts_param, 0, sizeof(vsi_nn_spinst_inst_param) * spInstsNum); + vsi_nn_init_spinst_attr(&attr); + + vsi_nn_get_tensor_clamp_min_max(input, &clamp_min, &clamp_max); + clamp_min = clamp_min * input_scale; + clamp_max = clamp_max * input_scale; + + /* init inst0: r2 = -INF */ + status = vsi_nn_sp_move_constant(&sp_insts_param[0], flt_min, VSI_NN_SP_SR2); + /* init inst1: r10 = 0 */ + status |= vsi_nn_sp_move_constant(&sp_insts_param[1], 0, VSI_NN_SP_SR10); + /* init inst2: r4 = 1 */ + status |= vsi_nn_sp_move_constant(&sp_insts_param[2], 1, VSI_NN_SP_SR4); + /* init inst3: nop */ + status |= vsi_nn_sp_nop(&sp_insts_param[3]); + CHECK_STATUS_FAIL_GOTO(status, final); + + /* loop inst0: r1 = clamp(r3 * in, r6, r7) | r2 = v11 + r10 | r9 = r2 */ + status = vsi_nn_sp_mul_clamp(&sp_insts_param[4], VSI_NN_SP_SR3, VSI_NN_SP_SRIN, VSI_NN_SP_SR1); + status |= vsi_nn_sp_add(&sp_insts_param[4], VSI_NN_SP_VR11, VSI_NN_SP_SR10, VSI_NN_SP_SR2); + status |= vsi_nn_sp_move(&sp_insts_param[4], VSI_NN_SP_SR2, VSI_NN_SP_SR9); + /* loop inst1: r8 = r1 * r4 | r5 = r1 - r2 | v11 = r5 ? 
r8 : r9 */ + status |= vsi_nn_sp_mul(&sp_insts_param[5], VSI_NN_SP_SR1, VSI_NN_SP_SR3, VSI_NN_SP_SR8); + status |= vsi_nn_sp_sub(&sp_insts_param[5], VSI_NN_SP_SR1, VSI_NN_SP_SR2, VSI_NN_SP_SR5); + status |= vsi_nn_sp_move_sel0(&sp_insts_param[5], VSI_NN_SP_SR5, VSI_NN_SP_SR8, VSI_NN_SP_VR11); + /* loop inst2: out = r1 */ + status |= vsi_nn_sp_move(&sp_insts_param[6], VSI_NN_SP_SR1, VSI_NN_SP_SROUT); + CHECK_STATUS_FAIL_GOTO(status, final ); + + attr.input_tile_mapping = VSI_NN_SP_ATTR_INPUT_TILE_MAPPING_XYMERGE; + attr.input_setup = VSI_NN_SP_INPUT_SETUP_SINGLE_INPUT; + + attr.prog_init_instr_num = spInitInstsNum; + attr.prog_loop_instr_num = spLoopInstsNum; + attr.v11_reset_at_start = VSI_NN_SP_V_RESET_AT_START_RESET; + + attr.split_axis = VSI_SP_ATTR_SPLIT_ON_AXIS_XY; + attr.split_tilex_equal_imgx = TRUE; + attr.split_max_vector_depth = max_vector_depth; + + attr.flush_cycle_num = 7; + + attr.ignored_leading_outputs = 1; + attr.ignored_leading_v11_rd = 5; + attr.ignored_leading_v11_wr = 2; + + attr.num_of_v11_rd_in_flush_cycle = 0; + attr.num_of_v11_wr_in_flush_cycle = 3; + + VSI_NN_SP_ATTR_SET_CONST_TO_SR3(attr, input_scale); + VSI_NN_SP_ATTR_SET_CONST_TO_SR6(attr, clamp_max); + VSI_NN_SP_ATTR_SET_CONST_TO_SR7(attr, clamp_min); + + attr.prog_init_instr_num = spInitInstsNum; + attr.prog_loop_instr_num = spLoopInstsNum; + + spinst = vsi_nn_create_spinst(graph); + CHECK_PTR_FAIL_GOTO( spinst, "Create spInst fail.", final ); + status = vsi_nn_add_spinst_insts(spinst, sp_insts_param, spInstsNum); + status |= vsi_nn_set_spinst_attr(spinst, attr); + CHECK_STATUS_FAIL_GOTO(status, final ); + + inputs_tensor[0] = input->t; + outputs_tensor[0] = output0->t; + outputs_tensor[1] = output1->t; + node = vxStreamProcessorNode( + graph->g, + inputs_tensor, + input_count, + outputs_tensor, + output_count, + spinst->sp, + NULL); + +final: + + if (node) + { + vxAssignNodeQueryCallback(node, max_axis2_query); + } + + if (spinst) + { + vsi_nn_release_spinst(&spinst); + } + + return (vsi_nn_kernel_node_t)node; +} + +vsi_nn_spinst_t * vsi_nn_sp_exp_y_direction_inst + ( + vx_context context, + int32_t fifo_depth, + int32_t max_vector_depth + ) +{ + vsi_status status = VSI_FAILURE; + const int32_t spInitInstsNum = 2; + const int32_t spLoopInstsNum = fifo_depth > 3 ? 
4 : 8; + const int32_t spInstsNum = spInitInstsNum + spLoopInstsNum; + vsi_nn_spinst_t *spinst = NULL; + vsi_nn_spinst_inst_param sp_insts_param[10]; + vsi_nn_spinst_attr_t attr; + + memset(sp_insts_param, 0, sizeof(vsi_nn_spinst_inst_param) * spInstsNum); + vsi_nn_init_spinst_attr(&attr); + + /* init inst0: r8 = 0 */ + status = vsi_nn_sp_move_constant(&sp_insts_param[0], 0, VSI_NN_SP_SR8); + /* init inst1: r9 = 1 */ + status |= vsi_nn_sp_move_constant(&sp_insts_param[1], 1, VSI_NN_SP_SR9); + CHECK_STATUS_FAIL_GOTO(status, final); + + if (fifo_depth > 3) + { + /* loop inst0: r2 = in - v11 | v11 = v11 */ + status = vsi_nn_sp_sub(&sp_insts_param[2], VSI_NN_SP_SRIN, VSI_NN_SP_VR11, VSI_NN_SP_SR2); + status |= vsi_nn_sp_move(&sp_insts_param[2], VSI_NN_SP_VR11, VSI_NN_SP_VR11); + /* loop inst1: r8 = v12 * r9 | r7 = r4 + r6 | out = r7 */ + status |= vsi_nn_sp_mul(&sp_insts_param[3], VSI_NN_SP_VR12, VSI_NN_SP_SR9, VSI_NN_SP_SR8); + status |= vsi_nn_sp_add(&sp_insts_param[3], VSI_NN_SP_SR4, VSI_NN_SP_SR6, VSI_NN_SP_SR7); + status |= vsi_nn_sp_move(&sp_insts_param[3], VSI_NN_SP_SR7, VSI_NN_SP_SROUT); + /* loop inst2: r6 = r5 * r2 | v12 = r7 + r8 | r4 = r3 */ + status |= vsi_nn_sp_mul(&sp_insts_param[4], VSI_NN_SP_SR5, VSI_NN_SP_SR2, VSI_NN_SP_SR6); + status |= vsi_nn_sp_add(&sp_insts_param[4], VSI_NN_SP_SR7, VSI_NN_SP_SR8, VSI_NN_SP_VR12); + status |= vsi_nn_sp_move(&sp_insts_param[4], VSI_NN_SP_SR3, VSI_NN_SP_SR4); + /* loop inst3: r1 = setup(r2) | r5 = pwlMul * pwlMul | r2 = pwlAdd + pwlAdd | r1 = r3*/ + status |= vsi_nn_sp_pwl_setup0(&sp_insts_param[5], VSI_NN_SP_SR2, VSI_NN_SP_SR1); + status |= vsi_nn_sp_mul(&sp_insts_param[5], VSI_NN_SP_PWLMUL, VSI_NN_SP_PWLMUL, VSI_NN_SP_SR5); + status |= vsi_nn_sp_sub(&sp_insts_param[5], VSI_NN_SP_PWLADD, VSI_NN_SP_PWLADD, VSI_NN_SP_SR2); + status |= vsi_nn_sp_move(&sp_insts_param[5], VSI_NN_SP_SR1, VSI_NN_SP_SR3); + CHECK_STATUS_FAIL_GOTO(status, final ); + + attr.flush_cycle_num = 18; + + attr.ignored_leading_outputs = 4; + attr.ignored_leading_v11_rd = 0; + attr.ignored_leading_v11_wr = 0; + attr.ignored_leading_v12_rd = fifo_depth + 3; + attr.ignored_leading_v12_wr = 4; + + attr.num_of_v12_rd_in_flush_cycle = 4; + attr.num_of_v12_wr_in_flush_cycle = 5; + } + else + { + /* loop inst0: r2 = in - v11 | v11 = v11 */ + status = vsi_nn_sp_sub(&sp_insts_param[2], VSI_NN_SP_SRIN, VSI_NN_SP_VR11, VSI_NN_SP_SR2); + status |= vsi_nn_sp_move(&sp_insts_param[2], VSI_NN_SP_VR11, VSI_NN_SP_VR11); + /* loop inst1: r6 = r5 * r2 | r4 = r3 */ + status |= vsi_nn_sp_mul(&sp_insts_param[3], VSI_NN_SP_SR5, VSI_NN_SP_SR2, VSI_NN_SP_SR6); + status |= vsi_nn_sp_move(&sp_insts_param[3], VSI_NN_SP_SR3, VSI_NN_SP_SR4); + /* loop inst2: nop */ + status |= vsi_nn_sp_nop(&sp_insts_param[4]); + /* loop inst3: r1 = setup(r2) */ + status = vsi_nn_sp_pwl_setup0(&sp_insts_param[5], VSI_NN_SP_SR2, VSI_NN_SP_SR1); + /* loop inst4: r8 = v12 * r9 | r7 = r4 + r6 */ + status |= vsi_nn_sp_mul(&sp_insts_param[6], VSI_NN_SP_VR12, VSI_NN_SP_SR9, VSI_NN_SP_SR8); + status |= vsi_nn_sp_add(&sp_insts_param[6], VSI_NN_SP_SR4, VSI_NN_SP_SR6, VSI_NN_SP_SR7); + /* loop inst5: nop */ + status |= vsi_nn_sp_nop(&sp_insts_param[7]); + /* loop inst6: r5 = pwlMul * pwlMul | r2 = pwlAdd + pwlAdd | r1 = r3*/ + status |= vsi_nn_sp_mul(&sp_insts_param[8], VSI_NN_SP_PWLMUL, VSI_NN_SP_PWLMUL, VSI_NN_SP_SR5); + status |= vsi_nn_sp_sub(&sp_insts_param[8], VSI_NN_SP_PWLADD, VSI_NN_SP_PWLADD, VSI_NN_SP_SR2); + status |= vsi_nn_sp_move(&sp_insts_param[8], VSI_NN_SP_SR1, VSI_NN_SP_SR3); + /* loop inst7: v12 = r7 + r8 | out = 
r7 */ + status |= vsi_nn_sp_add(&sp_insts_param[9], VSI_NN_SP_SR7, VSI_NN_SP_SR8, VSI_NN_SP_VR12); + status |= vsi_nn_sp_move(&sp_insts_param[9], VSI_NN_SP_SR7, VSI_NN_SP_SROUT); + CHECK_STATUS_FAIL_GOTO(status, final ); + + attr.ignored_leading_outputs = 1; + attr.ignored_leading_v11_rd = 0; + attr.ignored_leading_v11_wr = 0; + attr.ignored_leading_v12_rd = fifo_depth + 1; + attr.ignored_leading_v12_wr = 1; + + attr.num_of_v12_rd_in_flush_cycle = 2; + attr.num_of_v12_wr_in_flush_cycle = 2; + + attr.flush_cycle_num = 15; + } + + attr.input_tile_mapping = VSI_NN_SP_ATTR_INPUT_TILE_MAPPING_XYMERGE; + attr.input_setup = VSI_NN_SP_INPUT_SETUP_SINGLE_INPUT; + + attr.prog_init_instr_num = spInitInstsNum; + attr.prog_loop_instr_num = spLoopInstsNum; + attr.v12_reset_at_start = VSI_NN_SP_V_RESET_AT_START_RESET; + + attr.split_axis = VSI_SP_ATTR_SPLIT_ON_AXIS_XY; + attr.split_tilex_equal_imgx = TRUE; + attr.split_max_vector_depth = max_vector_depth; + + spinst = vsi_nn_create_spinst_by_context(context); + CHECK_PTR_FAIL_GOTO( spinst, "Create spInst fail.", final ); + status = vsi_nn_add_spinst_insts(spinst, sp_insts_param, spInstsNum); + status |= vsi_nn_set_spinst_attr(spinst, attr); + CHECK_STATUS_FAIL_GOTO(status, final ); + +final: + return spinst; +} + +DEF_SP_KERNEL_QUERY(softmax_z_direction_exp_query) + ( + vsi_nn_kernel_node_t node + ) +{ + vsi_status status = VSI_FAILURE; + vx_size index = 0; + vx_size tile_size[2] = {0}; + vsi_nn_spinst_t *spinst = NULL; + int32_t fifo_depth = 0; + int32_t max_vector_depth = 0; + vx_context ctx = vxGetContext((vx_reference)node); + vx_hardware_caps_params_ext2_t hw_param; + + memset(&hw_param, 0, sizeof(vx_hardware_caps_params_ext2_t)); + status = vxQueryHardwareCaps(ctx, (vx_hardware_caps_params_t*)(&hw_param), sizeof(vx_hardware_caps_params_ext2_t)); + CHECK_STATUS_FAIL_GOTO( status, final ); + + status = vxQueryNode(node, VX_NODE_SWTILING_TILE_XY, tile_size, sizeof(tile_size)); + CHECK_STATUS_FAIL_GOTO( status, final ); + status = vxQueryNode(node, VX_NODE_SPINST_INDEX, &index, sizeof(index)); + CHECK_STATUS_FAIL_GOTO( status, final ); + + fifo_depth = (int32_t)ceil((float)(tile_size[0] * tile_size[1])/ (float)hw_param.streamProcessorExecCount); + max_vector_depth = hw_param.streamProcessorVectorSize; + + spinst = vsi_nn_sp_exp_y_direction_inst(ctx, fifo_depth, max_vector_depth); + + status = vxSetParameterByIndex( node, (uint32_t)index, (vx_reference)spinst->sp ); + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + if (spinst) + { + vsi_nn_release_spinst(&spinst); + } + + return status; +} + +vsi_nn_kernel_node_t vsi_nn_sp_softmax_z_direction_exp_node + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * input0, + vsi_nn_tensor_t * input1, + vsi_nn_tensor_t * output0, + vsi_nn_tensor_t * output1, + float beta + ) +{ + const int32_t spInitInstsNum = 2; + const int32_t spLoopInstsNum = 4; + const int32_t spInstsNum = spInitInstsNum + spLoopInstsNum; + + const uint32_t input_count = 2; + const uint32_t output_count = 2; + vx_tensor inputs_tensor[2] = {NULL}; + vx_tensor outputs_tensor[2] = {NULL}; + vx_node node = NULL; + + vsi_nn_spinst_t *spinst = NULL; + vsi_nn_spinst_inst_param sp_insts_param[6]; + vsi_nn_spinst_attr_t attr; + + vsi_nn_sp_lut_params sp_lut_params; + vx_lut_params_s vx_lut_params; + + vsi_status status = VSI_FAILURE; + int32_t fifo_depth = 4; + int32_t max_vector_depth = graph->ctx->config.sp_vector_depth; + + memset(sp_insts_param, 0, sizeof(vsi_nn_spinst_inst_param) * spInstsNum); + vsi_nn_init_spinst_attr(&attr); + 
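+    /* This node streams in the already scaled and clamped input minus the
+     * per-position maximum held in v11, evaluates exp((x - max) * beta)
+     * through the LINEAR_EXP LUT configured below, writes the exponentials
+     * to the output and accumulates their running sum into v12 for the
+     * downstream reciprocal node.
+     */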
memset(&sp_lut_params, 0, sizeof(vsi_nn_sp_lut_params)); + memset(&vx_lut_params, 0, sizeof(vx_lut_params_s)); + + /* init inst0: r8 = 0 */ + status = vsi_nn_sp_move_constant(&sp_insts_param[0], 0, VSI_NN_SP_SR8); + /* init inst1: r9 = 1 */ + status |= vsi_nn_sp_move_constant(&sp_insts_param[1], 1, VSI_NN_SP_SR9); + CHECK_STATUS_FAIL_GOTO(status, final); + + /* loop inst0: r2 = in - v11 | v11 = v11 */ + status = vsi_nn_sp_sub(&sp_insts_param[2], VSI_NN_SP_SRIN, VSI_NN_SP_VR11, VSI_NN_SP_SR2); + status |= vsi_nn_sp_move(&sp_insts_param[2], VSI_NN_SP_VR11, VSI_NN_SP_VR11); + /* loop inst1: r8 = v12 * r9 | r7 = r4 + r6 | out = r7 */ + status |= vsi_nn_sp_mul(&sp_insts_param[3], VSI_NN_SP_VR12, VSI_NN_SP_SR9, VSI_NN_SP_SR8); + status |= vsi_nn_sp_add(&sp_insts_param[3], VSI_NN_SP_SR4, VSI_NN_SP_SR6, VSI_NN_SP_SR7); + status |= vsi_nn_sp_move(&sp_insts_param[3], VSI_NN_SP_SR7, VSI_NN_SP_SROUT); + /* loop inst2: r6 = r5 * r2 | v12 = r7 + r8 | r4 = r3 */ + status |= vsi_nn_sp_mul(&sp_insts_param[4], VSI_NN_SP_SR5, VSI_NN_SP_SR2, VSI_NN_SP_SR6); + status |= vsi_nn_sp_add(&sp_insts_param[4], VSI_NN_SP_SR7, VSI_NN_SP_SR8, VSI_NN_SP_VR12); + status |= vsi_nn_sp_move(&sp_insts_param[4], VSI_NN_SP_SR3, VSI_NN_SP_SR4); + /* loop inst3: r1 = setup(r2) | r5 = pwlMul * pwlMul | r2 = pwlAdd + pwlAdd | r1 = r3*/ + status |= vsi_nn_sp_pwl_setup0(&sp_insts_param[5], VSI_NN_SP_SR2, VSI_NN_SP_SR1); + status |= vsi_nn_sp_mul(&sp_insts_param[5], VSI_NN_SP_PWLMUL, VSI_NN_SP_PWLMUL, VSI_NN_SP_SR5); + status |= vsi_nn_sp_sub(&sp_insts_param[5], VSI_NN_SP_PWLADD, VSI_NN_SP_PWLADD, VSI_NN_SP_SR2); + status |= vsi_nn_sp_move(&sp_insts_param[5], VSI_NN_SP_SR1, VSI_NN_SP_SR3); + CHECK_STATUS_FAIL_GOTO(status, final ); + + attr.flush_cycle_num = 18; + + attr.ignored_leading_outputs = 4; + attr.ignored_leading_v11_rd = 0; + attr.ignored_leading_v11_wr = 0; + attr.ignored_leading_v12_rd = fifo_depth + 3; + attr.ignored_leading_v12_wr = 4; + + attr.num_of_v12_rd_in_flush_cycle = 4; + attr.num_of_v12_wr_in_flush_cycle = 5; + + attr.input_tile_mapping = VSI_NN_SP_ATTR_INPUT_TILE_MAPPING_XYMERGE; + attr.input_setup = VSI_NN_SP_INPUT_SETUP_SINGLE_INPUT; + + attr.prog_init_instr_num = spInitInstsNum; + attr.prog_loop_instr_num = spLoopInstsNum; + attr.v12_reset_at_start = VSI_NN_SP_V_RESET_AT_START_RESET; + + attr.split_axis = VSI_SP_ATTR_SPLIT_ON_AXIS_XY; + attr.split_tilex_equal_imgx = TRUE; + attr.split_max_vector_depth = max_vector_depth; + + attr.prog_init_instr_num = spInitInstsNum; + attr.prog_loop_instr_num = spLoopInstsNum; + + spinst = vsi_nn_create_spinst(graph); + CHECK_PTR_FAIL_GOTO( spinst, "Create spInst fail.", final ); + status = vsi_nn_add_spinst_insts(spinst, sp_insts_param, spInstsNum); + status |= vsi_nn_set_spinst_attr(spinst, attr); + CHECK_STATUS_FAIL_GOTO(status, final ); + + vx_lut_params.lut_function = VX_NN_ACTIVATION_CUSTOM; + vx_lut_params.in_lut = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, VSI_NN_SP_LUT_MAX_SIZE); + vx_lut_params.out_lut = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, VSI_NN_SP_LUT_MAX_SIZE); + + sp_lut_params.act_type = VSI_NN_SP_ACT_LINEAR_EXP; + sp_lut_params.params[0] = beta; + sp_lut_params.params[1] = 0; + vsi_nn_sp_lut(vx_lut_params.in_lut, vx_lut_params.out_lut, &sp_lut_params); + + inputs_tensor[0] = input0->t; + inputs_tensor[1] = input1->t; + outputs_tensor[0] = output0->t; + outputs_tensor[1] = output1->t; + node = vxStreamProcessorNode( + graph->g, + inputs_tensor, + input_count, + outputs_tensor, + output_count, + spinst->sp, + &vx_lut_params); + +final: + if (node) + 
{ + vxAssignNodeQueryCallback(node, softmax_z_direction_exp_query); + } + + if (spinst) + { + vsi_nn_release_spinst(&spinst); + } + + if (vx_lut_params.in_lut) + { + vxReleaseLUT(&vx_lut_params.in_lut); + vx_lut_params.in_lut = NULL; + } + + if (vx_lut_params.out_lut) + { + vxReleaseLUT(&vx_lut_params.out_lut); + vx_lut_params.out_lut = NULL; + } + + return (vsi_nn_kernel_node_t)node; +} +vsi_nn_kernel_node_t vsi_nn_sp_rcp_node + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * input, + vsi_nn_tensor_t * output, + float output_scale + ) +{ + const int32_t spLoopInstsNum = 3; + const int32_t spInstsNum = spLoopInstsNum; + + const uint32_t input_count = 1; + const uint32_t output_count = 1; + vx_tensor inputs_tensor[1] = {NULL}; + vx_tensor outputs_tensor[1] = {NULL}; + vx_node node = NULL; + int32_t max_vector_depth = graph->ctx->config.sp_vector_depth; + + vsi_nn_spinst_t *spinst = NULL; + vsi_nn_spinst_inst_param sp_insts_param[3]; + vsi_nn_spinst_attr_t attr; + + vsi_nn_sp_lut_params sp_lut_params; + vx_lut_params_s vx_lut_params; + + vsi_status status = VSI_FAILURE; + + memset(sp_insts_param, 0, sizeof(vsi_nn_spinst_inst_param) * spInstsNum); + vsi_nn_init_spinst_attr(&attr); + memset(&sp_lut_params, 0, sizeof(vsi_nn_sp_lut_params)); + memset(&vx_lut_params, 0, sizeof(vx_lut_params_s)); + + /* loop inst0: r1 = pwlSetup(v12) | r5 = pwlMul() | r2 = pwlAdd() | r8 = r1 */ + status = vsi_nn_sp_pwl_setup0(&sp_insts_param[0], VSI_NN_SP_VR12, VSI_NN_SP_SR1); + status |= vsi_nn_sp_mul(&sp_insts_param[0], VSI_NN_SP_PWLMUL, VSI_NN_SP_PWLMUL, VSI_NN_SP_SR5); + status |= vsi_nn_sp_sub(&sp_insts_param[0], VSI_NN_SP_PWLADD, VSI_NN_SP_PWLADD, VSI_NN_SP_SR2); + status |= vsi_nn_sp_move(&sp_insts_param[0], VSI_NN_SP_SR1, VSI_NN_SP_SR8); + /* loop inst1: r6 = r5 * r2 | r7 = r4 + r6 | r4 = r8 */ + status |= vsi_nn_sp_mul(&sp_insts_param[1], VSI_NN_SP_SR5, VSI_NN_SP_SR2, VSI_NN_SP_SR6); + status |= vsi_nn_sp_add(&sp_insts_param[1], VSI_NN_SP_SR4, VSI_NN_SP_SR6, VSI_NN_SP_SR7); + status |= vsi_nn_sp_move(&sp_insts_param[1], VSI_NN_SP_SR4, VSI_NN_SP_SR8); + /* loop inst1: v12 = r7 * r3 */ + status |= vsi_nn_sp_mul(&sp_insts_param[2], VSI_NN_SP_SR7, VSI_NN_SP_SR3, VSI_NN_SP_VR12); + CHECK_STATUS_FAIL_GOTO(status, final ); + + attr.input_tile_mapping = VSI_NN_SP_ATTR_INPUT_TILE_MAPPING_XYMERGE; + + attr.input_setup = VSI_NN_SP_INPUT_SETUP_V12; + attr.prog_loop_instr_num = spLoopInstsNum; + attr.ignored_leading_v12_wr = 4; + attr.ignored_leading_v12_rd = 0; + attr.flush_cycle_num = 14; + + attr.num_of_v12_wr_in_flush_cycle = 5; + + attr.split_axis = VSI_SP_ATTR_SPLIT_ON_AXIS_YZ; + attr.split_max_vector_depth = max_vector_depth; + + VSI_NN_SP_ATTR_SET_CONST_TO_SR3(attr, 1.0f / output_scale); + + spinst = vsi_nn_create_spinst(graph); + CHECK_PTR_FAIL_GOTO( spinst, "Create spInst fail.", final ); + status = vsi_nn_add_spinst_insts(spinst, sp_insts_param, spInstsNum); + status |= vsi_nn_set_spinst_attr(spinst, attr); + CHECK_STATUS_FAIL_GOTO(status, final ); + + inputs_tensor[0] = input->t; + outputs_tensor[0] = output->t; + + vx_lut_params.lut_function = VX_NN_ACTIVATION_CUSTOM; + vx_lut_params.in_lut = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, VSI_NN_SP_LUT_MAX_SIZE); + vx_lut_params.out_lut = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, VSI_NN_SP_LUT_MAX_SIZE); + + sp_lut_params.act_type = VSI_NN_SP_ACT_RCP; + vsi_nn_sp_lut(vx_lut_params.in_lut, vx_lut_params.out_lut, &sp_lut_params); + + node = vxStreamProcessorNode( + graph->g, + inputs_tensor, + input_count, + outputs_tensor, + output_count, + spinst->sp, + 
&vx_lut_params); + +final: + if (spinst) + { + vsi_nn_release_spinst(&spinst); + } + + if (vx_lut_params.in_lut) + { + vxReleaseLUT(&vx_lut_params.in_lut); + vx_lut_params.in_lut = NULL; + } + if (vx_lut_params.out_lut) + { + vxReleaseLUT(&vx_lut_params.out_lut); + vx_lut_params.out_lut = NULL; + } + + return (vsi_nn_kernel_node_t)node; +} + +vsi_nn_spinst_t * vsi_nn_sp_times_inst + ( + vx_context context, + int32_t fifo_depth, + int32_t max_vector_depth + ) +{ + vsi_status status = VSI_FAILURE; + const int32_t spInitInstsNum = 0; + const int32_t spLoopInstsNum = fifo_depth > 4 ? 1 : fifo_depth > 1 ? 3 : 5; + const int32_t spInstsNum = spInitInstsNum + spLoopInstsNum; + vsi_nn_spinst_t *spinst = NULL; + vsi_nn_spinst_inst_param sp_insts_param[5]; + vsi_nn_spinst_attr_t attr; + + memset(sp_insts_param, 0, sizeof(vsi_nn_spinst_inst_param) * spInstsNum); + vsi_nn_init_spinst_attr(&attr); + + if (fifo_depth > 4) + { + /* loop inst0: out = v12 * in | v12 = v12 */ + status = vsi_nn_sp_mul(&sp_insts_param[0], VSI_NN_SP_VR12, VSI_NN_SP_SRIN, VSI_NN_SP_SROUT); + status |= vsi_nn_sp_move(&sp_insts_param[0], VSI_NN_SP_VR12, VSI_NN_SP_VR12); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if (fifo_depth > 1) + { + /* loop inst0: out = v12 * in | v12 = v12 */ + status = vsi_nn_sp_mul(&sp_insts_param[0], VSI_NN_SP_VR12, VSI_NN_SP_SRIN, VSI_NN_SP_SROUT); + status |= vsi_nn_sp_move(&sp_insts_param[0], VSI_NN_SP_VR12, VSI_NN_SP_VR12); + /* loop inst1: nop */ + status |= vsi_nn_sp_nop(&sp_insts_param[1]); + /* loop inst2: nop */ + status |= vsi_nn_sp_nop(&sp_insts_param[2]); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else + { + /* loop inst0: out = v12 * in | v12 = v12 */ + status = vsi_nn_sp_mul(&sp_insts_param[0], VSI_NN_SP_VR12, VSI_NN_SP_SRIN, VSI_NN_SP_SROUT); + status |= vsi_nn_sp_move(&sp_insts_param[0], VSI_NN_SP_VR12, VSI_NN_SP_VR12); + /* loop inst1: nop */ + status |= vsi_nn_sp_nop(&sp_insts_param[1]); + /* loop inst2: nop */ + status |= vsi_nn_sp_nop(&sp_insts_param[2]); + /* loop inst3: nop */ + status |= vsi_nn_sp_nop(&sp_insts_param[3]); + /* loop inst4: nop */ + status |= vsi_nn_sp_nop(&sp_insts_param[4]); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + + attr.input_tile_mapping = VSI_NN_SP_ATTR_INPUT_TILE_MAPPING_XYMERGE; + attr.input_setup = VSI_NN_SP_INPUT_SETUP_SINGLE_INPUT; + + attr.prog_init_instr_num = spInitInstsNum; + attr.prog_loop_instr_num = spLoopInstsNum; + + attr.flush_cycle_num = 0; + + attr.ignored_leading_outputs = 0; + attr.ignored_leading_v11_rd = 0; + attr.ignored_leading_v11_wr = 0; + + attr.num_of_v11_rd_in_flush_cycle = 0; + attr.num_of_v11_wr_in_flush_cycle = 0; + + attr.split_axis = VSI_SP_ATTR_SPLIT_ON_AXIS_XY; + attr.split_tilex_equal_imgx = TRUE; + attr.split_max_vector_depth = max_vector_depth; + + spinst = vsi_nn_create_spinst_by_context(context); + CHECK_PTR_FAIL_GOTO( spinst, "Create spInst fail.", final ); + status = vsi_nn_add_spinst_insts(spinst, sp_insts_param, spInstsNum); + status |= vsi_nn_set_spinst_attr(spinst, attr); + CHECK_STATUS_FAIL_GOTO(status, final ); + +final: + return spinst; +} + +DEF_SP_KERNEL_QUERY(times_query) + ( + vsi_nn_kernel_node_t node + ) +{ + vsi_status status = VSI_FAILURE; + vx_size index = 0; + vx_size tile_size[2] = {0}; + vsi_nn_spinst_t *spinst = NULL; + int32_t fifo_depth = 0; + int32_t max_vector_depth = 0; + vx_context ctx = vxGetContext((vx_reference)node); + vx_hardware_caps_params_ext2_t hw_param; + + memset(&hw_param, 0, sizeof(vx_hardware_caps_params_ext2_t)); + status = vxQueryHardwareCaps(ctx, 
(vx_hardware_caps_params_t*)(&hw_param), sizeof(vx_hardware_caps_params_ext2_t)); + CHECK_STATUS_FAIL_GOTO( status, final ); + + status = vxQueryNode(node, VX_NODE_SWTILING_TILE_XY, tile_size, sizeof(tile_size)); + CHECK_STATUS_FAIL_GOTO( status, final ); + status = vxQueryNode(node, VX_NODE_SPINST_INDEX, &index, sizeof(index)); + CHECK_STATUS_FAIL_GOTO( status, final ); + + fifo_depth = (int32_t)ceil((float)(tile_size[0] * tile_size[1]) / (float)hw_param.streamProcessorExecCount); + max_vector_depth = hw_param.streamProcessorVectorSize; + + spinst = vsi_nn_sp_times_inst(ctx, fifo_depth, max_vector_depth); + + status = vxSetParameterByIndex( node, (uint32_t)index, (vx_reference)spinst->sp ); + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + if (spinst) + { + vsi_nn_release_spinst(&spinst); + } + + return status; +} + +vsi_nn_kernel_node_t vsi_nn_sp_softmax_z_direction_times_node + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * input0, + vsi_nn_tensor_t * input1, + vsi_nn_tensor_t * output + ) +{ + const uint32_t input_count = 2; + const uint32_t output_count = 1; + vx_tensor inputs_tensor[2] = {NULL, NULL}; + vx_tensor outputs_tensor[1] = {NULL}; + vx_node node = NULL; + int32_t max_vector_depth = graph->ctx->config.sp_vector_depth; + int32_t fifo_depth = 5; + + vsi_nn_spinst_t *spinst = NULL; + + spinst = vsi_nn_sp_times_inst(graph->ctx->c, fifo_depth, max_vector_depth); + + inputs_tensor[0] = input0->t; + inputs_tensor[1] = input1->t; + outputs_tensor[0] = output->t; + node = vxStreamProcessorNode( + graph->g, + inputs_tensor, + input_count, + outputs_tensor, + output_count, + spinst->sp, + NULL); + + if (node) + { + vxAssignNodeQueryCallback(node, times_query); + } + + if (spinst) + { + vsi_nn_release_spinst(&spinst); + } + + return (vsi_nn_kernel_node_t)node; +} + +/* +** This program requires sum operation in the z dimension. +** Instead of using the SUM Engine, the sum needs to be performed +** by Stream Processor instructions. 
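+** The graph built below chains four SP nodes through dummy tensors: a
+** max node that tracks the running maximum along z in v11 while passing
+** the scaled, clamped input through, an exp node that evaluates
+** exp((x - max) * beta) via the LINEAR_EXP LUT and accumulates the sum
+** of exponentials into v12, a reciprocal node that folds the output
+** quantization scale into 1 / sum via the RCP LUT, and a multiply node
+** that scales each exponential by that reciprocal.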
+*/ +vsi_nn_kernel_node_t softmax_z_direction + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + const vsi_nn_kernel_param_t * params + ) +{ + vsi_nn_kernel_node_t node = NULL; + vsi_nn_tensor_attr_t attr; + vsi_nn_tensor_t * dummy_tensor[3] = {NULL}; + vsi_nn_tensor_t * output_tensor[2] = {NULL}; + int32_t axis = 2; + float beta = vsi_nn_kernel_param_get_float32( params, "beta" ); + float output_scale = vsi_nn_get_tensor_scale(outputs[0]); + + memcpy( &attr, &outputs[0]->attr, sizeof(vsi_nn_tensor_attr_t) ); + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; + attr.is_const = FALSE; + attr.vtl = TRUE; + attr.is_dummy = TRUE; + attr.size[axis] = 1; + dummy_tensor[0] = vsi_nn_CreateTensor( graph, &attr ); + CHECK_PTR_FAIL_GOTO( dummy_tensor[0], "Create dummy_tensor fail.", final ); + dummy_tensor[1] = vsi_nn_CreateTensor( graph, &attr ); + CHECK_PTR_FAIL_GOTO( dummy_tensor[1], "Create dummy_tensor fail.", final ); + dummy_tensor[2] = vsi_nn_CreateTensor( graph, &attr ); + CHECK_PTR_FAIL_GOTO( dummy_tensor[2], "Create dummy_tensor fail.", final ); + + memcpy( &attr, &outputs[0]->attr, sizeof(vsi_nn_tensor_attr_t) ); + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; + attr.is_const = FALSE; + attr.vtl = TRUE; + output_tensor[0] = vsi_nn_CreateTensor( graph, &attr ); + CHECK_PTR_FAIL_GOTO( output_tensor[0], "Create tensor fail.", final ); + output_tensor[1] = vsi_nn_CreateTensor( graph, &attr ); + CHECK_PTR_FAIL_GOTO( output_tensor[1], "Create tensor fail.", final ); + + node = vsi_nn_sp_max_axis2_node(graph, inputs[0], output_tensor[0], dummy_tensor[0]); + CHECK_PTR_FAIL_GOTO( node, "Create sp_max_axis2 fail.", final ); + node = vsi_nn_sp_softmax_z_direction_exp_node(graph, output_tensor[0], dummy_tensor[0], + output_tensor[1], dummy_tensor[1], beta); + CHECK_PTR_FAIL_GOTO( node, "Create exp_y_direction fail.", final ); + node = vsi_nn_sp_rcp_node(graph, dummy_tensor[1], dummy_tensor[2], output_scale); + CHECK_PTR_FAIL_GOTO( node, "Create sp_rcp fail.", final ); + node = vsi_nn_sp_softmax_z_direction_times_node(graph, output_tensor[1], dummy_tensor[2], outputs[0]); + CHECK_PTR_FAIL_GOTO( node, "Create softmax_times fail.", final ); + +final: + vsi_safe_release_tensor(dummy_tensor[0]); + vsi_safe_release_tensor(dummy_tensor[1]); + vsi_safe_release_tensor(dummy_tensor[2]); + vsi_safe_release_tensor(output_tensor[0]); + vsi_safe_release_tensor(output_tensor[1]); + + return node; +} /* softmax_z_direction() */ + +#endif diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c index e9a9272..aa47362 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c @@ -35,6 +35,7 @@ #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_math.h" #include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_dtype_util.h" #include "libnnext/vsi_nn_libnnext_resource.h" #if VSI_USE_VXC_BINARY @@ -118,7 +119,14 @@ static void _kernel_clear_source static vsi_bool _check_shader_support(vsi_nn_graph_t* graph); -static vsi_bool vsi_nn_kernel_is_asymmtric_int8 +static vsi_bool _check_stream_process_support + ( + vsi_nn_graph_t* graph, + vsi_nn_tensor_t** inputs, + size_t input_num + ); + +vsi_bool vsi_nn_kernel_is_supported_types ( vsi_nn_tensor_t** inputs, size_t input_num, @@ -1222,7 +1230,7 @@ vsi_nn_kernel_node_t vsi_nn_kernel_selector /* Skip evis and cl when disable shader */ if ( (type == 
VSI_NN_KERNEL_TYPE_EVIS || type == VSI_NN_KERNEL_TYPE_CL) && ( _check_shader_support(graph) == FALSE || - vsi_nn_kernel_is_asymmtric_int8(inputs, input_num, outputs, output_num) ) ) + vsi_nn_kernel_is_supported_types(inputs, input_num, outputs, output_num) == FALSE ) ) { continue; } @@ -1234,8 +1242,8 @@ vsi_nn_kernel_node_t vsi_nn_kernel_selector } /* Skip StreamProcesor if not support */ - if( type == VSI_NN_KERNEL_TYPE_SP - && !graph->ctx->config.support_stream_processor ) + if( type == VSI_NN_KERNEL_TYPE_SP && + _check_stream_process_support(graph, inputs, input_num) == FALSE ) { continue; } @@ -1661,7 +1669,7 @@ static vsi_bool _check_shader_support(vsi_nn_graph_t* graph) return FALSE; } -static vsi_bool vsi_nn_kernel_is_asymmtric_int8 +vsi_bool vsi_nn_kernel_is_supported_types ( vsi_nn_tensor_t** inputs, size_t input_num, @@ -1673,25 +1681,45 @@ static vsi_bool vsi_nn_kernel_is_asymmtric_int8 for (i = 0; i < input_num; i++) { - if ( inputs[i] && - inputs[i]->attr.dtype.vx_type == VSI_NN_TYPE_INT8 && - inputs[i]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC - ) + if ( inputs[i] && vsi_nn_TypeGetBits(inputs[i]->attr.dtype.vx_type) == 4 ) { - return TRUE; + return FALSE; } } for (i = 0; i < output_num; i++) { - if ( outputs[i] && - outputs[i]->attr.dtype.vx_type == VSI_NN_TYPE_INT8 && - outputs[i]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC - ) + if ( outputs[i] && vsi_nn_TypeGetBits(outputs[i]->attr.dtype.vx_type) == 4 ) { - return TRUE; + return FALSE; } } - return FALSE; + return TRUE; +} + +static vsi_bool _check_stream_process_support + ( + vsi_nn_graph_t* graph, + vsi_nn_tensor_t** inputs, + size_t input_num + ) +{ + if ( graph->ctx->config.support_stream_processor == 0 ) + { + return FALSE; + } + + if ( graph->ctx->config.sp_exec_count == 0 ) + { + return FALSE; + } + + if (inputs && input_num > 0 && + inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32) + { + return FALSE; + } + + return TRUE; } \ No newline at end of file diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c index 105027d..d78769e 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c @@ -653,4 +653,61 @@ vsi_bool vsi_nn_kernel_optimize_group_norm_shape } return status; -} \ No newline at end of file +} + +vsi_bool vsi_nn_kernel_optimize_scatter_elements_shape + ( + const vsi_size_t* shape_x, const vsi_size_t rank_x, const int32_t axis, + vsi_size_t* out_shape_x, uint32_t* out_rank_x, int32_t* out_axis, vsi_size_t max_size + ) +{ + vsi_bool ret = TRUE; + vsi_size_t i = 0; + vsi_size_t rank_in = 0; + vsi_size_t dims = 0; + vsi_size_t innerSize = 1; + vsi_size_t outerSize = 1; + vsi_size_t axisSize = shape_x[axis]; + + for (i = 0; i < (size_t)axis; i++) + { + innerSize *= shape_x[i]; + } + + for (i = axis + 1; i < rank_x; i++) + { + outerSize *= shape_x[i]; + } + + rank_in += element_fill_dim(out_shape_x, rank_in, max_size, innerSize); + dims = element_fill_dim(out_shape_x, rank_in, max_size, axisSize); + if (dims == 0) + { + *out_axis = (int32_t)rank_in; + out_shape_x[rank_in ++] = 1; + } + else + { + *out_axis = (int32_t)rank_in; + } + + rank_in += dims; + + rank_in += element_fill_dim(out_shape_x, rank_in, max_size, outerSize); + + if ( 0 == rank_in ) + { + out_shape_x[0] = 1; + out_shape_x[1] = 1; + rank_in = 2; + } + else if ( 1 == rank_in ) + { + out_shape_x[1] = 1; + rank_in = 2; + } + + 
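+    /* Assuming every collapsed size fits within max_size, the shape reduces
+     * to { innerSize, axisSize, outerSize } with out_axis pointing at the
+     * middle dimension, e.g. shape { 2, 3, 4, 5 } with axis = 2 becomes
+     * { 6, 4, 5 } with out_axis = 1.
+     */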
*out_rank_x = (uint32_t)rank_in; + + return ret; +} /* vsi_nn_kernel_optimize_scatter_elements_shape() */ diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_lut.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_lut.c index dd32c01..dfdc3dd 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_lut.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_lut.c @@ -199,6 +199,31 @@ static float softsign_eval(float x) return x / (1 + vsi_abs(x)); } +static float linear_exp_eval(float x, vsi_nn_kernel_lut_params *lut_param) +{ + float a = lut_param->params[0]; + float b = lut_param->params[1]; + + return expf(x * a + b); +} + +static float linear_rsqrt_eval(float x, vsi_nn_kernel_lut_params *lut_param) +{ + float a = lut_param->params[0]; + float b = lut_param->params[1]; + float scale = lut_param->params[2]; + + return scale / sqrtf(a * x + b); +} + +static float linear_sigmoid_eval(float x, vsi_nn_kernel_lut_params *lut_param) +{ + float a = lut_param->params[0]; + float b = lut_param->params[1]; + + return 1.0f / (1 + expf(a * x + b));; +} + static float vsi_nn_kernel_lut_activation(float data, vsi_nn_kernel_lut_params *lut_param) { float result = 0; @@ -261,6 +286,15 @@ static float vsi_nn_kernel_lut_activation(float data, vsi_nn_kernel_lut_params * case VSI_NN_KERNEL_LUT_SOFTSIGN: result = softsign_eval(data); break; + case VSI_NN_KERNEL_LUT_LINEAR_EXP: + result = linear_exp_eval(data, lut_param); + break; + case VSI_NN_KERNEL_LUT_LINEAR_RSQRT: + result = linear_rsqrt_eval(data, lut_param); + break; + case VSI_NN_KERNEL_LUT_LINEAR_SIGMOID: + result = linear_sigmoid_eval(data, lut_param); + break; default: VSILOGE( "unsupported activation function:%d", lut_param->act_type ); break; diff --git a/src/tim/vx/internal/src/kernel/vx/convolutional.c b/src/tim/vx/internal/src/kernel/vx/convolutional.c index 89c8fa4..2f9be49 100644 --- a/src/tim/vx/internal/src/kernel/vx/convolutional.c +++ b/src/tim/vx/internal/src/kernel/vx/convolutional.c @@ -43,7 +43,8 @@ static vsi_bool _build_vx_conv2d_param int32_t dilation_h, int32_t dilation_w, int32_t multiplier, vsi_enum overflow_policy, vsi_enum rounding_policy, - vsi_enum down_scale_size_rounding + vsi_enum down_scale_size_rounding, + vsi_enum pad_mode ) { vx_nn_convolution_params_ext_t * p1 = NULL; @@ -78,6 +79,7 @@ static vsi_bool _build_vx_conv2d_param p1->khr.down_scale_size_rounding = (vx_enum)down_scale_size_rounding; p1->padding_x_right = (uint32_t)pad_w_end; p1->padding_y_bottom = (uint32_t)pad_h_end; + p1->pad_mode = (vx_enum)pad_mode; param->depth_multiplier = multiplier; param->stride_x = (uint32_t)stride_w; param->stride_y = (uint32_t)stride_h; @@ -131,7 +133,8 @@ static vsi_bool _build_vx_conv3d_param int32_t dilation_d, int32_t dilation_h, int32_t dilation_w, int32_t multiplier, vsi_enum overflow_policy, vsi_enum rounding_policy, - vsi_enum down_scale_size_rounding + vsi_enum down_scale_size_rounding, + vsi_enum pad_mode ) { VSI_ASSERT( stride_d > 0 ); @@ -176,6 +179,7 @@ static vsi_bool _build_vx_conv3d_param param->stride_w = (uint32_t)stride_w; param->stride_h = (uint32_t)stride_h; param->stride_d = (uint32_t)stride_d; + param->pad_mode = (vx_enum)pad_mode; return TRUE; } /* _build_vx_conv2d_param() */ @@ -299,7 +303,8 @@ REGISTER_CONV_OPENVX_KERNEL( conv1d ) 0, vsi_nn_kernel_param_get_int32(params, "overflow_policy"), vsi_nn_kernel_param_get_int32(params, "rounding_policy"), - vsi_nn_kernel_param_get_int32(params, "down_scale_size_rounding") + vsi_nn_kernel_param_get_int32(params, "down_scale_size_rounding"), + 
vsi_nn_kernel_param_get_int32(params, "pad_mode") ); temp_tensors[0] = _expand_tensor_dim( inputs[0]->t, @@ -374,7 +379,8 @@ REGISTER_CONV_OPENVX_KERNEL( depthwise_conv1d ) vsi_nn_kernel_param_get_int32(params, "multiplier"), vsi_nn_kernel_param_get_int32(params, "overflow_policy"), vsi_nn_kernel_param_get_int32(params, "rounding_policy"), - vsi_nn_kernel_param_get_int32(params, "down_scale_size_rounding") + vsi_nn_kernel_param_get_int32(params, "down_scale_size_rounding"), + vsi_nn_kernel_param_get_int32(params, "pad_mode") ); temp_tensors[0] = _expand_tensor_dim( inputs[0]->t, @@ -493,7 +499,8 @@ REGISTER_CONV_OPENVX_KERNEL( conv2d ) 0, vsi_nn_kernel_param_get_int32(params, "overflow_policy"), vsi_nn_kernel_param_get_int32(params, "rounding_policy"), - vsi_nn_kernel_param_get_int32(params, "down_scale_size_rounding") + vsi_nn_kernel_param_get_int32(params, "down_scale_size_rounding"), + vsi_nn_kernel_param_get_int32(params, "pad_mode") ); node = vxConvolutionLayer( graph->g, @@ -524,7 +531,8 @@ REGISTER_CONV_OPENVX_KERNEL( depthwise_conv2d ) vsi_nn_kernel_param_get_int32(params, "multiplier"), vsi_nn_kernel_param_get_int32(params, "overflow_policy"), vsi_nn_kernel_param_get_int32(params, "rounding_policy"), - vsi_nn_kernel_param_get_int32(params, "down_scale_size_rounding") + vsi_nn_kernel_param_get_int32(params, "down_scale_size_rounding"), + vsi_nn_kernel_param_get_int32(params, "pad_mode") ); node = vxConvolutionLayer( graph->g, @@ -606,7 +614,8 @@ REGISTER_CONV_OPENVX_KERNEL( conv3d ) vsi_nn_kernel_param_get_int32(params, "depth_multiplier"), vsi_nn_kernel_param_get_int32(params, "overflow_policy"), vsi_nn_kernel_param_get_int32(params, "rounding_policy"), - vsi_nn_kernel_param_get_int32(params, "down_scale_size_rounding") + vsi_nn_kernel_param_get_int32(params, "down_scale_size_rounding"), + vsi_nn_kernel_param_get_int32(params, "pad_mode") ); node = vxConv3dLayer( graph->g, diff --git a/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c b/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c index c6edaaa..fffb3aa 100644 --- a/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c @@ -269,4 +269,84 @@ REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( tanh ) return (vsi_nn_kernel_node_t)node; } /* tanh() */ +REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( relu1 ) +{ + vx_node node = NULL; + + node = vxActivationLayer( + graph->g, + inputs[0]->t, + VX_CONVOLUTIONAL_NETWORK_ACTIVATION_RELU1, + 0, + 0, + outputs[0]->t + ); + + return (vsi_nn_kernel_node_t)node; +} /* relu1() */ + +REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( relu6 ) +{ + vx_node node = NULL; + + node = vxActivationLayer( + graph->g, + inputs[0]->t, + VX_CONVOLUTIONAL_NETWORK_ACTIVATION_RELU6, + 0, + 0, + outputs[0]->t + ); + + return (vsi_nn_kernel_node_t)node; +} /* relu6() */ + +REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( rsqrt ) +{ + vx_node node = NULL; + + node = vxActivationLayer( + graph->g, + inputs[0]->t, + VX_CONVOLUTIONAL_NETWORK_ACTIVATION_RSQRT, + 0, + 0, + outputs[0]->t + ); + + return (vsi_nn_kernel_node_t)node; +} /* rsqrt() */ + +REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( sqrt ) +{ + vx_node node = NULL; + + node = vxActivationLayer( + graph->g, + inputs[0]->t, + VX_CONVOLUTIONAL_NETWORK_ACTIVATION_SQRT, + 0, + 0, + outputs[0]->t + ); + + return (vsi_nn_kernel_node_t)node; +} /* sqrt() */ + +REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( softrelu ) +{ + vx_node node = NULL; + + node = vxActivationLayer( + graph->g, + inputs[0]->t, + VX_CONVOLUTIONAL_NETWORK_ACTIVATION_SOFTRELU, + 0, + 0, + 
outputs[0]->t + ); + + return (vsi_nn_kernel_node_t)node; +} /* softrelu() */ + #undef REGISTER_ELTWISE_UNARY_OPENVX_KERNEL diff --git a/src/tim/vx/internal/src/kernel/vx/pad2_vx.c b/src/tim/vx/internal/src/kernel/vx/pad2_vx.c index d67751b..a458e38 100644 --- a/src/tim/vx/internal/src/kernel/vx/pad2_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/pad2_vx.c @@ -65,6 +65,7 @@ REGISTER_PAD2_OPENVX_KERNEL( pad2 ) int32_t pad_front_array[VSI_NN_MAX_DIM_NUM] = {0}; int32_t pad_back_array[VSI_NN_MAX_DIM_NUM] = {0}; vsi_nn_tensor_t *convert_tensor = NULL; + vsi_bool release_intermediate_tensor = TRUE; float const_val = vsi_nn_kernel_param_get_float32(params, "const_val"); memset(¶m, 0, sizeof(param)); @@ -98,14 +99,18 @@ REGISTER_PAD2_OPENVX_KERNEL( pad2 ) } else { - convert_tensor = vsi_nn_reshape_tensor( graph, - inputs[0], inputs[0]->attr.size, inputs[0]->attr.dim_num ); + convert_tensor = inputs[0]; + release_intermediate_tensor = FALSE; } node = vxTensorPadNode( graph->g, convert_tensor->t, outputs[0]->t, ¶m, sizeof(param) ); vxReleaseScalar( ¶m.pad_const ); - vsi_safe_release_tensor(convert_tensor); + + if (release_intermediate_tensor) + { + vsi_safe_release_tensor(convert_tensor); + } return (vsi_nn_kernel_node_t)node; } /* pad2() */ diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/bucketize.cl b/src/tim/vx/internal/src/libnnext/ops/cl/bucketize.cl new file mode 100644 index 0000000..4b7ab04 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/bucketize.cl @@ -0,0 +1,281 @@ +#pragma OPENCL EXTENSION CL_VIV_asm : enable + +#define BUCKETIZE_F32_2D_SH_IMPL(name, comp_op) \ +__kernel void bucketize_##name \ + ( \ + __read_only image2d_t input, \ + __read_only image2d_t boundaries, \ + __write_only image2d_t output, \ + int boundaries_size, \ + float input0_scale, \ + float input0_tail, \ + float input1_scale, \ + float input1_tail \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + \ + float4 src0 = read_imagef(input, coord); \ + \ + int2 pos = 0; \ + do \ + { \ + float4 src1 = read_imagef(boundaries, pos); \ + if ((src0.x) comp_op (src1.x)) \ + { \ + break; \ + } \ + pos.x ++; \ + } while(pos.x < boundaries_size); \ + \ + write_imagei(output, coord, pos.xxxx); \ +} +BUCKETIZE_F32_2D_SH_IMPL(F32_F32toI32_2D, <=) +BUCKETIZE_F32_2D_SH_IMPL(right_F32_F32toI32_2D, <) + +#define BUCKETIZE_F32_SH_IMPL(name, comp_op) \ +__kernel void bucketize_##name \ + ( \ + __read_only image2d_array_t input, \ + __read_only image2d_t boundaries, \ + __write_only image2d_array_t output, \ + int boundaries_size, \ + float input0_scale, \ + float input0_tail, \ + float input1_scale, \ + float input1_tail \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + float4 src0 = read_imagef(input, coord); \ + \ + int2 pos = 0; \ + do \ + { \ + float4 src1 = read_imagef(boundaries, pos); \ + if ((src0.x) comp_op (src1.x)) \ + { \ + break; \ + } \ + pos.x ++; \ + } while(pos.x < boundaries_size); \ + \ + write_imagei(output, coord, pos.xxxx); \ +} +BUCKETIZE_F32_SH_IMPL(F32_F32toI32, <=) +BUCKETIZE_F32_SH_IMPL(right_F32_F32toI32, <) + +#define BUCKETIZE_I32_2D_SH_IMPL(name, comp_op) \ +__kernel void bucketize_##name \ + ( \ + __read_only image2d_t input, \ + __read_only image2d_t boundaries, \ + __write_only image2d_t output, \ + int boundaries_size, \ + float input0_scale, \ + float input0_tail, \ + float input1_scale, \ + float input1_tail \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + \ + float4 src0 = 
convert_float4(read_imagei(input, coord)); \ + \ + int2 pos = 0; \ + src0 = src0 * input0_scale + input0_tail; \ + do \ + { \ + float4 src1 = convert_float4(read_imagei(boundaries, pos)); \ + src1 = src1 * input1_scale + input1_tail; \ + if ((src0.x) comp_op (src1.x)) \ + { \ + break; \ + } \ + pos.x ++; \ + } while(pos.x < boundaries_size); \ + \ + write_imagei(output, coord, pos.xxxx); \ +} +BUCKETIZE_I32_2D_SH_IMPL(I32_I32toI32_2D, <=) +BUCKETIZE_I32_2D_SH_IMPL(right_I32_I32toI32_2D, <) + +#define BUCKETIZE_I32_SH_IMPL(name, comp_op) \ +__kernel void bucketize_##name \ + ( \ + __read_only image2d_array_t input, \ + __read_only image2d_t boundaries, \ + __write_only image2d_array_t output, \ + int boundaries_size, \ + float input0_scale, \ + float input0_tail, \ + float input1_scale, \ + float input1_tail \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + int4 data = read_imagei(input, coord); \ + float4 src0 = convert_float4(data) * input0_scale + input0_tail; \ + \ + int2 pos = 0; \ + do \ + { \ + float4 src1 = convert_float4(read_imagei(boundaries, pos)); \ + src1 = src1 * input1_scale + input1_tail; \ + if ((src0.x) comp_op (src1.x)) \ + { \ + break; \ + } \ + pos.x ++; \ + } while(pos.x < boundaries_size); \ + \ + write_imagei(output, coord, pos.xxxx); \ +} +BUCKETIZE_I32_SH_IMPL(I32_I32toI32, <=) +BUCKETIZE_I32_SH_IMPL(right_I32_I32toI32, <) + +#define BUCKETIZE_U32_2D_SH_IMPL(name, comp_op) \ +__kernel void bucketize_##name \ + ( \ + __read_only image2d_t input, \ + __read_only image2d_t boundaries, \ + __write_only image2d_t output, \ + int boundaries_size, \ + float input0_scale, \ + float input0_tail, \ + float input1_scale, \ + float input1_tail \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + \ + float4 src0 = convert_float4(read_imageui(input, coord)); \ + \ + int2 pos = 0; \ + src0 = src0 * input0_scale + input0_tail; \ + do \ + { \ + float4 src1 = convert_float4(read_imageui(boundaries, pos)); \ + src1 = src1 * input1_scale + input1_tail; \ + if ((src0.x) comp_op (src1.x)) \ + { \ + break; \ + } \ + pos.x ++; \ + } while(pos.x < boundaries_size); \ + \ + write_imagei(output, coord, pos.xxxx); \ +} +BUCKETIZE_U32_2D_SH_IMPL(U32_U32toI32_2D, <=) +BUCKETIZE_U32_2D_SH_IMPL(right_U32_U32toI32_2D, <) + +#define BUCKETIZE_U32_SH_IMPL(name, comp_op) \ +__kernel void bucketize_##name \ + ( \ + __read_only image2d_array_t input, \ + __read_only image2d_t boundaries, \ + __write_only image2d_array_t output, \ + int boundaries_size, \ + float input0_scale, \ + float input0_tail, \ + float input1_scale, \ + float input1_tail \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + uint4 data = read_imageui(input, coord); \ + float4 src0 = convert_float4(data) * input0_scale + input0_tail; \ + \ + int2 pos = 0; \ + do \ + { \ + float4 src1 = convert_float4(read_imageui(boundaries, pos)); \ + src1 = src1 * input1_scale + input1_tail; \ + if ((src0.x) comp_op (src1.x)) \ + { \ + break; \ + } \ + pos.x ++; \ + } while(pos.x < boundaries_size); \ + \ + write_imagei(output, coord, pos.xxxx); \ +} +BUCKETIZE_U32_SH_IMPL(U32_U32toI32, <=) +BUCKETIZE_U32_SH_IMPL(right_U32_U32toI32, <) + +#define BUCKETIZE_BF16_2D_SH_IMPL(name, comp_op) \ +__kernel void bucketize_##name \ + ( \ + __read_only image2d_t input, \ + __read_only image2d_t boundaries, \ + __write_only image2d_t output, \ + int boundaries_size, \ + float input0_scale, \ + float input0_tail, \ + float input1_scale, \ 
+ float input1_tail \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + \ + uint4 data0 = read_imageui(input, coord) << 16; \ + float4 src0; \ + _viv_asm(COPY, src0, data0, 16); \ + \ + int2 pos = 0; \ + do \ + { \ + uint4 data1 = read_imageui(boundaries, pos) << 16; \ + float4 src1; \ + _viv_asm(COPY, src1, data1, 16); \ + if ((src0.x) comp_op (src1.x)) \ + { \ + break; \ + } \ + pos.x ++; \ + } while(pos.x < boundaries_size); \ + \ + write_imagei(output, coord, pos.xxxx); \ +} +BUCKETIZE_BF16_2D_SH_IMPL(BF16_BF16toI32_2D, <=) +BUCKETIZE_BF16_2D_SH_IMPL(right_BF16_BF16toI32_2D, <) + +#define BUCKETIZE_BF16_SH_IMPL(name, comp_op) \ +__kernel void bucketize_##name \ + ( \ + __read_only image2d_array_t input, \ + __read_only image2d_t boundaries, \ + __write_only image2d_array_t output, \ + int boundaries_size, \ + float input0_scale, \ + float input0_tail, \ + float input1_scale, \ + float input1_tail \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + uint4 data0 = read_imageui(input, coord) << 16; \ + float4 src0; \ + _viv_asm(COPY, src0, data0, 16); \ + \ + int2 pos = 0; \ + do \ + { \ + uint4 data1 = read_imageui(boundaries, pos) << 16; \ + float4 src1; \ + _viv_asm(COPY, src1, data1, 16); \ + if ((src0.x) comp_op (src1.x)) \ + { \ + break; \ + } \ + pos.x ++; \ + } while(pos.x < boundaries_size); \ + \ + write_imagei(output, coord, pos.xxxx); \ +} +BUCKETIZE_BF16_SH_IMPL(BF16_BF16toI32, <=) +BUCKETIZE_BF16_SH_IMPL(right_BF16_BF16toI32, <) diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/lppool.cl b/src/tim/vx/internal/src/libnnext/ops/cl/lppool.cl new file mode 100644 index 0000000..64068c2 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/lppool.cl @@ -0,0 +1,115 @@ + +#define LPPOOL_PROCESS(src_type, dst_type, readimage_type, conv_mode, writeimage_type) \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + int hstart = gidy * stride_y - pad_top; \ + int wstart = gidx * stride_x - pad_left; \ + int hend = min(hstart + ksize_y, height); \ + int wend = min(wstart + ksize_x, width); \ + int4 coord_out = (int4)(gidx, gidy, get_global_id(2), 0); \ + int4 coord_in = coord_out; \ + int h, w; \ + float sum_of_pow = 0; \ + dst_type out_data = (dst_type)(0); \ + src_type in_data; \ + float in_f32, out_f32; \ + hstart = max(hstart, 0); \ + wstart = max(wstart, 0); \ + for (h = hstart; h < hend; h++) \ + { \ + for (w = wstart; w < wend; w++) \ + { \ + coord_in.xy = (int2)(w, h); \ + in_data = readimage_type(input, coord_in).x; \ + in_f32 = convert_float(in_data) * inputScale + inputTail; \ + sum_of_pow += pow(fabs(in_f32),p); \ + } \ + } \ + out_f32 = pow(sum_of_pow, 1.0f / p) * outputScale + outputTail; \ + out_data.x = conv_mode(out_f32); \ + writeimage_type(output, coord_out, out_data); \ + +#define TENSOR_LPPOOL(src_name, dst_name, src_type, dst_type, readimage_type, conv_mode, writeimage_type) \ +__kernel void lppool_##src_name##to##dst_name ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int ksize_x, \ + int ksize_y, \ + int stride_x, \ + int stride_y, \ + int pad_left, \ + int pad_top, \ + int p, \ + int width, \ + int height, \ + float inputScale, \ + float inputTail, \ + float outputScale, \ + float outputTail) \ +{ \ + LPPOOL_PROCESS(src_type, dst_type, readimage_type, conv_mode, writeimage_type); \ +} + +TENSOR_LPPOOL(F32, F32, float, float4, read_imagef, convert_float, write_imagef) +TENSOR_LPPOOL(F32, U32, float, uint4, read_imagef, 
convert_uint, write_imageui) +TENSOR_LPPOOL(F32, I32, float, int4, read_imagef, convert_int, write_imagei) + +TENSOR_LPPOOL(U32, U32, uint, uint4, read_imageui, convert_uint, write_imageui) +TENSOR_LPPOOL(U32, F32, uint, float4, read_imageui, convert_float, write_imagef) +TENSOR_LPPOOL(U32, I32, uint, int4, read_imageui, convert_int, write_imagei) + +TENSOR_LPPOOL(I32, I32, int, int4, read_imagei, convert_int, write_imagei) +TENSOR_LPPOOL(I32, F32, int, float4, read_imagei, convert_float, write_imagef) +TENSOR_LPPOOL(I32, U32, int, uint4, read_imagei, convert_uint, write_imageui) + +__kernel void lppool_BF16toBF16( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int ksize_x, + int ksize_y, + int stride_x, + int stride_y, + int pad_left, + int pad_top, + int p, + int width, + int height, + float inputScale, + float inputTail, + float outputScale, + float outputTail) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int hstart = gidy * stride_y - pad_top; + int wstart = gidx * stride_x - pad_left; + int hend = min(hstart + ksize_y, height); + int wend = min(wstart + ksize_x, width); + int4 coord_out = (int4)(gidx, gidy, get_global_id(2), 0); + int4 coord_in = coord_out; + int h, w; + float sum_of_pow = 0; + float out_data_f32 = 0; + uint4 dst = (uint4)(0); + float4 data_f32 = (float4)(0); + uint4 data; + hstart = max(hstart, 0); + wstart = max(wstart, 0); + + for (h = hstart; h < hend; h++) + { + for (w = wstart; w < wend; w++) + { + coord_in.xy = (int2)(w, h); + data = read_imageui(input, coord_in); + data = data << 16; + _viv_asm(COPY, data_f32, data, 16); + sum_of_pow += pow(fabs(data_f32.x),p); + } + } + out_data_f32 = pow(sum_of_pow, 1.0f / p); + _viv_asm(COPY, dst, out_data_f32, 4); + dst.x = dst.x >> 16; + write_imageui(output, coord_out, dst); +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/maximum.cl b/src/tim/vx/internal/src/libnnext/ops/cl/maximum.cl index 08e66a7..cf0b2b5 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/maximum.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/maximum.cl @@ -124,7 +124,7 @@ __kernel void maximum_I32I32toI32 float4 data0 = convert_float4(src0) * input0Scale - input0Tail; float4 data1 = convert_float4(src1) * input1Scale - input1Tail; float4 data = data0 > data1 ? data0 : data1; - int4 dst = convert_int4(data * outputScale + outputZP); + int4 dst = convert_int4_rte(data * outputScale + outputZP); write_imagei(output, coord, dst); } @@ -150,7 +150,7 @@ __kernel void maximum_I32I32toI32_2D float4 data0 = convert_float4(src0) * input0Scale - input0Tail; float4 data1 = convert_float4(src1) * input1Scale - input1Tail; float4 data = data0 > data1 ? data0 : data1; - int4 dst = convert_int4(data * outputScale + outputZP); + int4 dst = convert_int4_rte(data * outputScale + outputZP); write_imagei(output, coord, dst); } diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/minimum.cl b/src/tim/vx/internal/src/libnnext/ops/cl/minimum.cl index 27c6501..f02044b 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/minimum.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/minimum.cl @@ -124,7 +124,7 @@ __kernel void minimum_I32I32toI32 float4 data0 = convert_float4(src0) * input0Scale - input0Tail; float4 data1 = convert_float4(src1) * input1Scale - input1Tail; float4 data = data0 < data1 ? 
data0 : data1; - int4 dst = convert_int4(data * outputScale + outputZP); + int4 dst = convert_int4_rte(data * outputScale + outputZP); write_imagei(output, coord, dst); } @@ -150,7 +150,7 @@ __kernel void minimum_I32I32toI32_2D float4 data0 = convert_float4(src0) * input0Scale - input0Tail; float4 data1 = convert_float4(src1) * input1Scale - input1Tail; float4 data = data0 < data1 ? data0 : data1; - int4 dst = convert_int4(data * outputScale + outputZP); + int4 dst = convert_int4_rte(data * outputScale + outputZP); write_imagei(output, coord, dst); } diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/roi_align.cl b/src/tim/vx/internal/src/libnnext/ops/cl/roi_align.cl index b2d6aae..91b10d9 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/roi_align.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/roi_align.cl @@ -1,12 +1,14 @@ + inline float roi_align_1x1 ( __read_only image2d_array_t input, - float2 region_start, - float2 region_end, - float2 bin_size, - int2 grid_size, - float2 rcp_of_grid_size, - int pz + float2 region_start, + float2 region_end, + float2 bin_size, + int2 grid_size, + float2 rcp_of_grid_size, + int pz, + int4 max_spatial_dims ) { float sum = 0; @@ -21,15 +23,24 @@ inline float roi_align_1x1 int2 xy_low = convert_int2(pos); int2 xy_high = xy_low + 1; - float ly = pos.y - xy_low.y; - float lx = pos.x - xy_low.x; - float hy = 1.0f - ly; - float hx = 1.0f - lx; + if (xy_low.x > max_spatial_dims.x || max_spatial_dims.x < -1 || + xy_low.y > max_spatial_dims.y || max_spatial_dims.y < -1 ) + { + continue; + } + + float2 lxy = pos - floor(pos); + float2 zero = 0; + + lxy = xy_low >= max_spatial_dims.zw ? 0.0 : lxy; + + float hy = 1.0f - lxy.y; + float hx = 1.0f - lxy.x; float w1 = hy * hx; - float w2 = hy * lx; - float w3 = ly * hx; - float w4 = ly * lx; + float w2 = lxy.x - lxy.x * lxy.y; + float w3 = lxy.y - lxy.x * lxy.y; + float w4 = lxy.y * lxy.x; float data1 = read_imagef(input, (int4)(xy_low.x, xy_low.y, pz, 0)).x; float data2 = read_imagef(input, (int4)(xy_high.x, xy_low.y, pz, 0)).x; @@ -43,8 +54,9 @@ inline float roi_align_1x1 return (float)(sum * rcp_of_grid_size.x * rcp_of_grid_size.y); } - #define EPS_GRID 0.00001f +#define TYPE_FLOAT16 (1) +#define TYPE_FLOAT32 (2) __kernel void roi_align_F32_F32toF32 ( __read_only image2d_array_t input, @@ -57,13 +69,14 @@ __kernel void roi_align_F32_F32toF32 float output_zp, float spatial_x_scale, float spatial_y_scale, - float in_width, - float in_height, + int in_width, + int in_height, float rcp_of_out_width, float rcp_of_out_height, float sampling_x_ratio, float sampling_y_ratio, - int depth + int depth, + int dtype ) { int px = get_global_id(0); @@ -82,7 +95,10 @@ __kernel void roi_align_F32_F32toF32 float2 spatial_indx = (float2)(px, py); float2 pooled_dims = (float2)(rcp_of_out_width, rcp_of_out_height); - float2 max_spatial_dims = (float2)(in_width, in_height); + int4 max_spatial_dims = (int4)(in_width, in_height, in_width, in_height); + max_spatial_dims.zw = max_spatial_dims.zw - 1; + + float2 max_limiatation = convert_float2(max_spatial_dims.zw); float2 bin_size = roi_dims * pooled_dims; float2 region_start = spatial_indx * bin_size + roi_anchor.xy; @@ -105,9 +121,28 @@ __kernel void roi_align_F32_F32toF32 bin_size, grid_size_xy, rcp_of_grid_size, - kz); + kz, + max_spatial_dims); - write_imagef(output, (int4)(px, py, kz1, 0), interp); + if (dtype == TYPE_FLOAT16) + { + half tmp; + short dst; + _viv_asm(CONV, tmp, interp.x); + _viv_asm(COPY, dst, tmp, 2); + + Tensor out_t = create_tensor_from_image2d_array(output, 
2); + short *output_ptr = (short *)get_tensor_ptr_from_coord(out_t, (int4)(px, py, kz1, 0)); + + output_ptr[0] = dst; + } + else + { + Tensor out_t = create_tensor_from_image2d_array(output, 4); + float *output_ptr = (float *)get_tensor_ptr_from_coord(out_t, (int4)(px, py, kz1, 0)); + + output_ptr[0] = interp.x; + } } } @@ -121,7 +156,8 @@ inline float roi_align_1x1_U8toF32 float2 bin_size, int2 grid_size, float2 rcp_of_grid_size, - int pz + int pz, + int4 max_spatial_dims ) { float sum = 0; @@ -132,33 +168,43 @@ inline float roi_align_1x1_U8toF32 { float2 ixy = (float2)(ix + 0.5f, iy + 0.5f); float2 pos = region_start + ixy * bin_size * rcp_of_grid_size; - + int2 xy_low = convert_int2(pos); int2 xy_high = xy_low + 1; - - float ly = pos.y - xy_low.y; - float lx = pos.x - xy_low.x; - float hy = 1.0f - ly; - float hx = 1.0f - lx; - + + float2 lxy = pos - floor(pos); + float2 zero = 0; + + if (xy_low.x > max_spatial_dims.x || max_spatial_dims.x < -1 || + xy_low.y > max_spatial_dims.y || max_spatial_dims.y < -1 ) + { + continue; + } + + lxy = xy_low >= max_spatial_dims.zw ? 0.0 : lxy; + + float hy = 1.0f - lxy.y; + float hx = 1.0f - lxy.x; + float w1 = hy * hx; - float w2 = hy * lx; - float w3 = ly * hx; - float w4 = ly * lx; - + float w2 = lxy.x - lxy.x * lxy.y; + float w3 = lxy.y - lxy.x * lxy.y; + float w4 = lxy.y * lxy.x; + uint4 data; data.x = read_imageui(input, (int4)(xy_low.x, xy_low.y, pz, 0)).x; data.y = read_imageui(input, (int4)(xy_high.x, xy_low.y, pz, 0)).x; data.z = read_imageui(input, (int4)(xy_low.x, xy_high.y, pz, 0)).x; data.w = read_imageui(input, (int4)(xy_high.x, xy_high.y, pz, 0)).x; - + float4 value = convert_float4(data) * input_scale + input_tail; - + sum = sum + w1 * value.x + w2 * value.y + w3 * value.z + w4 * value.w; } } - + return (float)(sum * rcp_of_grid_size.x * rcp_of_grid_size.y); + } __kernel void roi_align_U8_U16toU8 @@ -173,13 +219,14 @@ __kernel void roi_align_U8_U16toU8 float output_zp, float spatial_x_scale, float spatial_y_scale, - float in_width, - float in_height, + int in_width, + int in_height, float rcp_of_out_width, float rcp_of_out_height, float sampling_x_ratio, float sampling_y_ratio, - int depth + int depth, + int dtype ) { int px = get_global_id(0); @@ -198,7 +245,10 @@ __kernel void roi_align_U8_U16toU8 float2 spatial_indx = (float2)(px, py); float2 pooled_dims = (float2)(rcp_of_out_width, rcp_of_out_height); - float2 max_spatial_dims = (float2)(in_width, in_height); + int4 max_spatial_dims = (int4)(in_width, in_height, in_width, in_height); + max_spatial_dims.zw = max_spatial_dims.zw - 1; + + float2 max_limiatation = convert_float2(max_spatial_dims.zw); float2 bin_size = roi_dims * pooled_dims; float2 region_start = spatial_indx * bin_size + roi_anchor.xy; @@ -223,12 +273,17 @@ __kernel void roi_align_U8_U16toU8 bin_size, grid_size_xy, rcp_of_grid_size, - kz); + kz, + max_spatial_dims); - uint4 dst; + uchar dst; interp.x = interp.x * output_scale + output_zp; interp.x = interp.x < 255 ? 
interp.x : 255; - dst.x = convert_uint_rte(interp.x); - write_imageui(output, (int4)(px, py, kz1, 0), dst.xxxx); + dst = convert_uchar_rte(interp.x); + + Tensor out_t = create_tensor_from_image2d_array(output, 1); + uchar *output_ptr = (uchar *)get_tensor_ptr_from_coord(out_t, (int4)(px, py, kz1, 0)); + + output_ptr[0] = dst; } } \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/scatter_elements.cl b/src/tim/vx/internal/src/libnnext/ops/cl/scatter_elements.cl new file mode 100644 index 0000000..bb148dd --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/scatter_elements.cl @@ -0,0 +1,298 @@ + +#define SCATTER_ELEMENTS_AXIS0_32BITS_IMPL(name, dtype) \ +__kernel void scatter_elements_axis0_##name \ + ( \ + __read_only image2d_t ref, \ + __read_only image2d_t indices, \ + __read_only image2d_t update, \ + __write_only image2d_t output, \ + int axis, \ + int reduction, \ + float ref_scale, \ + float ref_tail, \ + float update_scale, \ + float update_tail, \ + float output_zp, \ + int inner_size, \ + int axis_size, \ + int outer_size \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + \ + Image ref_i = create_image_from_image2d(ref, 4); \ + Image update_i = create_image_from_image2d(update, 4); \ + Image indices_i = create_image_from_image2d(indices, 4); \ + Image output_i = create_image_from_image2d(output, 4); \ + \ + dtype *ref_ptr = (dtype *)get_image_ptr_from_coord(ref_i, coord.xy); \ + dtype *output_ptr = (dtype *)get_image_ptr_from_coord(output_i, coord.xy); \ + dtype data = ref_ptr[0]; \ + if (coord.y < outer_size) \ + { \ + dtype *update_ptr = (dtype *)get_image_ptr_from_coord(update_i, coord.wy); \ + int *indices_ptr = (int *)get_image_ptr_from_coord(indices_i, coord.wy); \ + for(int x = 0; x < axis_size; x ++) \ + { \ + int offset = indices_ptr[x]; \ + if (offset == coord.x) \ + { \ + data = update_ptr[x]; \ + break; \ + } \ + } \ + } \ + \ + output_ptr[0] = data; \ +} +SCATTER_ELEMENTS_AXIS0_32BITS_IMPL(F32_I32_F32toF32, float) +SCATTER_ELEMENTS_AXIS0_32BITS_IMPL(I32_I32_I32toI32, int) + +#define SCATTER_ELEMENTS_AXIS0_16BITS_IMPL(name, dtype, conv_func) \ +__kernel void scatter_elements_axis0_##name \ + ( \ + __read_only image2d_t ref, \ + __read_only image2d_t indices, \ + __read_only image2d_t update, \ + __write_only image2d_t output, \ + int axis, \ + int reduction, \ + float ref_scale, \ + float ref_tail, \ + float update_scale, \ + float update_tail, \ + float output_zp, \ + int inner_size, \ + int axis_size, \ + int outer_size \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + \ + Image ref_i = create_image_from_image2d(ref, 2); \ + Image update_i = create_image_from_image2d(update, 2); \ + Image indices_i = create_image_from_image2d(indices, 4); \ + Image output_i = create_image_from_image2d(output, 2); \ + \ + dtype *ref_ptr = (dtype *)get_image_ptr_from_coord(ref_i, coord.xy); \ + dtype *output_ptr = (dtype *)get_image_ptr_from_coord(output_i, coord.xy); \ + dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \ + if (coord.y < outer_size) \ + { \ + dtype *update_ptr = (dtype *)get_image_ptr_from_coord(update_i, coord.wy); \ + int *indices_ptr = (int *)get_image_ptr_from_coord(indices_i, coord.wy); \ + for(int x = 0; x < axis_size; x ++) \ + { \ + int offset = indices_ptr[x]; \ + if (offset == coord.x) \ + { \ + data = conv_func(convert_float(update_ptr[x]) * update_scale + update_tail + output_zp); \ + break; \ 
+ } \ + } \ + } \ + \ + output_ptr[0] = data; \ +} +SCATTER_ELEMENTS_AXIS0_16BITS_IMPL(I16_I32_I16toI16, short, convert_short_rte) +SCATTER_ELEMENTS_AXIS0_16BITS_IMPL(F16_I32_F16toF16, short, convert_short) +SCATTER_ELEMENTS_AXIS0_16BITS_IMPL(BF16_I32_BF16toBF16, ushort, convert_ushort) + +#define SCATTER_ELEMENTS_AXIS0_8BITS_IMPL(name, dtype, conv_func) \ +__kernel void scatter_elements_axis0_##name \ + ( \ + __read_only image2d_t ref, \ + __read_only image2d_t indices, \ + __read_only image2d_t update, \ + __write_only image2d_t output, \ + int axis, \ + int reduction, \ + float ref_scale, \ + float ref_tail, \ + float update_scale, \ + float update_tail, \ + float output_zp, \ + int inner_size, \ + int axis_size, \ + int outer_size \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + \ + Image ref_i = create_image_from_image2d(ref, 1); \ + Image update_i = create_image_from_image2d(update, 1); \ + Image indices_i = create_image_from_image2d(indices, 4); \ + Image output_i = create_image_from_image2d(output, 1); \ + \ + dtype *ref_ptr = (dtype *)get_image_ptr_from_coord(ref_i, coord.xy); \ + dtype *output_ptr = (dtype *)get_image_ptr_from_coord(output_i, coord.xy); \ + dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \ + if (coord.y < outer_size) \ + { \ + dtype *update_ptr = (dtype *)get_image_ptr_from_coord(update_i, coord.wy); \ + int *indices_ptr = (int *)get_image_ptr_from_coord(indices_i, coord.wy); \ + for(int x = 0; x < axis_size; x ++) \ + { \ + int offset = indices_ptr[x]; \ + if (offset == coord.x) \ + { \ + data = conv_func(convert_float(update_ptr[x]) * update_scale + update_tail + output_zp); \ + break; \ + } \ + } \ + } \ + \ + output_ptr[0] = data; \ +} +SCATTER_ELEMENTS_AXIS0_8BITS_IMPL(U8_I32_U8toU8, uchar, convert_uchar_rte) +SCATTER_ELEMENTS_AXIS0_8BITS_IMPL(I8_I32_I8toI8, char, convert_char) + +#define SCATTER_ELEMENTS_AXIS1_32BITS_IMPL(name, dtype) \ +__kernel void scatter_elements_axis1_##name \ + ( \ + __read_only image2d_array_t ref, \ + __read_only image2d_array_t indices, \ + __read_only image2d_array_t update, \ + __write_only image2d_array_t output, \ + int axis, \ + int reduction, \ + float ref_scale, \ + float ref_tail, \ + float update_scale, \ + float update_tail, \ + float output_zp, \ + int inner_size, \ + int axis_size, \ + int outer_size \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + Tensor ref_i = create_tensor_from_image2d_array(ref, 4); \ + Tensor update_i = create_tensor_from_image2d_array(update, 4); \ + Tensor indices_i = create_tensor_from_image2d_array(indices, 4); \ + Tensor output_i = create_tensor_from_image2d_array(output, 4); \ + \ + dtype *ref_ptr = (dtype *)get_tensor_ptr_from_coord(ref_i, coord); \ + dtype *output_ptr = (dtype *)get_tensor_ptr_from_coord(output_i, coord); \ + dtype data = ref_ptr[0]; \ + if (coord.x < inner_size && coord.z < outer_size) \ + { \ + dtype *update_ptr = (dtype *)get_tensor_ptr_from_coord(update_i, coord.xwzw); \ + int *indices_ptr = (int *)get_tensor_ptr_from_coord(indices_i, coord.xwzw); \ + for(int y = 0; y < axis_size; y ++) \ + { \ + int offset = indices_ptr[y * inner_size]; \ + if (offset == coord.y) \ + { \ + data = update_ptr[y * inner_size]; \ + break; \ + } \ + } \ + } \ + \ + output_ptr[0] = data; \ +} +SCATTER_ELEMENTS_AXIS1_32BITS_IMPL(F32_I32_F32toF32, float) +SCATTER_ELEMENTS_AXIS1_32BITS_IMPL(I32_I32_I32toI32, int) + +#define 
SCATTER_ELEMENTS_AXIS1_16BITS_IMPL(name, dtype, conv_func) \ +__kernel void scatter_elements_axis1_##name \ + ( \ + __read_only image2d_array_t ref, \ + __read_only image2d_array_t indices, \ + __read_only image2d_array_t update, \ + __write_only image2d_array_t output, \ + int axis, \ + int reduction, \ + float ref_scale, \ + float ref_tail, \ + float update_scale, \ + float update_tail, \ + float output_zp, \ + int inner_size, \ + int axis_size, \ + int outer_size \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + Tensor ref_i = create_tensor_from_image2d_array(ref, 2); \ + Tensor update_i = create_tensor_from_image2d_array(update, 2); \ + Tensor indices_i = create_tensor_from_image2d_array(indices, 4); \ + Tensor output_i = create_tensor_from_image2d_array(output, 2); \ + \ + dtype *ref_ptr = (dtype *)get_tensor_ptr_from_coord(ref_i, coord); \ + dtype *output_ptr = (dtype *)get_tensor_ptr_from_coord(output_i, coord); \ + dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \ + if (coord.x < inner_size && coord.z < outer_size) \ + { \ + dtype *update_ptr = (dtype *)get_tensor_ptr_from_coord(update_i, coord.xwzw); \ + int *indices_ptr = (int *)get_tensor_ptr_from_coord(indices_i, coord.xwzw); \ + for(int y = 0; y < axis_size; y ++) \ + { \ + int offset = indices_ptr[y * inner_size]; \ + if (offset == coord.y) \ + { \ + data = conv_func(convert_float(update_ptr[y * inner_size]) \ + * update_scale + update_tail + output_zp); \ + break; \ + } \ + } \ + } \ + \ + output_ptr[0] = data; \ +} +SCATTER_ELEMENTS_AXIS1_16BITS_IMPL(I16_I32_I16toI16, short, convert_short_rte) +SCATTER_ELEMENTS_AXIS1_16BITS_IMPL(F16_I32_F16toF16, short, convert_short) +SCATTER_ELEMENTS_AXIS1_16BITS_IMPL(BF16_I32_BF16toBF16, ushort, convert_ushort) + +#define SCATTER_ELEMENTS_AXIS1_8BITS_IMPL(name, dtype, conv_func) \ +__kernel void scatter_elements_axis1_##name \ + ( \ + __read_only image2d_array_t ref, \ + __read_only image2d_array_t indices, \ + __read_only image2d_array_t update, \ + __write_only image2d_array_t output, \ + int axis, \ + int reduction, \ + float ref_scale, \ + float ref_tail, \ + float update_scale, \ + float update_tail, \ + float output_zp, \ + int inner_size, \ + int axis_size, \ + int outer_size \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + Tensor ref_i = create_tensor_from_image2d_array(ref, 1); \ + Tensor update_i = create_tensor_from_image2d_array(update, 1); \ + Tensor indices_i = create_tensor_from_image2d_array(indices, 4); \ + Tensor output_i = create_tensor_from_image2d_array(output, 1); \ + \ + dtype *ref_ptr = (dtype *)get_tensor_ptr_from_coord(ref_i, coord); \ + dtype *output_ptr = (dtype *)get_tensor_ptr_from_coord(output_i, coord); \ + dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \ + if (coord.x < inner_size && coord.z < outer_size) \ + { \ + dtype *update_ptr = (dtype *)get_tensor_ptr_from_coord(update_i, coord.xwzw); \ + int *indices_ptr = (int *)get_tensor_ptr_from_coord(indices_i, coord.xwzw); \ + for(int y = 0; y < axis_size; y ++) \ + { \ + int offset = indices_ptr[y * inner_size]; \ + if (offset == coord.y) \ + { \ + data = conv_func(convert_float(update_ptr[y * inner_size]) \ + * update_scale + update_tail + output_zp); \ + break; \ + } \ + } \ + } \ + \ + output_ptr[0] = data; \ +} +SCATTER_ELEMENTS_AXIS1_8BITS_IMPL(U8_I32_U8toU8, uchar, convert_uchar_rte) 
+SCATTER_ELEMENTS_AXIS1_8BITS_IMPL(I8_I32_I8toI8, char, convert_char) diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/scatter_elements_add.cl b/src/tim/vx/internal/src/libnnext/ops/cl/scatter_elements_add.cl new file mode 100644 index 0000000..a7c67f5 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/scatter_elements_add.cl @@ -0,0 +1,292 @@ + +#define SE_ADD_AXIS0_32BITS_IMPL(name, dtype) \ +__kernel void scatter_elements_add_axis0_##name \ + ( \ + __read_only image2d_t ref, \ + __read_only image2d_t indices, \ + __read_only image2d_t update, \ + __write_only image2d_t output, \ + int axis, \ + int reduction, \ + float ref_scale, \ + float ref_tail, \ + float update_scale, \ + float update_tail, \ + float output_zp, \ + int inner_size, \ + int axis_size, \ + int outer_size \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + \ + Image ref_i = create_image_from_image2d(ref, 4); \ + Image update_i = create_image_from_image2d(update, 4); \ + Image indices_i = create_image_from_image2d(indices, 4); \ + Image output_i = create_image_from_image2d(output, 4); \ + \ + dtype *ref_ptr = (dtype *)get_image_ptr_from_coord(ref_i, coord.xy); \ + dtype *output_ptr = (dtype *)get_image_ptr_from_coord(output_i, coord.xy); \ + dtype data = ref_ptr[0]; \ + if (coord.y < outer_size) \ + { \ + dtype *update_ptr = (dtype *)get_image_ptr_from_coord(update_i, coord.wy); \ + int *indices_ptr = (int *)get_image_ptr_from_coord(indices_i, coord.wy); \ + for(int x = 0; x < axis_size; x ++) \ + { \ + int offset = indices_ptr[x]; \ + if (offset == coord.x) \ + { \ + data += update_ptr[x]; \ + } \ + } \ + } \ + \ + output_ptr[0] = data; \ +} +SE_ADD_AXIS0_32BITS_IMPL(F32_I32_F32toF32, float) +SE_ADD_AXIS0_32BITS_IMPL(I32_I32_I32toI32, int) + +#define SE_ADD_AXIS0_16BITS_IMPL(name, dtype, conv_func) \ +__kernel void scatter_elements_add_axis0_##name \ + ( \ + __read_only image2d_t ref, \ + __read_only image2d_t indices, \ + __read_only image2d_t update, \ + __write_only image2d_t output, \ + int axis, \ + int reduction, \ + float ref_scale, \ + float ref_tail, \ + float update_scale, \ + float update_tail, \ + float output_zp, \ + int inner_size, \ + int axis_size, \ + int outer_size \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + \ + Image ref_i = create_image_from_image2d(ref, 2); \ + Image update_i = create_image_from_image2d(update, 2); \ + Image indices_i = create_image_from_image2d(indices, 4); \ + Image output_i = create_image_from_image2d(output, 2); \ + \ + dtype *ref_ptr = (dtype *)get_image_ptr_from_coord(ref_i, coord.xy); \ + dtype *output_ptr = (dtype *)get_image_ptr_from_coord(output_i, coord.xy); \ + dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \ + if (coord.y < outer_size) \ + { \ + dtype *update_ptr = (dtype *)get_image_ptr_from_coord(update_i, coord.wy); \ + int *indices_ptr = (int *)get_image_ptr_from_coord(indices_i, coord.wy); \ + for(int x = 0; x < axis_size; x ++) \ + { \ + int offset = indices_ptr[x]; \ + if (offset == coord.x) \ + { \ + data += conv_func(convert_float(update_ptr[x]) * update_scale + update_tail + output_zp); \ + } \ + } \ + } \ + \ + output_ptr[0] = data; \ +} +SE_ADD_AXIS0_16BITS_IMPL(I16_I32_I16toI16, short, convert_short_rte) +SE_ADD_AXIS0_16BITS_IMPL(F16_I32_F16toF16, short, convert_short) +SE_ADD_AXIS0_16BITS_IMPL(BF16_I32_BF16toBF16, ushort, convert_ushort) + +#define SE_ADD_AXIS0_8BITS_IMPL(name, dtype, conv_func) \ +__kernel 
void scatter_elements_add_axis0_##name \ + ( \ + __read_only image2d_t ref, \ + __read_only image2d_t indices, \ + __read_only image2d_t update, \ + __write_only image2d_t output, \ + int axis, \ + int reduction, \ + float ref_scale, \ + float ref_tail, \ + float update_scale, \ + float update_tail, \ + float output_zp, \ + int inner_size, \ + int axis_size, \ + int outer_size \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + \ + Image ref_i = create_image_from_image2d(ref, 1); \ + Image update_i = create_image_from_image2d(update, 1); \ + Image indices_i = create_image_from_image2d(indices, 4); \ + Image output_i = create_image_from_image2d(output, 1); \ + \ + dtype *ref_ptr = (dtype *)get_image_ptr_from_coord(ref_i, coord.xy); \ + dtype *output_ptr = (dtype *)get_image_ptr_from_coord(output_i, coord.xy); \ + dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \ + if (coord.y < outer_size) \ + { \ + dtype *update_ptr = (dtype *)get_image_ptr_from_coord(update_i, coord.wy); \ + int *indices_ptr = (int *)get_image_ptr_from_coord(indices_i, coord.wy); \ + for(int x = 0; x < axis_size; x ++) \ + { \ + int offset = indices_ptr[x]; \ + if (offset == coord.x) \ + { \ + data += conv_func(convert_float(update_ptr[x]) * update_scale + update_tail + output_zp); \ + } \ + } \ + } \ + \ + output_ptr[0] = data; \ +} +SE_ADD_AXIS0_8BITS_IMPL(U8_I32_U8toU8, uchar, convert_uchar_rte) +SE_ADD_AXIS0_8BITS_IMPL(I8_I32_I8toI8, char, convert_char) + +#define SE_ADD_AXIS1_32BITS_IMPL(name, dtype) \ +__kernel void scatter_elements_add_axis1_##name \ + ( \ + __read_only image2d_array_t ref, \ + __read_only image2d_array_t indices, \ + __read_only image2d_array_t update, \ + __write_only image2d_array_t output, \ + int axis, \ + int reduction, \ + float ref_scale, \ + float ref_tail, \ + float update_scale, \ + float update_tail, \ + float output_zp, \ + int inner_size, \ + int axis_size, \ + int outer_size \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + Tensor ref_i = create_tensor_from_image2d_array(ref, 4); \ + Tensor update_i = create_tensor_from_image2d_array(update, 4); \ + Tensor indices_i = create_tensor_from_image2d_array(indices, 4); \ + Tensor output_i = create_tensor_from_image2d_array(output, 4); \ + \ + dtype *ref_ptr = (dtype *)get_tensor_ptr_from_coord(ref_i, coord); \ + dtype *output_ptr = (dtype *)get_tensor_ptr_from_coord(output_i, coord); \ + dtype data = ref_ptr[0]; \ + if (coord.x < inner_size && coord.z < outer_size) \ + { \ + dtype *update_ptr = (dtype *)get_tensor_ptr_from_coord(update_i, coord.xwzw); \ + int *indices_ptr = (int *)get_tensor_ptr_from_coord(indices_i, coord.xwzw); \ + for(int y = 0; y < axis_size; y ++) \ + { \ + int offset = indices_ptr[y * inner_size]; \ + if (offset == coord.y) \ + { \ + data += update_ptr[y * inner_size]; \ + } \ + } \ + } \ + \ + output_ptr[0] = data; \ +} +SE_ADD_AXIS1_32BITS_IMPL(F32_I32_F32toF32, float) +SE_ADD_AXIS1_32BITS_IMPL(I32_I32_I32toI32, int) + +#define SE_ADD_AXIS1_16BITS_IMPL(name, dtype, conv_func) \ +__kernel void scatter_elements_add_axis1_##name \ + ( \ + __read_only image2d_array_t ref, \ + __read_only image2d_array_t indices, \ + __read_only image2d_array_t update, \ + __write_only image2d_array_t output, \ + int axis, \ + int reduction, \ + float ref_scale, \ + float ref_tail, \ + float update_scale, \ + float update_tail, \ + float output_zp, \ + int inner_size, \ + int axis_size, \ + int outer_size 
\ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + Tensor ref_i = create_tensor_from_image2d_array(ref, 2); \ + Tensor update_i = create_tensor_from_image2d_array(update, 2); \ + Tensor indices_i = create_tensor_from_image2d_array(indices, 4); \ + Tensor output_i = create_tensor_from_image2d_array(output, 2); \ + \ + dtype *ref_ptr = (dtype *)get_tensor_ptr_from_coord(ref_i, coord); \ + dtype *output_ptr = (dtype *)get_tensor_ptr_from_coord(output_i, coord); \ + dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \ + if (coord.x < inner_size && coord.z < outer_size) \ + { \ + dtype *update_ptr = (dtype *)get_tensor_ptr_from_coord(update_i, coord.xwzw); \ + int *indices_ptr = (int *)get_tensor_ptr_from_coord(indices_i, coord.xwzw); \ + for(int y = 0; y < axis_size; y ++) \ + { \ + int offset = indices_ptr[y * inner_size]; \ + if (offset == coord.y) \ + { \ + data += conv_func(convert_float(update_ptr[y * inner_size]) \ + * update_scale + update_tail + output_zp); \ + } \ + } \ + } \ + \ + output_ptr[0] = data; \ +} +SE_ADD_AXIS1_16BITS_IMPL(I16_I32_I16toI16, short, convert_short_rte) +SE_ADD_AXIS1_16BITS_IMPL(F16_I32_F16toF16, short, convert_short) +SE_ADD_AXIS1_16BITS_IMPL(BF16_I32_BF16toBF16, ushort, convert_ushort) + +#define SE_ADD_AXIS1_8BITS_IMPL(name, dtype, conv_func) \ +__kernel void scatter_elements_add_axis1_##name \ + ( \ + __read_only image2d_array_t ref, \ + __read_only image2d_array_t indices, \ + __read_only image2d_array_t update, \ + __write_only image2d_array_t output, \ + int axis, \ + int reduction, \ + float ref_scale, \ + float ref_tail, \ + float update_scale, \ + float update_tail, \ + float output_zp, \ + int inner_size, \ + int axis_size, \ + int outer_size \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + Tensor ref_i = create_tensor_from_image2d_array(ref, 1); \ + Tensor update_i = create_tensor_from_image2d_array(update, 1); \ + Tensor indices_i = create_tensor_from_image2d_array(indices, 4); \ + Tensor output_i = create_tensor_from_image2d_array(output, 1); \ + \ + dtype *ref_ptr = (dtype *)get_tensor_ptr_from_coord(ref_i, coord); \ + dtype *output_ptr = (dtype *)get_tensor_ptr_from_coord(output_i, coord); \ + dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \ + if (coord.x < inner_size && coord.z < outer_size) \ + { \ + dtype *update_ptr = (dtype *)get_tensor_ptr_from_coord(update_i, coord.xwzw); \ + int *indices_ptr = (int *)get_tensor_ptr_from_coord(indices_i, coord.xwzw); \ + for(int y = 0; y < axis_size; y ++) \ + { \ + int offset = indices_ptr[y * inner_size]; \ + if (offset == coord.y) \ + { \ + data += conv_func(convert_float(update_ptr[y * inner_size]) \ + * update_scale + update_tail + output_zp); \ + } \ + } \ + } \ + \ + output_ptr[0] = data; \ +} +SE_ADD_AXIS1_8BITS_IMPL(U8_I32_U8toU8, uchar, convert_uchar_rte) +SE_ADD_AXIS1_8BITS_IMPL(I8_I32_I8toI8, char, convert_char) diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/scatter_elements_mul.cl b/src/tim/vx/internal/src/libnnext/ops/cl/scatter_elements_mul.cl new file mode 100644 index 0000000..46e938c --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/scatter_elements_mul.cl @@ -0,0 +1,292 @@ + +#define SE_MUL_AXIS0_32BITS_IMPL(name, dtype) \ +__kernel void scatter_elements_mul_axis0_##name \ + ( \ + __read_only image2d_t ref, \ + __read_only image2d_t indices, \ + __read_only image2d_t update, \ + __write_only 
image2d_t output, \ + int axis, \ + int reduction, \ + float ref_scale, \ + float ref_tail, \ + float update_scale, \ + float update_tail, \ + float output_zp, \ + int inner_size, \ + int axis_size, \ + int outer_size \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + \ + Image ref_i = create_image_from_image2d(ref, 4); \ + Image update_i = create_image_from_image2d(update, 4); \ + Image indices_i = create_image_from_image2d(indices, 4); \ + Image output_i = create_image_from_image2d(output, 4); \ + \ + dtype *ref_ptr = (dtype *)get_image_ptr_from_coord(ref_i, coord.xy); \ + dtype *output_ptr = (dtype *)get_image_ptr_from_coord(output_i, coord.xy); \ + dtype data = ref_ptr[0]; \ + if (coord.y < outer_size) \ + { \ + dtype *update_ptr = (dtype *)get_image_ptr_from_coord(update_i, coord.wy); \ + int *indices_ptr = (int *)get_image_ptr_from_coord(indices_i, coord.wy); \ + for(int x = 0; x < axis_size; x ++) \ + { \ + int offset = indices_ptr[x]; \ + if (offset == coord.x) \ + { \ + data *= update_ptr[x]; \ + } \ + } \ + } \ + \ + output_ptr[0] = data; \ +} +SE_MUL_AXIS0_32BITS_IMPL(F32_I32_F32toF32, float) +SE_MUL_AXIS0_32BITS_IMPL(I32_I32_I32toI32, int) + +#define SE_MUL_AXIS0_16BITS_IMPL(name, dtype, conv_func) \ +__kernel void scatter_elements_mul_axis0_##name \ + ( \ + __read_only image2d_t ref, \ + __read_only image2d_t indices, \ + __read_only image2d_t update, \ + __write_only image2d_t output, \ + int axis, \ + int reduction, \ + float ref_scale, \ + float ref_tail, \ + float update_scale, \ + float update_tail, \ + float output_zp, \ + int inner_size, \ + int axis_size, \ + int outer_size \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + \ + Image ref_i = create_image_from_image2d(ref, 2); \ + Image update_i = create_image_from_image2d(update, 2); \ + Image indices_i = create_image_from_image2d(indices, 4); \ + Image output_i = create_image_from_image2d(output, 2); \ + \ + dtype *ref_ptr = (dtype *)get_image_ptr_from_coord(ref_i, coord.xy); \ + dtype *output_ptr = (dtype *)get_image_ptr_from_coord(output_i, coord.xy); \ + dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \ + if (coord.y < outer_size) \ + { \ + dtype *update_ptr = (dtype *)get_image_ptr_from_coord(update_i, coord.wy); \ + int *indices_ptr = (int *)get_image_ptr_from_coord(indices_i, coord.wy); \ + for(int x = 0; x < axis_size; x ++) \ + { \ + int offset = indices_ptr[x]; \ + if (offset == coord.x) \ + { \ + data *= conv_func(convert_float(update_ptr[x]) * update_scale + update_tail + output_zp); \ + } \ + } \ + } \ + \ + output_ptr[0] = data; \ +} +SE_MUL_AXIS0_16BITS_IMPL(I16_I32_I16toI16, short, convert_short_rte) +SE_MUL_AXIS0_16BITS_IMPL(F16_I32_F16toF16, short, convert_short) +SE_MUL_AXIS0_16BITS_IMPL(BF16_I32_BF16toBF16, ushort, convert_ushort) + +#define SE_MUL_AXIS0_8BITS_IMPL(name, dtype, conv_func) \ +__kernel void scatter_elements_mul_axis0_##name \ + ( \ + __read_only image2d_t ref, \ + __read_only image2d_t indices, \ + __read_only image2d_t update, \ + __write_only image2d_t output, \ + int axis, \ + int reduction, \ + float ref_scale, \ + float ref_tail, \ + float update_scale, \ + float update_tail, \ + float output_zp, \ + int inner_size, \ + int axis_size, \ + int outer_size \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \ + \ + Image ref_i = create_image_from_image2d(ref, 1); \ + Image update_i = 
create_image_from_image2d(update, 1); \ + Image indices_i = create_image_from_image2d(indices, 4); \ + Image output_i = create_image_from_image2d(output, 1); \ + \ + dtype *ref_ptr = (dtype *)get_image_ptr_from_coord(ref_i, coord.xy); \ + dtype *output_ptr = (dtype *)get_image_ptr_from_coord(output_i, coord.xy); \ + dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \ + if (coord.y < outer_size) \ + { \ + dtype *update_ptr = (dtype *)get_image_ptr_from_coord(update_i, coord.wy); \ + int *indices_ptr = (int *)get_image_ptr_from_coord(indices_i, coord.wy); \ + for(int x = 0; x < axis_size; x ++) \ + { \ + int offset = indices_ptr[x]; \ + if (offset == coord.x) \ + { \ + data *= conv_func(convert_float(update_ptr[x]) * update_scale + update_tail + output_zp); \ + } \ + } \ + } \ + \ + output_ptr[0] = data; \ +} +SE_MUL_AXIS0_8BITS_IMPL(U8_I32_U8toU8, uchar, convert_uchar_rte) +SE_MUL_AXIS0_8BITS_IMPL(I8_I32_I8toI8, char, convert_char) + +#define SE_MUL_AXIS1_32BITS_IMPL(name, dtype) \ +__kernel void scatter_elements_mul_axis1_##name \ + ( \ + __read_only image2d_array_t ref, \ + __read_only image2d_array_t indices, \ + __read_only image2d_array_t update, \ + __write_only image2d_array_t output, \ + int axis, \ + int reduction, \ + float ref_scale, \ + float ref_tail, \ + float update_scale, \ + float update_tail, \ + float output_zp, \ + int inner_size, \ + int axis_size, \ + int outer_size \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + Tensor ref_i = create_tensor_from_image2d_array(ref, 4); \ + Tensor update_i = create_tensor_from_image2d_array(update, 4); \ + Tensor indices_i = create_tensor_from_image2d_array(indices, 4); \ + Tensor output_i = create_tensor_from_image2d_array(output, 4); \ + \ + dtype *ref_ptr = (dtype *)get_tensor_ptr_from_coord(ref_i, coord); \ + dtype *output_ptr = (dtype *)get_tensor_ptr_from_coord(output_i, coord); \ + dtype data = ref_ptr[0]; \ + if (coord.x < inner_size && coord.z < outer_size) \ + { \ + dtype *update_ptr = (dtype *)get_tensor_ptr_from_coord(update_i, coord.xwzw); \ + int *indices_ptr = (int *)get_tensor_ptr_from_coord(indices_i, coord.xwzw); \ + for(int y = 0; y < axis_size; y ++) \ + { \ + int offset = indices_ptr[y * inner_size]; \ + if (offset == coord.y) \ + { \ + data *= update_ptr[y * inner_size]; \ + } \ + } \ + } \ + \ + output_ptr[0] = data; \ +} +SE_MUL_AXIS1_32BITS_IMPL(F32_I32_F32toF32, float) +SE_MUL_AXIS1_32BITS_IMPL(I32_I32_I32toI32, int) + +#define SE_MUL_AXIS1_16BITS_IMPL(name, dtype, conv_func) \ +__kernel void scatter_elements_mul_axis1_##name \ + ( \ + __read_only image2d_array_t ref, \ + __read_only image2d_array_t indices, \ + __read_only image2d_array_t update, \ + __write_only image2d_array_t output, \ + int axis, \ + int reduction, \ + float ref_scale, \ + float ref_tail, \ + float update_scale, \ + float update_tail, \ + float output_zp, \ + int inner_size, \ + int axis_size, \ + int outer_size \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + Tensor ref_i = create_tensor_from_image2d_array(ref, 2); \ + Tensor update_i = create_tensor_from_image2d_array(update, 2); \ + Tensor indices_i = create_tensor_from_image2d_array(indices, 4); \ + Tensor output_i = create_tensor_from_image2d_array(output, 2); \ + \ + dtype *ref_ptr = (dtype *)get_tensor_ptr_from_coord(ref_i, coord); \ + dtype *output_ptr = (dtype *)get_tensor_ptr_from_coord(output_i, coord); \ + dtype data = 
conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \ + if (coord.x < inner_size && coord.z < outer_size) \ + { \ + dtype *update_ptr = (dtype *)get_tensor_ptr_from_coord(update_i, coord.xwzw); \ + int *indices_ptr = (int *)get_tensor_ptr_from_coord(indices_i, coord.xwzw); \ + for(int y = 0; y < axis_size; y ++) \ + { \ + int offset = indices_ptr[y * inner_size]; \ + if (offset == coord.y) \ + { \ + data *= conv_func(convert_float(update_ptr[y * inner_size]) \ + * update_scale + update_tail + output_zp); \ + } \ + } \ + } \ + \ + output_ptr[0] = data; \ +} +SE_MUL_AXIS1_16BITS_IMPL(I16_I32_I16toI16, short, convert_short_rte) +SE_MUL_AXIS1_16BITS_IMPL(F16_I32_F16toF16, short, convert_short) +SE_MUL_AXIS1_16BITS_IMPL(BF16_I32_BF16toBF16, ushort, convert_ushort) + +#define SE_MUL_AXIS1_8BITS_IMPL(name, dtype, conv_func) \ +__kernel void scatter_elements_mul_axis1_##name \ + ( \ + __read_only image2d_array_t ref, \ + __read_only image2d_array_t indices, \ + __read_only image2d_array_t update, \ + __write_only image2d_array_t output, \ + int axis, \ + int reduction, \ + float ref_scale, \ + float ref_tail, \ + float update_scale, \ + float update_tail, \ + float output_zp, \ + int inner_size, \ + int axis_size, \ + int outer_size \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + Tensor ref_i = create_tensor_from_image2d_array(ref, 1); \ + Tensor update_i = create_tensor_from_image2d_array(update, 1); \ + Tensor indices_i = create_tensor_from_image2d_array(indices, 4); \ + Tensor output_i = create_tensor_from_image2d_array(output, 1); \ + \ + dtype *ref_ptr = (dtype *)get_tensor_ptr_from_coord(ref_i, coord); \ + dtype *output_ptr = (dtype *)get_tensor_ptr_from_coord(output_i, coord); \ + dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \ + if (coord.x < inner_size && coord.z < outer_size) \ + { \ + dtype *update_ptr = (dtype *)get_tensor_ptr_from_coord(update_i, coord.xwzw); \ + int *indices_ptr = (int *)get_tensor_ptr_from_coord(indices_i, coord.xwzw); \ + for(int y = 0; y < axis_size; y ++) \ + { \ + int offset = indices_ptr[y * inner_size]; \ + if (offset == coord.y) \ + { \ + data *= conv_func(convert_float(update_ptr[y * inner_size]) \ + * update_scale + update_tail + output_zp); \ + } \ + } \ + } \ + \ + output_ptr[0] = data; \ +} +SE_MUL_AXIS1_8BITS_IMPL(U8_I32_U8toU8, uchar, convert_uchar_rte) +SE_MUL_AXIS1_8BITS_IMPL(I8_I32_I8toI8, char, convert_char) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/bucketize.vx b/src/tim/vx/internal/src/libnnext/ops/vx/bucketize.vx new file mode 100644 index 0000000..9b7e5c9 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/bucketize.vx @@ -0,0 +1,176 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniDataConvert_0_4x4; +_viv_uniform VXC_512Bits uniDataConvert_1_4x4; +_viv_uniform int boundaries_size_x8; +_viv_uniform int boundaries_size; + +#define BUCKETIZE_16BITS_SH_IMPL(name, copy_type) \ +__kernel void bucketize_right_##name \ + ( \ + __read_only image2d_t input, \ + __read_only image2d_t boundaries, \ + __write_only image2d_t output \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \ + \ + vxc_short8 data0, data1; \ + copy_type src0, src1, dst0, dst1; \ + vxc_ushort8 v0, v1, v2, v3, result = 0; \ + VXC_ReadImage(data0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src0, data0, 16); \ + \ + for (; coord.z < boundaries_size_x8; ) \ + { \ + 
VXC_ReadImage(data1, boundaries, coord.zw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src1, data1.s00000000, 16); \ + coord.z += 8; \ + \ + VXC_Clamp(dst0, src0, src1, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + _viv_asm(COPY, v0, dst0, 16); \ + v2 = sub_sat(v0, 0xFFFE); \ + _viv_asm(COPY, src1, data1.s11111111, 16); \ + VXC_Clamp(dst1, src0, src1, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + _viv_asm(COPY, v1, dst1, 16); \ + v3 = sub_sat(v1, 0xFFFE); \ + \ + result = result + v2 + v3; \ + \ + _viv_asm(COPY, src1, data1.s22222222, 16); \ + VXC_Clamp(dst0, src0, src1, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + _viv_asm(COPY, v0, dst0, 16); \ + v2 = sub_sat(v0, 0xFFFE); \ + _viv_asm(COPY, src1, data1.s33333333, 16); \ + VXC_Clamp(dst1, src0, src1, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + _viv_asm(COPY, v1, dst1, 16); \ + v3 = sub_sat(v1, 0xFFFE); \ + \ + result = result + v2 + v3; \ + \ + _viv_asm(COPY, src1, data1.s44444444, 16); \ + VXC_Clamp(dst0, src0, src1, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + _viv_asm(COPY, v0, dst0, 16); \ + v2 = sub_sat(v0, 0xFFFE); \ + _viv_asm(COPY, src1, data1.s55555555, 16); \ + VXC_Clamp(dst1, src0, src1, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + _viv_asm(COPY, v1, dst1, 16); \ + v3 = sub_sat(v1, 0xFFFE); \ + \ + result = result + v2 + v3; \ + \ + _viv_asm(COPY, src1, data1.s66666666, 16); \ + VXC_Clamp(dst0, src0, src1, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + _viv_asm(COPY, v0, dst0, 16); \ + v2 = sub_sat(v0, 0xFFFE); \ + _viv_asm(COPY, src1, data1.s77777777, 16); \ + VXC_Clamp(dst1, src0, src1, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + _viv_asm(COPY, v1, dst1, 16); \ + v3 = sub_sat(v1, 0xFFFE); \ + \ + result = result + v2 + v3; \ + } \ + \ + for (; coord.z < boundaries_size; ) \ + { \ + VXC_ReadImage(data1, boundaries, coord.zw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src1, data1.s00000000, 16); \ + coord.z ++; \ + \ + VXC_Clamp(dst0, src0, src1, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + _viv_asm(COPY, v0, dst0, 16); \ + v2 = sub_sat(v0, 0xFFFE); \ + \ + result = result + v2; \ + } \ + \ + int4 d0, d1; \ + VXC_DP4x4(d0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniDataConvert_0_4x4); \ + VXC_DP4x4(d1, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniDataConvert_1_4x4); \ + coord.z = coord.x + 4; \ + \ + write_imagei(output, coord.xy, d0); \ + write_imagei(output, coord.zy, d1); \ +} +BUCKETIZE_16BITS_SH_IMPL(F16_F16toI32_2D, vxc_half8) +BUCKETIZE_16BITS_SH_IMPL(I16_I16toI32_2D, vxc_short8) + +#define BUCKETIZE_8BITS_SH_IMPL(name, src_type) \ +__kernel void bucketize_right_##name \ + ( \ + __read_only image2d_t input, \ + __read_only image2d_t boundaries, \ + __write_only image2d_t output \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \ + \ + src_type src0, src1, src2; \ + vxc_uchar8 dst0, dst1, result = 0; \ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src1, boundaries, coord.zw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + for (; coord.z < boundaries_size_x8; ) \ + { \ + VXC_ReadImage(src1, boundaries, coord.zw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.z += 8; \ + \ + VXC_Clamp(src2, src0, src1.s00000000, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + _viv_asm(COPY, dst0, src2, 8); \ + dst0 = sub_sat(dst0, 0xFE); \ + VXC_Clamp(src2, src0, src1.s11111111, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + _viv_asm(COPY, dst1, src2, 8); \ + dst1 = 
sub_sat(dst1, 0xFE); \ + \ + result = result + dst0 + dst1; \ + \ + VXC_Clamp(src2, src0, src1.s22222222, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + _viv_asm(COPY, dst0, src2, 8); \ + dst0 = sub_sat(dst0, 0xFE); \ + VXC_Clamp(src2, src0, src1.s33333333, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + _viv_asm(COPY, dst1, src2, 8); \ + dst1 = sub_sat(dst1, 0xFE); \ + \ + result = result + dst0 + dst1; \ + \ + VXC_Clamp(src2, src0, src1.s44444444, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + _viv_asm(COPY, dst0, src2, 8); \ + dst0 = sub_sat(dst0, 0xFE); \ + VXC_Clamp(src2, src0, src1.s55555555, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + _viv_asm(COPY, dst1, src2, 8); \ + dst1 = sub_sat(dst1, 0xFE); \ + \ + result = result + dst0 + dst1; \ + \ + VXC_Clamp(src2, src0, src1.s66666666, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + _viv_asm(COPY, dst0, src2, 8); \ + dst0 = sub_sat(dst0, 0xFE); \ + VXC_Clamp(src2, src0, src1.s77777777, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + _viv_asm(COPY, dst1, src2, 8); \ + dst1 = sub_sat(dst1, 0xFE); \ + \ + result = result + dst0 + dst1; \ + } \ + \ + for (; coord.z < boundaries_size; ) \ + { \ + VXC_ReadImage(src1, boundaries, coord.zw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + coord.z ++; \ + \ + VXC_Clamp(src2, src0, src1.s00000000, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \ + _viv_asm(COPY, dst0, src2, 8); \ + dst0 = sub_sat(dst0, 0xFE); \ + \ + result = result + dst0; \ + } \ + \ + int4 d0, d1; \ + VXC_DP4x4(d0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniDataConvert_0_4x4); \ + VXC_DP4x4(d1, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniDataConvert_1_4x4); \ + coord.z = coord.x + 4; \ + \ + write_imagei(output, coord.xy, d0); \ + write_imagei(output, coord.zy, d1); \ +} +BUCKETIZE_8BITS_SH_IMPL(U8_U8toI32_2D, vxc_uchar8) +BUCKETIZE_8BITS_SH_IMPL(I8_I8toI32_2D, vxc_char8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_1.vx index 120e37e..3562ae5 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_1.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_1.vx @@ -98,7 +98,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name# vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \ half4 tmpVal0, tmpVal1; \ float alpha = scale_vari; \ - float alpha = scale_vari * input_scale; \ + alpha = scale_vari * input_scale; \ bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); \ bias_val = bias_val - input_zp * alpha; \ \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_0.vx index 1644ecd..4b4bf87 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_0.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_0.vx @@ -1,20 +1,14 @@ #include "cl_viv_vx_ext.h" _viv_uniform int width; -_viv_uniform int height; -_viv_uniform float inv_multiplier; -_viv_uniform int group_num; _viv_uniform VXC_512Bits uniExtract8Data_2x8; _viv_uniform VXC_512Bits uniSum_X_X2_16x2; _viv_uniform float input_scale; _viv_uniform float input_scale2; -_viv_uniform float input_zp; _viv_uniform float sum_x_tail; _viv_uniform float sum_x2_tail0; _viv_uniform float sum_x2_tail1; -_viv_uniform float output_scale; -_viv_uniform float output_zp; _viv_uniform VXC_512Bits uniSumX_16x1; _viv_uniform VXC_512Bits uniSumX2_16x1; @@ -23,7 +17,7 @@ _viv_uniform VXC_512Bits 
uniSumX2_16x1; __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_##name( \ __read_only image2d_array_t input, \ __write_only image2d_array_t output, \ - float eps, int rs_flag) \ + float eps, int height) \ { \ int gidx = get_global_id(0) << 4; \ int lidx = get_local_id(0); \ @@ -81,7 +75,7 @@ INSTANCE_NORM_SUMS_8BITS_IMPL(I8, vxc_char16) __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_##name##_2D( \ __read_only image2d_array_t input, \ __write_only image2d_array_t output, \ - float eps, int rs_flag) \ + float eps, int height) \ { \ int gidx = get_global_id(0) << 4; \ int lidx = get_local_id(0); \ @@ -134,18 +128,62 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums INSTANCE_NORM_SUMS_8BITS_IMPL_2D(U8, vxc_uchar16) INSTANCE_NORM_SUMS_8BITS_IMPL_2D(I8, vxc_char16) +__kernel void instance_norm_means +( + __read_only image2d_t sums, + __read_only image2d_t bias, + __read_only image2d_t scale, + __write_only image2d_t means, + float eps, + float in_time_out_scale, + float input_zp, + float output_scale, + float output_zp, + float inv_multiplier, + int group_num +) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + Image sums_img = create_image_from_image2d(sums, 4); + float4 *sums_ptr = (float4 *)get_image_ptr_from_coord(sums_img, coord); + + float alpha = read_imagef(scale, coord).x; + float beta = read_imagef(bias, coord).x; + + float4 mean_var = sums_ptr[0]; + for(int i = 1; i < group_num;) + { + mean_var += sums_ptr[i]; + i ++; + } + + mean_var *= inv_multiplier; + mean_var.s1 = mean_var.s1 - mean_var.s0 * mean_var.s0 + eps; + mean_var.s1 = rsqrt(mean_var.s1); + + alpha = alpha * mean_var.y; + + float4 dst; + dst.x = in_time_out_scale * alpha; + beta = (beta - alpha * mean_var.x) * output_scale + output_zp; + dst.y = beta - input_zp * dst.x; + + Image means_img = create_image_from_image2d(means, 4); + float4 *means_ptr = (float4 *)get_image_ptr_from_coord(means_img, coord); + means_ptr[0] = dst.xyxy; +} + _viv_uniform VXC_512Bits uniDataToFP32_0_4x4; _viv_uniform VXC_512Bits uniDataToFP32_1_4x4; _viv_uniform VXC_512Bits uniDataToFP32_2_4x4; _viv_uniform VXC_512Bits uniDataToFP32_3_4x4; #define INSTANCE_NORM_8BITS_IMPL(name, src_type, dst_type) \ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name( \ +__kernel void instance_norm_##name( \ __read_only image2d_array_t input, \ - __read_only image2d_t bias, \ - __read_only image2d_t scale, \ - __read_only image2d_t meanVari, \ + __read_only image2d_t means, \ __write_only image2d_array_t output, \ - float eps, int rs_flag) \ + int height) \ { \ int gidz = get_global_id(1); \ int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \ @@ -153,26 +191,11 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##na int2 coord_para = (int2)(0, gidz); \ src_type src0; \ dst_type dst; \ - float scale_vari, bias_val; \ - float4 bias_f, scale_f, mean_vari = (float4)(0); \ + float4 coef; \ \ - scale_f = read_imagef(scale, coord_para); \ - bias_f = read_imagef(bias, coord_para); \ - for(int i = 0; i < group_num; i++) \ - { \ - mean_vari += read_imagef(meanVari, coord_para); \ - coord_para.x += 4; \ - } \ - mean_vari *= inv_multiplier; \ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \ - mean_vari.s1 = rsqrt(mean_vari.s1); \ - \ - scale_vari = scale_f.s0 * mean_vari.s1; \ - vxc_int4 tmpVal0, tmpVal1; \ + coef = read_imagef(means, coord_para); \ + int4 tmpVal0, tmpVal1; \ 
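+    /* coef.xy is the fused per-channel scale/offset precomputed by the instance_norm_means kernel above, so the loop below applies normalization as a single data * coef.x + coef.y */ \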
float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \ - float alpha = input_scale * output_scale * scale_vari; \ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \ - bias_val = bias_val - input_zp * alpha; \ \ int8 input_desc, output_desc; \ _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \ @@ -191,14 +214,14 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##na VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \ VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \ VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); \ - norm = tmpData0 * alpha + bias_val; \ + norm = tmpData0 * coef.x + coef.y; \ tmpVal0 = convert_int4_rte(norm); \ - norm = tmpData1 * alpha + bias_val; \ + norm = tmpData1 * coef.x + coef.y; \ tmpVal1 = convert_int4_rte(norm); \ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ - norm = tmpData2 * alpha + bias_val; \ + norm = tmpData2 * coef.x + coef.y; \ tmpVal0 = convert_int4_rte(norm); \ - norm = tmpData3 * alpha + bias_val; \ + norm = tmpData3 * coef.x + coef.y; \ tmpVal1 = convert_int4_rte(norm); \ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ VXC_OP4_NoDest(img_store_3d, output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \ @@ -208,60 +231,46 @@ INSTANCE_NORM_8BITS_IMPL(U8_F32toU8, vxc_uchar16, vxc_uchar16) INSTANCE_NORM_8BITS_IMPL(I8_F32toI8, vxc_char16, vxc_char16) #define INSTANCE_NORM_8BITS_IMPL_2D(name, src_type, dst_type) \ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name##_2D( \ +__kernel void instance_norm_##name##_2D( \ __read_only image2d_array_t input, \ - __read_only image2d_t bias, \ - __read_only image2d_t scale, \ - __read_only image2d_t meanVari, \ + __read_only image2d_t means, \ __write_only image2d_array_t output, \ - float eps, int rs_flag) \ + int height) \ { \ int gidz = get_global_id(1); \ int gidy = gidz * height; \ - int2 coord = (int2)(get_global_id(0), gidy); \ + int4 coord; \ int2 coord_para = (int2)(0, gidz); \ int endH = gidy + height; \ src_type src0; \ dst_type dst; \ - float scale_vari, bias_val; \ - float4 bias_f, scale_f, mean_vari = (float4)(0); \ + float4 coef; \ \ - scale_f = read_imagef(scale, coord_para); \ - bias_f = read_imagef(bias, coord_para); \ - for(int i = 0; i < group_num; i++) \ - { \ - mean_vari += read_imagef(meanVari, coord_para); \ - coord_para.x += 4; \ - } \ - mean_vari *= inv_multiplier; \ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \ - mean_vari.s1 = rsqrt(mean_vari.s1); \ - \ - scale_vari = scale_f.s0 * mean_vari.s1; \ - vxc_int4 tmpVal0, tmpVal1; \ + coef = read_imagef(means, coord_para); \ + int4 tmpVal0, tmpVal1; \ float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \ - float alpha = input_scale * output_scale * scale_vari; \ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \ - bias_val = bias_val - input_zp * alpha; \ \ - for(; coord.y < endH; coord.y++) \ + coord = (int4)(get_global_id(0), gidy, gidy - 1, gidy - 1); \ + \ + for(; coord.y < endH; ) \ { \ - VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + coord.yz++; \ VXC_DP4x4(tmpData0, src0, src0, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \ VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \ VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \ VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); \ - norm = tmpData0 * alpha + bias_val; \ + norm = tmpData0 * coef.x + coef.y; \ tmpVal0 = convert_int4_rte(norm); \ - norm = tmpData1 * alpha + bias_val; \ + norm = tmpData1 * coef.x + coef.y; \ tmpVal1 = convert_int4_rte(norm); \ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ - norm = tmpData2 * alpha + bias_val; \ + norm = tmpData2 * coef.x + coef.y; \ tmpVal0 = convert_int4_rte(norm); \ - norm = tmpData3 * alpha + bias_val; \ + norm = tmpData3 * coef.x + coef.y; \ tmpVal1 = convert_int4_rte(norm); \ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(output, coord.xz, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ } \ } INSTANCE_NORM_8BITS_IMPL_2D(U8_F32toU8, vxc_uchar16, vxc_uchar16) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_1.vx index 82d1704..322dac5 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_1.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_1.vx @@ -1,11 +1,5 @@ #include "cl_viv_vx_ext.h" -_viv_uniform int width; -_viv_uniform int height; -_viv_uniform float inv_multiplier; -_viv_uniform int group_num; -_viv_uniform float input_scale; -_viv_uniform float input_zp; _viv_uniform VXC_512Bits uniExtract8Data_2x8; _viv_uniform VXC_512Bits uniDataToFP32_0_4x4; @@ -14,13 +8,11 @@ _viv_uniform VXC_512Bits uniDataToFP32_2_4x4; _viv_uniform VXC_512Bits uniDataToFP32_3_4x4; #define INSTANCE_NORM_8_TO_F16_IMPL(name, src_type) \ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name( \ +__kernel void instance_norm_##name( \ __read_only image2d_array_t input, \ - __read_only image2d_t bias, \ - __read_only image2d_t scale, \ - __read_only image2d_t meanVari, \ + __read_only image2d_t means, \ __write_only image2d_array_t output, \ - float eps, int rs_flag) \ + int height) \ { \ int gidz = get_global_id(1); \ int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \ @@ -28,25 +20,11 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##na src_type src0; \ vxc_short8 outval; \ vxc_half8 dst; \ - float scale_vari, bias_val; \ - float4 bias_f, scale_f, mean_vari = (float4)(0); \ + float4 coef; \ \ - scale_f = read_imagef(scale, coord_para.xy); \ - bias_f = read_imagef(bias, coord_para.xy); \ - for(int i = 0; i < group_num; i++) \ - { \ - mean_vari += read_imagef(meanVari, coord_para.xy); \ - coord_para.x += 4; \ - } \ - mean_vari *= inv_multiplier; \ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \ - mean_vari.s1 = rsqrt(mean_vari.s1); \ - scale_vari = scale_f.s0 * mean_vari.s1; \ + coef = read_imagef(means, coord_para.xy); \ float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \ half4 tmpVal0, tmpVal1; \ - float alpha = scale_vari * input_scale; \ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); \ - bias_val = bias_val - input_zp * alpha; \ \ coord_para = coord; \ int8 input_desc, 
output_desc; \ @@ -67,17 +45,17 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##na VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \ VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \ VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); \ - norm = alpha * tmpData0 + bias_val; \ + norm = tmpData0 * coef.x + coef.y; \ _viv_asm(CONV, tmpVal0, norm); \ - norm = alpha * tmpData1 + bias_val; \ + norm = tmpData1 * coef.x + coef.y; \ _viv_asm(CONV, tmpVal1, norm); \ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ _viv_asm(COPY, outval, dst, 16); \ VXC_OP4_NoDest(img_store_3d, output, coord_para, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \ coord_para.x += 8; \ - norm = alpha * tmpData2 + bias_val; \ + norm = tmpData2 * coef.x + coef.y; \ _viv_asm(CONV, tmpVal0, norm); \ - norm = alpha * tmpData3 + bias_val; \ + norm = tmpData3 * coef.x + coef.y; \ _viv_asm(CONV, tmpVal1, norm); \ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ _viv_asm(COPY, outval, dst, 16); \ @@ -88,13 +66,11 @@ INSTANCE_NORM_8_TO_F16_IMPL(U8_F32toF16, vxc_uchar16) INSTANCE_NORM_8_TO_F16_IMPL(I8_F32toF16, vxc_char16) #define INSTANCE_NORM_8_TO_F16_IMPL_2D(name, src_type) \ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name##_2D( \ +__kernel void instance_norm_##name##_2D( \ __read_only image2d_array_t input, \ - __read_only image2d_t bias, \ - __read_only image2d_t scale, \ - __read_only image2d_t meanVari, \ + __read_only image2d_t means, \ __write_only image2d_array_t output, \ - float eps, int rs_flag) \ + int height) \ { \ int gidz = get_global_id(1); \ int gidy = gidz * height; \ @@ -104,26 +80,11 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##na src_type src0; \ vxc_short8 outval; \ vxc_half8 dst; \ - float scale_vari, bias_val; \ - float4 bias_f, scale_f, mean_vari = (float4)(0); \ + float4 coef; \ \ - scale_f = read_imagef(scale, coord_para.xy); \ - bias_f = read_imagef(bias, coord_para.xy); \ - for(int i = 0; i < group_num; i++) \ - { \ - mean_vari += read_imagef(meanVari, coord_para.xy); \ - coord_para.x += 4; \ - } \ - mean_vari *= inv_multiplier; \ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \ - mean_vari.s1 = rsqrt(mean_vari.s1); \ - \ - scale_vari = scale_f.s0 * mean_vari.s1; \ + coef = read_imagef(means, coord_para.xy); \ float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \ half4 tmpVal0, tmpVal1; \ - float alpha = scale_vari * input_scale; \ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); \ - bias_val = bias_val - input_zp * alpha; \ for(; coord.y < endH;) \ { \ VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ @@ -133,17 +94,17 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##na VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \ VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \ VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); \ - norm = alpha * tmpData0 + bias_val; \ + norm = tmpData0 * coef.x + coef.y; \ _viv_asm(CONV, tmpVal0, norm); \ - norm = alpha * tmpData1 + bias_val; \ + norm = tmpData1 * coef.x + 
coef.y; \ _viv_asm(CONV, tmpVal1, norm); \ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ _viv_asm(COPY, outval, dst, 16); \ VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ coord_para.x += 8; \ - norm = alpha * tmpData2 + bias_val; \ + norm = tmpData2 * coef.x + coef.y; \ _viv_asm(CONV, tmpVal0, norm); \ - norm = alpha * tmpData3 + bias_val; \ + norm = tmpData3 * coef.x + coef.y; \ _viv_asm(CONV, tmpVal1, norm); \ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ _viv_asm(COPY, outval, dst, 16); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_2.vx index 75221f4..2289baf 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_2.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_2.vx @@ -1,28 +1,21 @@ #include "cl_viv_vx_ext.h" _viv_uniform int width; -_viv_uniform int height; -_viv_uniform float inv_multiplier; -_viv_uniform int group_num; _viv_uniform VXC_512Bits uniDataToFP32_0_4x4; _viv_uniform VXC_512Bits uniDataToFP32_1_4x4; _viv_uniform VXC_512Bits uniExtract8Data_2x8; _viv_uniform VXC_512Bits uniSum_X_X2_8x2; _viv_uniform float input_scale; _viv_uniform float input_scale2; -_viv_uniform float input_zp; _viv_uniform float sum_x_tail; _viv_uniform float sum_x2_tail0; _viv_uniform float sum_x2_tail1; -_viv_uniform float output_scale; -_viv_uniform float output_zp; - #define INSTANCE_NORM_SUMS_16BITS_IMPL(name, src_type) \ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_##name( \ __read_only image2d_array_t input, \ __write_only image2d_array_t output, \ - float eps, int rs_flag) \ + float eps, int height) \ { \ int gidx = get_global_id(0) << 3; \ int lidx = get_local_id(0); \ @@ -87,7 +80,7 @@ INSTANCE_NORM_SUMS_16BITS_IMPL(I16, vxc_short8) __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_##name##_2D( \ __read_only image2d_array_t input, \ __write_only image2d_array_t output, \ - float eps, int rs_flag) \ + float eps, int height) \ { \ int gidx = get_global_id(0) << 3; \ int lidx = get_local_id(0); \ @@ -146,13 +139,11 @@ INSTANCE_NORM_SUMS_16BITS_IMPL_2D(F16, vxc_half8) INSTANCE_NORM_SUMS_16BITS_IMPL_2D(I16, vxc_short8) #define INSTANCE_NORM_16BITS_IMPL(name, src_type, dst_type, copy_type, conv_type) \ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name( \ +__kernel void instance_norm_##name( \ __read_only image2d_array_t input, \ - __read_only image2d_t bias, \ - __read_only image2d_t scale, \ - __read_only image2d_t meanVari, \ + __read_only image2d_t means, \ __write_only image2d_array_t output, \ - float eps, int rs_flag) \ + int height) \ { \ int gidz = get_global_id(1); \ int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \ @@ -160,28 +151,13 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##na int4 coord_para = (int4)(0, gidz, 0, 0); \ vxc_short8 src0; \ src_type in_h; \ - float scale_vari, bias_val; \ - float4 bias_f, scale_f, mean_vari = (float4)(0); \ + float4 coef; \ \ - scale_f = read_imagef(scale, coord_para.xy); \ - bias_f = read_imagef(bias, coord_para.xy); \ + coef = read_imagef(means, coord_para.xy); \ \ - for(int i = 0; i < group_num; i++) \ - { \ - mean_vari += read_imagef(meanVari, coord_para.xy); \ - coord_para.x += 4; \ - } \ - mean_vari *= 
inv_multiplier; \ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \ - mean_vari.s1 = rsqrt(mean_vari.s1); \ - \ - scale_vari = scale_f.s0 * mean_vari.s1; \ - float alpha = input_scale * output_scale * scale_vari; \ float4 tmpData0, tmpData1; \ copy_type outval; \ conv_type tmpVal0, tmpVal1; \ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \ - bias_val = bias_val - input_zp * alpha; \ dst_type dst; \ \ int8 input_desc, output_desc; \ @@ -204,9 +180,9 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##na VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \ \ float4 norm; \ - norm = alpha * tmpData0 + bias_val; \ + norm = tmpData0 * coef.x + coef.y; \ _viv_asm(CONV_RTE, tmpVal0, norm); \ - norm = alpha * tmpData1 + bias_val; \ + norm = tmpData1 * coef.x + coef.y; \ _viv_asm(CONV_RTE, tmpVal1, norm); \ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ _viv_asm(COPY, outval, dst, 16); \ @@ -221,13 +197,11 @@ INSTANCE_NORM_16BITS_IMPL(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4 INSTANCE_NORM_16BITS_IMPL(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, half4) #define INSTANCE_NORM_16BITS_IMPL_2D(name, src_type, dst_type, copy_type, conv_type) \ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name##_2D( \ +__kernel void instance_norm_##name##_2D( \ __read_only image2d_array_t input, \ - __read_only image2d_t bias, \ - __read_only image2d_t scale, \ - __read_only image2d_t meanVari, \ + __read_only image2d_t means, \ __write_only image2d_array_t output, \ - float eps, int rs_flag) \ + int height) \ { \ int gidz = get_global_id(1); \ int gidy = gidz * height; \ @@ -236,28 +210,13 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##na int endH = gidy + height; \ vxc_short8 src0; \ src_type in_h; \ - float scale_vari, bias_val; \ - float4 bias_f, scale_f, mean_vari = (float4)(0); \ + float4 coef; \ \ - scale_f = read_imagef(scale, coord_para.xy); \ - bias_f = read_imagef(bias, coord_para.xy); \ + coef = read_imagef(means, coord_para.xy); \ \ - for(int i = 0; i < group_num; i++) \ - { \ - mean_vari += read_imagef(meanVari, coord_para.xy); \ - coord_para.x += 4; \ - } \ - mean_vari *= inv_multiplier; \ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \ - mean_vari.s1 = rsqrt(mean_vari.s1); \ - \ - scale_vari = scale_f.s0 * mean_vari.s1; \ - float alpha = input_scale * output_scale * scale_vari; \ float4 tmpData0, tmpData1; \ copy_type outval; \ conv_type tmpVal0, tmpVal1; \ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \ - bias_val = bias_val - input_zp * alpha; \ dst_type dst; \ \ for(; coord.y < endH; coord.y++) \ @@ -268,9 +227,9 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##na VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \ VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \ float4 norm; \ - norm = alpha * tmpData0 + bias_val; \ + norm = tmpData0 * coef.x + coef.y; \ _viv_asm(CONV, tmpVal0, norm); \ - norm = alpha * tmpData1 + bias_val; \ + norm = tmpData1 * coef.x + coef.y; \ _viv_asm(CONV, tmpVal1, norm); \ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \ _viv_asm(COPY, outval, dst, 16); \ diff --git 
a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_3.vx b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_3.vx index 19f335b..078cca3 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_3.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/instance_normalization_3.vx @@ -1,15 +1,13 @@ #include "cl_viv_vx_ext.h" _viv_uniform int width; -_viv_uniform int height; -_viv_uniform float inv_multiplier; -_viv_uniform int group_num; + _viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; _viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; _viv_uniform VXC_512Bits uniExtractOddData_2x8; __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_BF16( - image2d_array_t input, image2d_array_t output, float eps, int rsFlg) + image2d_array_t input, image2d_array_t output, float eps, int height) { int gidx = get_global_id(0) << 3; int lidx = get_local_id(0); @@ -70,7 +68,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums } __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_BF16_2D( - image2d_array_t input, image2d_array_t output, float eps, int rsFlg) + image2d_array_t input, image2d_array_t output, float eps, int height) { int gidx = get_global_id(0) << 3; int lidx = get_local_id(0); @@ -129,36 +127,21 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums } } -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16_F32toBF16( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int rsFlg) +__kernel void instance_norm_BF16_F32toBF16( + __read_only image2d_array_t input, + __read_only image2d_t means, + __write_only image2d_array_t output, + int height) { int gidz = get_global_id(1); int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz); vxc_short8 src0, src1, src2; - float scale_vari, bias_val; - float4 mean_vari = (float4)(0); + float4 coef; - Image img3 = create_image_from_image2d(meanVari, 4); - __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, (int2)(0, gidz)); - __global float4* vari_ptr = (__global float4*)sumVari_ptr; + coef = read_imagef(means, coord.yz); - float sval = read_imagef(scale, coord.yz).x; - float bval = read_imagef(bias, coord.yz).x; - - for(int i = 0; i < group_num; i++) - { - mean_vari += vari_ptr[i]; - } - - mean_vari *= inv_multiplier; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - scale_vari = sval * mean_vari.s1; float4 tmpData0, tmpData1; - bias_val = (bval - scale_vari * mean_vari.s0); int8 input_desc, output_desc; _viv_asm(COPY, input_desc, input, sizeof(input_desc)); @@ -171,6 +154,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16 for(coord.y = 0; coord.y < height; coord.y++) { + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); coord_in.y ++; @@ -182,9 +166,9 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16 _viv_asm(COPY, tmpData1, src2, 16); float4 norm; - norm = scale_vari * tmpData0 + bias_val; + norm = tmpData0 * coef.x + coef.y; _viv_asm(COPY, src0, norm, 16); - norm = scale_vari * tmpData1 + bias_val; + norm = tmpData1 * coef.x + coef.y; _viv_asm(COPY, src1, norm, 16); VXC_DP2x8(src2, 
src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); VXC_OP4_NoDest(img_store_3d, output, coord, src2, \ @@ -192,41 +176,27 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16 } } -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16F32toBF16_2D( - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, - image2d_array_t output, float eps, int rsFlg) +__kernel void instance_norm_BF16_F32toBF16_2D( + __read_only image2d_array_t input, + __read_only image2d_t means, + __write_only image2d_array_t output, + int height) { int gidz = get_global_id(1); int gidy = gidz * height; int2 coord = (int2)(get_global_id(0), gidy); - int2 coord_para = (int2)(gidz, 0); + int2 coord_para = (int2)(0, gidz); int endH = gidy + height; vxc_short8 src0, src1, src2; - float scale_vari, bias_val; - float4 mean_vari = (float4)(0); + float4 coef; - Image img3 = create_image_from_image2d(meanVari, 4); - __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx); - __global float4* vari_ptr = (__global float4*)sumVari_ptr; + coef = read_imagef(means, coord_para); - float sval = read_imagef(scale, coord_para.yx).x; - float bval = read_imagef(bias, coord_para.yx).x; - - for(int i = 0; i < group_num; i++) - { - mean_vari += vari_ptr[i]; - } - - mean_vari *= inv_multiplier; - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; - mean_vari.s1 = rsqrt(mean_vari.s1); - - scale_vari = sval * mean_vari.s1; float4 tmpData0, tmpData1; - bias_val = (bval - scale_vari * mean_vari.s0); for(; coord.y < endH; coord.y++) { + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), @@ -237,9 +207,9 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16 _viv_asm(COPY, tmpData1, src2, 16); float4 norm; - norm = scale_vari * tmpData0 + bias_val; + norm = tmpData0 * coef.x + coef.y; _viv_asm(COPY, src0, norm, 16); - norm = scale_vari * tmpData1 + bias_val; + norm = tmpData1 * coef.x + coef.y; _viv_asm(COPY, src1, norm, 16); VXC_DP2x8(src2, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/l2normalizescale_axis0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/l2normalizescale_axis0.vx index 95d9c87..672c61f 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/l2normalizescale_axis0.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/l2normalizescale_axis0.vx @@ -150,7 +150,7 @@ _viv_uniform int inputZP; VXC_Vstore3(dst_ptr, 0, dst.s012); \ break; \ case 4: \ - VXC_Vstore4(dst_ptr, 0, dst.0123); \ + VXC_Vstore4(dst_ptr, 0, dst.s0123); \ break; \ case 5: \ VXC_Vstore2(dst_ptr, 0, dst.s01); \ @@ -165,7 +165,7 @@ _viv_uniform int inputZP; VXC_Vstore3(dst_ptr, 0, dst.s012); \ break; \ case 7: \ - VXC_Vstore4(dst_ptr, 0, dst.0123); \ + VXC_Vstore4(dst_ptr, 0, dst.s0123); \ dst.s012 = dst.s456; \ dst_ptr += 4; \ VXC_Vstore3(dst_ptr, 0, dst.s012); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_i16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_i16.vx index b4db308..07ede71 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_i16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/matrixmul_i16.vx 
@@ -10,6 +10,11 @@ _viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; _viv_uniform int ac2zero; _viv_uniform int bc2zero; +_viv_uniform VXC_512Bits uniI16MulI16SumtoI32_16x1; +_viv_uniform VXC_512Bits uniI16MulI16SumtoI32B_16x1; +_viv_uniform float inout_beta; +_viv_uniform float inout_scale; + #define GEMM_QINT_TO_QINT(src0_type_name, read_type) \ __kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name( \ image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, \ @@ -102,3 +107,139 @@ __kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name( \ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \ } GEMM_QINT_TO_QINT(I16, vxc_short8) + +__kernel void gemm_transb_I16I16toI16(image2d_array_t inputA, + image2d_array_t inputB, image2d_array_t output, + int transposeA, int transposeB, int adjointA, int adjointB, + uint M, uint K, uint N) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2)); + int4 coord_a = (int4)(0, coord_out.y, (ac2zero ? 0 : get_global_id(2)), 0); + int4 coord_b = (int4)(0, coord_out.x, (bc2zero ? 0 : get_global_id(2)), 0); + + vxc_float4 sum0 = (vxc_float4)(0); + vxc_float4 sum1 = (vxc_float4)(0); + vxc_float4 sum2 = (vxc_float4)(0); + vxc_float4 sum3 = (vxc_float4)(0); + + int8 inputA_desc, inputB_desc, output_desc; + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc)); + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0; + _viv_asm(MOV, coord_a.w, baseAddr_a); + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc)); + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0; + _viv_asm(MOV, coord_b.w, baseAddr_b); + + for(coord_a.x = 0, coord_b.x = 0; coord_a.x < K;) + { + vxc_short8 srcA0,srcA1,srcA2,srcA3; + vxc_short8 srcB0,srcB1,srcB2,srcB3; + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB2, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + VXC_OP4(img_load_3d, srcB3, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3), + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord_a.x += 8; + coord_b.x += 8; + + vxc_int4 iVal; + vxc_float4 fpVal; + VXC_DP16x1(iVal, srcA0, srcB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32_16x1); + VXC_DP16x1(fpVal, iVal, srcB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32B_16x1); + VXC_DP16x1(iVal, srcA0, srcB1, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32_16x1); + VXC_DP16x1(fpVal, iVal, srcB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32B_16x1); + VXC_DP16x1(iVal, srcA0, srcB2, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32_16x1); + VXC_DP16x1(fpVal, iVal, srcB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32B_16x1); + 
VXC_DP16x1(iVal, srcA0, srcB3, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32_16x1); + VXC_DP16x1(fpVal, iVal, srcB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32B_16x1); + sum0 = sum0 + fpVal * inout_scale + inout_beta; + + VXC_DP16x1(iVal, srcA1, srcB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32_16x1); + VXC_DP16x1(fpVal, iVal, srcB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32B_16x1); + VXC_DP16x1(iVal, srcA1, srcB1, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32_16x1); + VXC_DP16x1(fpVal, iVal, srcB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32B_16x1); + VXC_DP16x1(iVal, srcA1, srcB2, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32_16x1); + VXC_DP16x1(fpVal, iVal, srcB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32B_16x1); + VXC_DP16x1(iVal, srcA1, srcB3, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32_16x1); + VXC_DP16x1(fpVal, iVal, srcB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32B_16x1); + sum1 = sum1 + fpVal * inout_scale + inout_beta; + + VXC_DP16x1(iVal, srcA2, srcB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32_16x1); + VXC_DP16x1(fpVal, iVal, srcB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32B_16x1); + VXC_DP16x1(iVal, srcA2, srcB1, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32_16x1); + VXC_DP16x1(fpVal, iVal, srcB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32B_16x1); + VXC_DP16x1(iVal, srcA2, srcB2, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32_16x1); + VXC_DP16x1(fpVal, iVal, srcB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32B_16x1); + VXC_DP16x1(iVal, srcA2, srcB3, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32_16x1); + VXC_DP16x1(fpVal, iVal, srcB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32B_16x1); + sum2 = sum2 + fpVal * inout_scale + inout_beta; + + VXC_DP16x1(iVal, srcA3, srcB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32_16x1); + VXC_DP16x1(fpVal, iVal, srcB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32B_16x1); + VXC_DP16x1(iVal, srcA3, srcB1, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32_16x1); + VXC_DP16x1(fpVal, iVal, srcB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32B_16x1); + VXC_DP16x1(iVal, srcA3, srcB2, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32_16x1); + VXC_DP16x1(fpVal, iVal, srcB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32B_16x1); + VXC_DP16x1(iVal, srcA3, srcB3, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32_16x1); + VXC_DP16x1(fpVal, iVal, srcB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0), + uniI16MulI16SumtoI32B_16x1); + sum3 = sum3 + fpVal * inout_scale + inout_beta; + } + vxc_int4 tmpOut0, tmpOut1; + vxc_short8 valDst; + tmpOut0 = convert_int4_rte(sum0); + tmpOut1 = convert_int4_rte(sum1); + VXC_DP2x8(valDst, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, coord_out, valDst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_WriteImage2DArray(output, coord_out, valDst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + tmpOut0 = convert_int4_rte(sum2); + tmpOut1 = 
convert_int4_rte(sum3); + VXC_DP2x8(valDst, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, coord_out, valDst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_WriteImage2DArray(output, coord_out, valDst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_copy.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_copy.vx new file mode 100644 index 0000000..41f1c08 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_copy.vx @@ -0,0 +1,86 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int bOrder; +_viv_uniform int rOrder; + +_viv_uniform float outputScaleVar; +_viv_uniform float bMeanScaleVarZp; +_viv_uniform float gMeanScaleVarZp; +_viv_uniform float rMeanScaleVarZp; + +_viv_uniform VXC_512Bits uniConvertNV12toB_4x4; +_viv_uniform VXC_512Bits uniConvertNV12toG_4x4; +_viv_uniform VXC_512Bits uniConvertNV12toR_4x4; + +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uniExtractUVtoCharSub128_2x8; + +#define NV12_COPY_SH_IMPL(name, dst_type, conv_type, save_type, copy_bytes) \ +__kernel void pre_process_nv12_copy_##name \ + ( \ + __read_only image2d_array_t y_img, \ + __read_only image2d_array_t uv_img, \ + __write_only image2d_array_t output, \ + global int* xRatio, \ + global int* yRatio, \ + global int* xOffset, \ + global int* yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float var, \ + int reverse_channel, \ + int trans \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + \ + int sy = gidy + (*yOffset); \ + int sx = gidx + (*xOffset); \ + int uvX = sx & 0xfffffffe; \ + int uvY = sy >> 1; \ + \ + vxc_uchar16 Y, UV; \ + \ + VXC_ReadImage(Y, y_img, (int2)(sx,sy), 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(UV, uv_img,(int2)(uvX,uvY), 0,VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + vxc_char16 tmpUV; \ + short tmpVal = 128; \ + VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractUVtoCharSub128_2x8); \ + \ + float4 tmpDstB, tmpDstG, tmpDstR; \ + VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); \ + VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); \ + VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); \ + \ + conv_type result; \ + dst_type dst0; \ + save_type dst; \ + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \ + tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \ + _viv_asm(CONV_RTE, result, tmpDstB); \ + dstPos.z = bOrder; \ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst0, copy_bytes); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \ + _viv_asm(CONV_RTE, result, tmpDstG); \ + dstPos.z = 1; \ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst0, copy_bytes); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \ + _viv_asm(CONV_RTE, result, tmpDstR); \ + dstPos.z = rOrder; \ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, 
VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst0, copy_bytes); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +NV12_COPY_SH_IMPL(U8toU8, vxc_uchar8, int4, vxc_uchar8, 8) +NV12_COPY_SH_IMPL(U8toI8, vxc_char8, int4, vxc_char8, 8) +NV12_COPY_SH_IMPL(U8toI16, vxc_short8, int4, vxc_short8, 16) +NV12_COPY_SH_IMPL(U8toF16, vxc_half8, half4, vxc_short8, 16) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale.vx index 4c6f935..ac6ba3d 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale.vx @@ -8,151 +8,195 @@ _viv_uniform float bMeanScaleVarZp; _viv_uniform float gMeanScaleVarZp; _viv_uniform float rMeanScaleVarZp; -_viv_uniform uint xrIntFloat_16; -_viv_uniform uint yrIntFloat_16; +_viv_uniform uint xrIntFloat_16; +_viv_uniform uint yrIntFloat_16; _viv_uniform VXC_512Bits uniConvertNV12toB_4x4; _viv_uniform VXC_512Bits uniConvertNV12toG_4x4; _viv_uniform VXC_512Bits uniConvertNV12toR_4x4; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; _viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8; + +_viv_uniform VXC_512Bits uniExtract8Data_2x8; _viv_uniform VXC_512Bits uniConvertUVtoCharSub128_2x8; -__kernel void pre_process_nv12_scale_U8toI16( - __read_only image2d_t y_img, __read_only image2d_t uv_img, - __write_only image2d_array_t output, - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) -{ - uint4 gidx = get_global_id(0); - uint gidy = get_global_id(1); - gidx += (uint4)(0, 1, 2, 3); +_viv_uniform VXC_512Bits uniCalculateYShift_2x8; +_viv_uniform VXC_512Bits uniCalculateUVShift_2x8; - uint dy = (gidy * yrIntFloat_16) >> 16; - uint4 dx = (gidx * xrIntFloat_16) >> 16; - int sy = convert_int(dy) + (*yOffset); - int4 sx = convert_int4(dx) + (*xOffset); - int4 uvX = sx & 0xfffffffe; - int uvY = sy >> 1; - - vxc_uchar16 Y, UV; - int2 coord = (int2)(sx.x, sy); - int2 coord_uv = (int2)(uvX.x, uvY); - - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - coord.x = sx.y; - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - coord.x = sx.z; - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - coord.x = sx.w; - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - coord_uv.x = uvX.y; - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); - coord_uv.x = uvX.z; - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); - coord_uv.x = uvX.w; - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); - - vxc_char16 tmpUV; - short tmpVal = 128; - VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); - - float4 tmpDstB, tmpDstG, tmpDstR; - VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); - VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); 
- VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); - - int4 result; - vxc_short8 dst; - int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); - result = convert_int4_rte(tmpDstB * outputScaleVar + bMeanScaleVarZp); - dstPos.z = bOrder; - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - result = convert_int4_rte(tmpDstG * outputScaleVar + gMeanScaleVarZp); - dstPos.z = 1; - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - result = convert_int4_rte(tmpDstR * outputScaleVar + rMeanScaleVarZp); - dstPos.z = rOrder; - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); +#define NV12_OPT_SH_IMPL(name, dst_type, conv_type, save_type, copy_bytes) \ +__kernel void pre_process_nv12_scale_##name##_gq \ + ( \ + __read_only image2d_array_t y_img, \ + __read_only image2d_array_t uv_img, \ + __write_only image2d_array_t output, \ + global int* xRatio, \ + global int* yRatio, \ + global int* xOffset, \ + global int* yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float var, \ + int reverse_channel, \ + int trans \ + ) \ +{ \ + uint4 gidx = get_global_id(0); \ + uint gidy = get_global_id(1); \ + gidx += (uint4)(0, 1, 2, 3); \ + \ + uint dy = (gidy * yrIntFloat_16) >> 16; \ + uint4 dx = (gidx * xrIntFloat_16) >> 16; \ + int sy = convert_int(dy) + (*yOffset); \ + int4 sx = convert_int4(dx) + (*xOffset); \ + int4 uvX = sx & 0xfffffffe; \ + int uvY = sy >> 1; \ + \ + vxc_uchar16 Y, UV; \ + int2 coord = (int2)(sx.x, sy); \ + int2 coord_uv = (int2)(uvX.x, uvY); \ + VXC_ReadImage(Y, y_img, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; \ + vxc_uchar16 maskShiftUv = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; \ + int4 offsetUV = uvX - uvX.x; \ + \ + vxc_ushort8 diffY, diffUV; \ + _viv_asm(COPY, diffY, sx, 16); \ + _viv_asm(COPY, diffUV, offsetUV, 16); \ + \ + vxc_ushort8 constData = 8; \ + VXC_DP2x8(maskShift, diffY, constData, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \ + uniCalculateYShift_2x8); \ + VXC_DP2x8(maskShiftUv, diffUV, constData, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniCalculateUVShift_2x8); \ + VXC_BitExtract(Y, Y, Y, maskShift, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_BitExtract(UV, UV, UV, maskShiftUv, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + vxc_char16 tmpUV; \ + short tmpVal = 128; \ + VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); \ + \ + float4 tmpDstB, tmpDstG, tmpDstR; \ + VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); \ + VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); \ + VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); \ + \ + conv_type result; \ + dst_type dst0; \ + save_type dst; \ + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \ + tmpDstB = 
tmpDstB * outputScaleVar + bMeanScaleVarZp; \ + _viv_asm(CONV_RTE, result, tmpDstB); \ + dstPos.z = bOrder; \ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst0, copy_bytes); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \ + _viv_asm(CONV_RTE, result, tmpDstG); \ + dstPos.z = 1; \ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst0, copy_bytes); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \ + _viv_asm(CONV_RTE, result, tmpDstR); \ + dstPos.z = rOrder; \ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst0, copy_bytes); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ } +NV12_OPT_SH_IMPL(U8toU8, vxc_uchar8, int4, vxc_uchar8, 8) +NV12_OPT_SH_IMPL(U8toI8, vxc_char8, int4, vxc_char8, 8) +NV12_OPT_SH_IMPL(U8toI16, vxc_short8, int4, vxc_short8, 16) +NV12_OPT_SH_IMPL(U8toF16, vxc_half8, half4, vxc_short8, 16) -__kernel void pre_process_nv12_scale_U8toF16( - __read_only image2d_t y_img, __read_only image2d_t uv_img, - __write_only image2d_array_t output, - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) -{ - uint4 gidx = get_global_id(0); - uint gidy = get_global_id(1); - gidx += (uint4)(0, 1, 2, 3); - - uint dy = (gidy * yrIntFloat_16) >> 16; - uint4 dx = (gidx * xrIntFloat_16) >> 16; - int sy = convert_int(dy) + (*yOffset); - int4 sx = convert_int4(dx) + (*xOffset); - int4 uvX = sx & 0xfffffffe; - int uvY = sy >> 1; - - vxc_uchar16 Y, UV; - int2 coord = (int2)(sx.x, sy); - int2 coord_uv = (int2)(uvX.x, uvY); - - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - coord.x = sx.y; - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - coord.x = sx.z; - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - coord.x = sx.w; - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - coord_uv.x = uvX.y; - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); - coord_uv.x = uvX.z; - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); - coord_uv.x = uvX.w; - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); - - vxc_char16 tmpUV; - short tmpVal = 128; - VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); - - float4 tmpDstB, tmpDstG, tmpDstR; - VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); - VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); - VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); - - tmpDstB = tmpDstB * outputScaleVar + 
bMeanScaleVarZp; - tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; - tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; - - half4 result; - vxc_half8 tmpdst; - vxc_short8 dst; - int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); - _viv_asm(CONV, result, tmpDstB); - dstPos.z = bOrder; - VXC_DP2x8(tmpdst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); - _viv_asm(COPY, dst, tmpdst, 16); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - _viv_asm(CONV, result, tmpDstG); - dstPos.z = 1; - VXC_DP2x8(tmpdst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); - _viv_asm(COPY, dst, tmpdst, 16); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - _viv_asm(CONV, result, tmpDstR); - dstPos.z = rOrder; - VXC_DP2x8(tmpdst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); - _viv_asm(COPY, dst, tmpdst, 16); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); -} \ No newline at end of file +#define NV12_SH_IMPL(name, dst_type, conv_type, save_type, copy_bytes) \ +__kernel void pre_process_nv12_scale_##name \ + ( \ + __read_only image2d_array_t y_img, \ + __read_only image2d_array_t uv_img, \ + __write_only image2d_array_t output, \ + global int* xRatio, \ + global int* yRatio, \ + global int* xOffset, \ + global int* yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float var, \ + int reverse_channel, \ + int trans \ + ) \ +{ \ + uint4 gidx = get_global_id(0); \ + uint gidy = get_global_id(1); \ + gidx += (uint4)(0, 1, 2, 3); \ + \ + uint dy = (gidy * yrIntFloat_16) >> 16; \ + uint4 dx = (gidx * xrIntFloat_16) >> 16; \ + int sy = convert_int(dy) + (*yOffset); \ + int4 sx = convert_int4(dx) + (*xOffset); \ + int4 uvX = sx & 0xfffffffe; \ + int uvY = sy >> 1; \ + \ + vxc_uchar16 Y, UV; \ + int2 coord = (int2)(sx.x, sy); \ + int2 coord_uv = (int2)(uvX.x, uvY); \ + \ + VXC_ReadImage(Y, y_img, coord, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + coord.x = sx.y; \ + VXC_ReadImage(Y, y_img, coord, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ + coord.x = sx.z; \ + VXC_ReadImage(Y, y_img, coord, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \ + coord.x = sx.w; \ + VXC_ReadImage(Y, y_img, coord, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_uv.x = uvX.y; \ + VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + coord_uv.x = uvX.z; \ + VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_uv.x = uvX.w; \ + VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + vxc_char16 tmpUV; \ + short tmpVal = 128; \ + VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); \ + \ + float4 tmpDstB, tmpDstG, tmpDstR; \ + VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); \ + VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); \ + VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); \ + \ + conv_type result; \ + dst_type dst0; \ + save_type dst; \ + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \ + tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \ + 
_viv_asm(CONV_RTE, result, tmpDstB); \ + dstPos.z = bOrder; \ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst0, copy_bytes); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \ + _viv_asm(CONV_RTE, result, tmpDstG); \ + dstPos.z = 1; \ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst0, copy_bytes); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \ + _viv_asm(CONV_RTE, result, tmpDstR); \ + dstPos.z = rOrder; \ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst0, copy_bytes); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +NV12_SH_IMPL(U8toU8, vxc_uchar8, int4, vxc_uchar8, 8) +NV12_SH_IMPL(U8toI8, vxc_char8, int4, vxc_char8, 8) +NV12_SH_IMPL(U8toI16, vxc_short8, int4, vxc_short8, 16) +NV12_SH_IMPL(U8toF16, vxc_half8, half4, vxc_short8, 16) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale_8bits.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale_8bits.vx deleted file mode 100644 index c274c3c..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale_8bits.vx +++ /dev/null @@ -1,197 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform int bOrder; -_viv_uniform int rOrder; - -_viv_uniform float outputScaleVar; -_viv_uniform float bMeanScaleVarZp; -_viv_uniform float gMeanScaleVarZp; -_viv_uniform float rMeanScaleVarZp; - -_viv_uniform uint xrIntFloat_16; -_viv_uniform uint yrIntFloat_16; - -_viv_uniform VXC_512Bits uniConvertNV12toB_4x4; -_viv_uniform VXC_512Bits uniConvertNV12toG_4x4; -_viv_uniform VXC_512Bits uniConvertNV12toR_4x4; - -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; -_viv_uniform VXC_512Bits uniConvertUVtoCharSub128_2x8; -_viv_uniform VXC_512Bits uniExtractUVtoCharSub128_2x8; - -__kernel void pre_process_nv12_scale_U8toU8( - __read_only image2d_t y_img, __read_only image2d_t uv_img, - __write_only image2d_array_t output, - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) -{ - uint4 gidx = get_global_id(0); - uint gidy = get_global_id(1); - gidx += (uint4)(0, 1, 2, 3); - - uint dy = (gidy * yrIntFloat_16) >> 16; - uint4 dx = (gidx * xrIntFloat_16) >> 16; - int sy = convert_int(dy) + (*yOffset); - int4 sx = convert_int4(dx) + (*xOffset); - int4 uvX = sx & 0xfffffffe; - int uvY = sy >> 1; - - vxc_uchar16 Y, UV; - int2 coord = (int2)(sx.x, sy); - int2 coord_uv = (int2)(uvX.x, uvY); - - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - coord.x = sx.y; - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - coord.x = sx.z; - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - coord.x = sx.w; - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - coord_uv.x = uvX.y; - VXC_ReadImage(UV, 
uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); - coord_uv.x = uvX.z; - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); - coord_uv.x = uvX.w; - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); - - vxc_char16 tmpUV; - short tmpVal = 128; - VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); - - float4 tmpDstB, tmpDstG, tmpDstR; - VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); - VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); - VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); - - int4 result; - vxc_uchar8 dst; - int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); - result = convert_int4_rte(tmpDstB * outputScaleVar + bMeanScaleVarZp); - dstPos.z = bOrder; - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - result = convert_int4_rte(tmpDstG * outputScaleVar + gMeanScaleVarZp); - dstPos.z = 1; - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - result = convert_int4_rte(tmpDstR * outputScaleVar + rMeanScaleVarZp); - dstPos.z = rOrder; - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pre_process_nv12_copy_U8toU8( - __read_only image2d_t y_img, __read_only image2d_t uv_img, - __write_only image2d_array_t output, - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) -{ - int gidx = get_global_id(0); - int gidy = get_global_id(1); - - int sy = gidy + (*yOffset); - int sx = gidx + (*xOffset); - int uvX = sx & 0xfffffffe; - int uvY = sy >> 1; - - vxc_uchar16 Y, UV; - - VXC_ReadImage(Y, y_img, (int2)(sx,sy), VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(UV, uv_img,(int2)(uvX,uvY), VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - vxc_char16 tmpUV; - short tmpVal = 128; - VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractUVtoCharSub128_2x8); - - float4 tmpDstB, tmpDstG, tmpDstR; - VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); - VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); - VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); - - int4 result; - vxc_uchar8 dst; - int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); - result = convert_int4_rte(tmpDstB * outputScaleVar + bMeanScaleVarZp); - dstPos.z = bOrder; - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - result = convert_int4_rte(tmpDstG * outputScaleVar + gMeanScaleVarZp); - dstPos.z = 1; - VXC_DP2x8(dst, result, result, 
VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - result = convert_int4_rte(tmpDstR * outputScaleVar + rMeanScaleVarZp); - dstPos.z = rOrder; - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pre_process_nv12_scale_U8toI8( - __read_only image2d_t y_img, __read_only image2d_t uv_img, - __write_only image2d_array_t output, - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) -{ - uint4 gidx = get_global_id(0); - uint gidy = get_global_id(1); - gidx += (uint4)(0, 1, 2, 3); - - uint dy = (gidy * yrIntFloat_16) >> 16; - uint4 dx = (gidx * xrIntFloat_16) >> 16; - int sy = convert_int(dy) + (*yOffset); - int4 sx = convert_int4(dx) + (*xOffset); - int4 uvX = sx & 0xfffffffe; - int uvY = sy >> 1; - - vxc_uchar16 Y, UV; - int2 coord = (int2)(sx.x, sy); - int2 coord_uv = (int2)(uvX.x, uvY); - - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - coord.x = sx.y; - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - coord.x = sx.z; - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - coord.x = sx.w; - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - coord_uv.x = uvX.y; - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); - coord_uv.x = uvX.z; - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); - coord_uv.x = uvX.w; - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); - - vxc_char16 tmpUV; - short tmpVal = 128; - VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); - - float4 tmpDstB, tmpDstG, tmpDstR; - VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); - VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); - VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); - - int4 result; - vxc_char8 dst; - int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); - result = convert_int4_rte(tmpDstB * outputScaleVar + bMeanScaleVarZp); - dstPos.z = bOrder; - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - result = convert_int4_rte(tmpDstG * outputScaleVar + gMeanScaleVarZp); - dstPos.z = 1; - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - result = convert_int4_rte(tmpDstR * outputScaleVar + rMeanScaleVarZp); - dstPos.z = rOrder; - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); - 
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale_mix.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale_mix.vx deleted file mode 100644 index 0a4551f..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_nv12_scale_mix.vx +++ /dev/null @@ -1,162 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform int bOrder; -_viv_uniform int rOrder; - -_viv_uniform float outputScaleVar; -_viv_uniform float bMeanScaleVarZp; -_viv_uniform float gMeanScaleVarZp; -_viv_uniform float rMeanScaleVarZp; - -_viv_uniform uint xrIntFloat_16; -_viv_uniform uint yrIntFloat_16; - -_viv_uniform VXC_512Bits uniConvertNV12toB_4x4; -_viv_uniform VXC_512Bits uniConvertNV12toG_4x4; -_viv_uniform VXC_512Bits uniConvertNV12toR_4x4; - -_viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8; - -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; -_viv_uniform VXC_512Bits uniConvertUVtoCharSub128_2x8; - -_viv_uniform VXC_512Bits uniCalculateYShift_2x8; -_viv_uniform VXC_512Bits uniCalculateUVShift_2x8; - -__kernel void pre_process_nv12_scale_U8toU8_gq( - __read_only image2d_t y_img, __read_only image2d_t uv_img, - __write_only image2d_array_t output, - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) -{ - uint4 gidx = get_global_id(0); - uint gidy = get_global_id(1); - gidx += (uint4)(0, 1, 2, 3); - - uint dy = (gidy * yrIntFloat_16) >> 16; - uint4 dx = (gidx * xrIntFloat_16) >> 16; - int sy = convert_int(dy) + (*yOffset); - int4 sx = convert_int4(dx) + (*xOffset); - int4 uvX = sx & 0xfffffffe; - int uvY = sy >> 1; - - vxc_uchar16 Y, UV; - int2 coord = (int2)(sx.x, sy); - int2 coord_uv = (int2)(uvX.x, uvY); - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; - vxc_uchar16 maskShiftUv = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; - int4 offsetUV = uvX - uvX.x; - - vxc_ushort8 diffY, diffUV; - _viv_asm(COPY, diffY, sx, 16); - _viv_asm(COPY, diffUV, offsetUV, 16); - - vxc_ushort8 constData = 8; - VXC_DP2x8(maskShift, diffY, constData, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniCalculateYShift_2x8); - VXC_DP2x8(maskShiftUv, diffUV, constData, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniCalculateUVShift_2x8); - VXC_BitExtract(Y, Y, Y, maskShift, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - VXC_BitExtract(UV, UV, UV, maskShiftUv, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - vxc_char16 tmpUV; - short tmpVal = 128; - VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); - - float4 tmpDstB, tmpDstG, tmpDstR; - VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); - VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); - VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); - - int4 result; - vxc_uchar8 dst; - int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); - result = convert_int4_rte(tmpDstB * outputScaleVar + bMeanScaleVarZp); - dstPos.z = bOrder; - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - result = convert_int4_rte(tmpDstG * outputScaleVar + gMeanScaleVarZp); - dstPos.z = 1; - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - result = convert_int4_rte(tmpDstR * outputScaleVar + rMeanScaleVarZp); - dstPos.z = rOrder; - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pre_process_nv12_scale_U8toF16_gq( - __read_only image2d_t y_img, __read_only image2d_t uv_img, - __write_only image2d_array_t output, - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) -{ - uint4 gidx = get_global_id(0); - uint gidy = get_global_id(1); - gidx += (uint4)(0, 1, 2, 3); - - uint dy = (gidy * yrIntFloat_16) >> 16; - uint4 dx = (gidx * xrIntFloat_16) >> 16; - int sy = convert_int(dy) + (*yOffset); - int4 sx = convert_int4(dx) + (*xOffset); - int4 uvX = sx & 0xfffffffe; - int uvY = sy >> 1; - - vxc_uchar16 Y, UV; - int2 coord = (int2)(sx.x, sy); - int2 coord_uv = (int2)(uvX.x, uvY); - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - - vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; - vxc_uchar16 maskShiftUv = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; - int4 offsetUV = uvX - uvX.x; - - vxc_ushort8 diffY, diffUV; - _viv_asm(COPY, diffY, sx, 16); - _viv_asm(COPY, diffUV, offsetUV, 16); - - vxc_ushort8 constData = 8; - VXC_DP2x8(maskShift, diffY, constData, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniCalculateYShift_2x8); - VXC_DP2x8(maskShiftUv, diffUV, constData, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniCalculateUVShift_2x8); - VXC_BitExtract(Y, Y, Y, maskShift, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - VXC_BitExtract(UV, UV, UV, maskShiftUv, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - vxc_char16 tmpUV; - short tmpVal = 128; - VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); - - float4 tmpDstB, tmpDstG, tmpDstR; - VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); - VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); - VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); - - tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; - tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; - tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; - - half4 result; - vxc_half8 tmpdst; - vxc_short8 dst; - int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); - _viv_asm(CONV, result, tmpDstB); - dstPos.z = bOrder; - VXC_DP2x8(tmpdst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); - _viv_asm(COPY, dst, tmpdst, 16); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - _viv_asm(CONV, result, tmpDstG); - dstPos.z = 1; - VXC_DP2x8(tmpdst, result, result, 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); - _viv_asm(COPY, dst, tmpdst, 16); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - _viv_asm(CONV, result, tmpDstR); - dstPos.z = rOrder; - VXC_DP2x8(tmpdst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); - _viv_asm(COPY, dst, tmpdst, 16); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_copy.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_copy.vx new file mode 100644 index 0000000..25f981a --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_copy.vx @@ -0,0 +1,238 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniCalculateTmpR1st_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpR2nd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpR3rd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpR4th_4x4; +_viv_uniform VXC_512Bits uniCalculateR1st_4x4; + +_viv_uniform VXC_512Bits uniCalculateTmpG1st_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpG2nd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpG3rd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpG4th_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8; + +_viv_uniform VXC_512Bits uniCalculateG1st_4x4; +_viv_uniform VXC_512Bits uniCalculateG2nd_4x4; +_viv_uniform VXC_512Bits uniCalculateG3rd_4x4; +_viv_uniform VXC_512Bits uniCalculateG4th_4x4; + +_viv_uniform VXC_512Bits uniCalculateTmpB1st_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpB2nd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpB3rd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpB4th_4x4; +_viv_uniform VXC_512Bits uniCalculateB1st_4x4; + +_viv_uniform VXC_512Bits uniQuantU8toU8LoB_2x8; +_viv_uniform VXC_512Bits uniQuantU8toU8HiB_2x8; +_viv_uniform VXC_512Bits uniQuantU8toU8LoG_2x8; +_viv_uniform VXC_512Bits uniQuantU8toU8HiG_2x8; +_viv_uniform VXC_512Bits uniQuantU8toU8LoR_2x8; +_viv_uniform VXC_512Bits uniQuantU8toU8HiR_2x8; + +_viv_uniform int bOrder; +_viv_uniform int rOrder; +_viv_uniform float output_zp; +_viv_uniform float output_scale; + +#define YUV420_COPY_SH_IMPL(name, dst_type) \ +__kernel void pre_process_yuv420_copy_##name \ + ( \ + __read_only image2d_array_t y_img, \ + __read_only image2d_array_t u_img, \ + __read_only image2d_array_t v_img, \ + __write_only image2d_array_t output, \ + global int * xRatio, \ + global int * yRatio, \ + global int * xOffset, \ + global int * yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float var, \ + int reverse_channel, \ + int trans \ + ) \ +{ \ + int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0); \ + int2 pos1 = (int2)((get_global_id(0) + (*xOffset)) >> 1, (get_global_id(1) + (*yOffset)) >> 1); \ + vxc_uchar16 Y; \ + vxc_uchar8 U, V; \ + vxc_int4 C0, C1, C2, C3; \ + vxc_uchar16 R, G, B; \ + dst_type dst0, dst1, dst2; \ + \ + VXC_ReadImage(Y, y_img, pos.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(U, u_img, pos1.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, pos1.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + /*C = Y - 16;*/ \ + /*D = U - 128;*/ \ + /*E = V - 128;*/ \ + /* calculate R*/ \ + /* ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8]*/ \ + int tmpV = -56992; \ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniCalculateTmpR1st_4x4); \ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR2nd_4x4); \ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR3rd_4x4); \ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR4th_4x4); \ + \ + VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \ + VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \ + VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \ + VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \ + \ + /* calculate G*/ \ + /* ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8]*/ \ + /* 298Y - 208V*/ \ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG1st_4x4); \ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG2nd_4x4); \ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG3rd_4x4); \ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG4th_4x4); \ + /* 34784 - 100U*/ \ + ushort tmpG = 34784; \ + vxc_ushort8 tmpDstG; \ + VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); \ + VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4); \ + VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4); \ + VXC_DP4x4(G, C2, tmpDstG, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateG3rd_4x4); \ + VXC_DP4x4(G, C3, tmpDstG, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateG4th_4x4); \ + \ + /* calculate B*/ \ + /* ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8]*/ \ + VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB1st_4x4); \ + VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB2nd_4x4); \ + VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB3rd_4x4); \ + VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB4th_4x4); \ + tmpV = -70688; \ + VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ + VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ + VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ + VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ + \ + var *= output_scale; \ + float4 paramData = (float4)(bMean * var - output_zp, gMean * var - output_zp, \ + rMean * var - output_zp, var); \ + half4 paramData_f16; \ + _viv_asm(CONV, paramData_f16, paramData); \ + \ + VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8); \ + VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8); \ + \ + VXC_DP2x8(dst1, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8); \ + VXC_DP2x8(dst1, G, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8); \ + \ + VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8); \ + VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(8, 15, 0, 
VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8); \ + \ + pos = (int4)(get_global_id(0), get_global_id(1), 0, 0); \ + pos.z = bOrder; \ + VXC_WriteImage2DArray(output, pos, dst0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + pos.z = 1; \ + VXC_WriteImage2DArray(output, pos, dst1, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + pos.z = rOrder; \ + VXC_WriteImage2DArray(output, pos, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ +} +YUV420_COPY_SH_IMPL(U8toU8, vxc_uchar16) +YUV420_COPY_SH_IMPL(U8toI8, vxc_char16) + +#define YUV420_COPY_16BITS_SH_IMPL(name, dst_type) \ +__kernel void pre_process_yuv420_copy_##name \ + ( \ + __read_only image2d_array_t y_img, \ + __read_only image2d_array_t u_img, \ + __read_only image2d_array_t v_img, \ + __write_only image2d_array_t output, \ + global int * xRatio, \ + global int * yRatio, \ + global int * xOffset, \ + global int * yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float var, \ + int reverse_channel, \ + int trans \ + ) \ +{ \ + int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0); \ + int2 pos1 = (int2)((get_global_id(0) + (*xOffset)) >> 1, (get_global_id(1) + (*yOffset)) >> 1); \ + vxc_uchar16 Y; \ + vxc_uchar8 U, V; \ + vxc_int4 C0, C1, C2, C3; \ + vxc_uchar16 R, G, B; \ + dst_type dst0, dst1, dst2, dst3, dst4, dst5; \ + vxc_short8 out0, out1, out2, out3, out4, out5; \ + \ + VXC_ReadImage(Y, y_img, pos.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(U, u_img, pos1.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, pos1.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + int tmpV = -56992; \ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR1st_4x4); \ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR2nd_4x4); \ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR3rd_4x4); \ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR4th_4x4); \ + \ + VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \ + VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \ + VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \ + VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \ + \ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG1st_4x4); \ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG2nd_4x4); \ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG3rd_4x4); \ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG4th_4x4); \ + \ + ushort tmpG = 34784; \ + vxc_ushort8 tmpDstG; \ + VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); \ + VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4); \ + VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4); \ + VXC_DP4x4(G, C2, tmpDstG, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateG3rd_4x4); \ + VXC_DP4x4(G, C3, tmpDstG, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateG4th_4x4); \ + \ + VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB1st_4x4); \ + VXC_DP4x4(C1, Y, U, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB2nd_4x4); \ + VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB3rd_4x4); \ + VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB4th_4x4); \ + tmpV = -70688; \ + VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ + VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ + VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ + VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ + \ + var *= output_scale; \ + float4 paramData = (float4)(bMean * var - output_zp, gMean * var - output_zp, \ + rMean * var - output_zp, var); \ + half4 paramData_f16; \ + _viv_asm(CONV, paramData_f16, paramData); \ + \ + VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8); \ + VXC_DP2x8(dst1, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8); \ + \ + VXC_DP2x8(dst2, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8); \ + VXC_DP2x8(dst3, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8); \ + \ + VXC_DP2x8(dst4, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8); \ + VXC_DP2x8(dst5, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8); \ + \ + _viv_asm(COPY, out0, dst0, 16); \ + _viv_asm(COPY, out1, dst1, 16); \ + _viv_asm(COPY, out2, dst2, 16); \ + _viv_asm(COPY, out3, dst3, 16); \ + _viv_asm(COPY, out4, dst4, 16); \ + _viv_asm(COPY, out5, dst5, 16); \ + \ + pos = (int4)(get_global_id(0), get_global_id(1), bOrder, get_global_id(0) + 8); \ + VXC_WriteImage2DArray(output, pos.xyzz, out0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage2DArray(output, pos.wyzz, out1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + pos.z = 1; \ + VXC_WriteImage2DArray(output, pos.xyzz, out2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage2DArray(output, pos.wyzz, out3, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + pos.z = rOrder; \ + VXC_WriteImage2DArray(output, pos.xyzz, out4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage2DArray(output, pos.wyzz, out5, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ +} +YUV420_COPY_16BITS_SH_IMPL(U8toF16, vxc_half8) +YUV420_COPY_16BITS_SH_IMPL(U8toI16, vxc_short8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_copy_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_copy_u8.vx deleted file mode 100644 index bce976c..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_copy_u8.vx +++ /dev/null @@ -1,240 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits uniCalculateTmpR1st_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpR2nd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpR3rd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpR4th_4x4; -_viv_uniform VXC_512Bits uniCalculateR1st_4x4; - -_viv_uniform VXC_512Bits uniCalculateTmpG1st_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpG2nd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpG3rd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpG4th_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8; - -_viv_uniform VXC_512Bits uniCalculateG1st_4x4; -_viv_uniform VXC_512Bits uniCalculateG2nd_4x4; -_viv_uniform 
VXC_512Bits uniCalculateG3rd_4x4; -_viv_uniform VXC_512Bits uniCalculateG4th_4x4; - -_viv_uniform VXC_512Bits uniCalculateTmpB1st_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpB2nd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpB3rd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpB4th_4x4; -_viv_uniform VXC_512Bits uniCalculateB1st_4x4; - -_viv_uniform VXC_512Bits uniQuantU8toU8LoB_2x8; -_viv_uniform VXC_512Bits uniQuantU8toU8HiB_2x8; -_viv_uniform VXC_512Bits uniQuantU8toU8LoG_2x8; -_viv_uniform VXC_512Bits uniQuantU8toU8HiG_2x8; -_viv_uniform VXC_512Bits uniQuantU8toU8LoR_2x8; -_viv_uniform VXC_512Bits uniQuantU8toU8HiR_2x8; - -_viv_uniform int bOrder; -_viv_uniform int rOrder; -_viv_uniform int zp; -_viv_uniform float outputScale; - -__kernel void pre_process_yuv420_copy_U8toU8( - __read_only image2d_t y_img, - __read_only image2d_t u_img, - __read_only image2d_t v_img, - __write_only image2d_array_t output, - global int * xRatio, - global int * yRatio, - global int * xOffset, - global int * yOffset, - float rMean, - float gMean, - float bMean, - float var, - int reverse_channel, - int trans - ) -{ - int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0); - int2 pos1 = (int2)((get_global_id(0) + (*xOffset)) >> 1, (get_global_id(1) + (*yOffset)) >> 1); - vxc_uchar16 Y; - vxc_uchar8 U, V; - vxc_int4 C0, C1, C2, C3; - vxc_uchar16 R, G, B; - vxc_uchar16 dst0, dst1, dst2; - - VXC_ReadImage(Y, y_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(U, u_img, pos1.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, pos1.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - //C = Y - 16; - //D = U - 128; - //E = V - 128; - // calculate R - // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8] - int tmpV = -56992; - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR1st_4x4); - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR2nd_4x4); - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR3rd_4x4); - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR4th_4x4); - - VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - - // calculate G - // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8] - // 298Y - 208V - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG1st_4x4); - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG2nd_4x4); - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG3rd_4x4); - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG4th_4x4); - // 34784 - 100U - ushort tmpG = 34784; - vxc_ushort8 tmpDstG; - VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); - VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4); - VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4); - VXC_DP4x4(G, C2, tmpDstG, 
VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateG3rd_4x4); - VXC_DP4x4(G, C3, tmpDstG, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateG4th_4x4); - - // calculate B - // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8] - VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB1st_4x4); - VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB2nd_4x4); - VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB3rd_4x4); - VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB4th_4x4); - tmpV = -70688; - VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - - var *= outputScale; - float4 paramData = (float4)(bMean * var - zp, gMean * var - zp,\ - rMean * var - zp, var); - half4 paramData_f16; - _viv_asm(CONV, paramData_f16, paramData); - - VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8); - VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8); - - VXC_DP2x8(dst1, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8); - VXC_DP2x8(dst1, G, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8); - - VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8); - VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8); - - pos = (int4)(get_global_id(0), get_global_id(1), 0, 0); - pos.z = bOrder; - VXC_WriteImage2DArray(output, pos, dst0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - pos.z = 1; - VXC_WriteImage2DArray(output, pos, dst1, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - pos.z = rOrder; - VXC_WriteImage2DArray(output, pos, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); -} - -__kernel void pre_process_yuv420_copy_U8toF16( - __read_only image2d_t y_img, - __read_only image2d_t u_img, - __read_only image2d_t v_img, - __write_only image2d_array_t output, - global int * xRatio, - global int * yRatio, - global int * xOffset, - global int * yOffset, - float rMean, - float gMean, - float bMean, - float var, - int reverse_channel, - int trans - ) -{ - int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0); - int2 pos1 = (int2)((get_global_id(0) + (*xOffset)) >> 1, (get_global_id(1) + (*yOffset)) >> 1); - vxc_uchar16 Y; - vxc_uchar8 U, V; - vxc_int4 C0, C1, C2, C3; - vxc_uchar16 R, G, B; - vxc_half8 dst0, dst1, dst2, dst3, dst4, dst5; - vxc_short8 out0, out1, out2, out3, out4, out5; - - VXC_ReadImage(Y, y_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(U, u_img, pos1.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, pos1.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - - //C = Y - 16; - //D = U - 128; - //E = V - 128; - // calculate R - // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8] - int tmpV = -56992; - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 
0), uniCalculateTmpR1st_4x4); - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR2nd_4x4); - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR3rd_4x4); - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR4th_4x4); - - VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - - // calculate G - // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8] - // 298Y - 208V - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG1st_4x4); - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG2nd_4x4); - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG3rd_4x4); - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG4th_4x4); - // 34784 - 100U - ushort tmpG = 34784; - vxc_ushort8 tmpDstG; - VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); - VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4); - VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4); - VXC_DP4x4(G, C2, tmpDstG, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateG3rd_4x4); - VXC_DP4x4(G, C3, tmpDstG, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateG4th_4x4); - - // calculate B - // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8] - VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB1st_4x4); - VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB2nd_4x4); - VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB3rd_4x4); - VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB4th_4x4); - tmpV = -70688; - VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - - float4 paramData = (float4)(bMean * var, gMean * var,\ - rMean * var, var); - half4 paramData_f16; - _viv_asm(CONV, paramData_f16, paramData); - - VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8); - VXC_DP2x8(dst1, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8); - - VXC_DP2x8(dst2, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8); - VXC_DP2x8(dst3, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8); - - VXC_DP2x8(dst4, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8); - VXC_DP2x8(dst5, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8); - - _viv_asm(COPY, out0, dst0, 16); - _viv_asm(COPY, out1, dst1, 16); - _viv_asm(COPY, out2, dst2, 16); - _viv_asm(COPY, out3, 
dst3, 16); - _viv_asm(COPY, out4, dst4, 16); - _viv_asm(COPY, out5, dst5, 16); - - pos = (int4)(get_global_id(0), get_global_id(1), bOrder, get_global_id(0) + 8); - VXC_WriteImage2DArray(output, pos.xyzz, out0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage2DArray(output, pos.wyzz, out1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - pos.z = 1; - VXC_WriteImage2DArray(output, pos.xyzz, out2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage2DArray(output, pos.wyzz, out3, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - pos.z = rOrder; - VXC_WriteImage2DArray(output, pos.xyzz, out4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - VXC_WriteImage2DArray(output, pos.wyzz, out5, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -} - diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_0.vx new file mode 100644 index 0000000..40db137 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_0.vx @@ -0,0 +1,237 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniCalculateR1st_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8; +_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8; + +_viv_uniform VXC_512Bits uniCalculateB1st_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uniDescaleU8_4x4; + +_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4; +_viv_uniform VXC_512Bits uniCalculateGWise_4x4; +_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4; + +_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4; + +_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4; +_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4; +_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4; +_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4; + +_viv_uniform int bOrder; +_viv_uniform int rOrder; +_viv_uniform float output_zp; +_viv_uniform float output_scale; + +#define YUV420_SCALE_8BITS_SH_IMPL(name, dst_type) \ +__kernel void pre_process_yuv420_scale_##name \ + ( \ + __read_only image2d_array_t y_img, \ + __read_only image2d_array_t u_img, \ + __read_only image2d_array_t v_img, \ + __write_only image2d_array_t output, \ + global int * xRatio, \ + global int * yRatio, \ + global int * xOffset, \ + global int * yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float var, \ + int reverse_channel, \ + int trans \ + ) \ +{ \ + int4 gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + gidx += (int4)(0, 1, 2, 3); \ + \ + int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14); \ + int4 sx = fx & 0xffff8000; \ + int fy, sy; \ + fx -= sx; \ + sx = sx >> 15; \ + fx = (fx +(1 << 4)) >> 5; \ + \ + fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14); \ + sy = fy & 0xffff8000; \ + fy -= sy; \ + sy = sy >> 15; \ + \ + sy = sy < 0 ? 0 : sy; \ + fy = fy < 0 ? 
0 : fy; \ + \ + fy = (fy + (1<< 4)) >> 5; \ + sx += (*xOffset); \ + sy += (*yOffset); \ + int4 srcPos = (int4)(sx.x, sy, get_global_id(2), 0); \ + int4 srcPos1 = (int4)(sx.x >> 1, sy >> 1, get_global_id(2), 0); \ + int4 srcPos2 = (int4)(sx.x >> 1, (sy + 1) >> 1, get_global_id(2), 0); \ + \ + vxc_uchar16 Y, U, V; \ + vxc_int4 C0, C1, C2, C3; \ + vxc_uchar16 R, G, B; \ + \ + VXC_ReadImage(Y, y_img, srcPos, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + srcPos1.x = (sx.x + 1) >> 1; \ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \ + srcPos2.x = (sx.x + 1) >> 1; \ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + srcPos.x = sx.y; \ + srcPos1.x = sx.y >> 1; \ + srcPos2.x = sx.y >> 1; \ + VXC_ReadImage(Y, y_img, srcPos, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); \ + srcPos1.x = (sx.y + 1) >> 1; \ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); \ + srcPos2.x = (sx.y + 1) >> 1; \ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + srcPos.x = sx.z; \ + srcPos1.x = sx.z >> 1; \ + srcPos2.x = sx.z >> 1; \ + VXC_ReadImage(Y, y_img, srcPos, 0, VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0)); \ + srcPos1.x = (sx.z + 1) >> 1; \ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0)); \ + srcPos2.x = (sx.z + 1) >> 1; \ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0)); \ + \ + srcPos.x = sx.w; \ + srcPos1.x = sx.w >> 1; \ + srcPos2.x = sx.w >> 1; \ + VXC_ReadImage(Y, y_img, srcPos, 0, VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 
0)); \ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0)); \ + srcPos1.x = (sx.w + 1) >> 1; \ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0)); \ + srcPos2.x = (sx.w + 1) >> 1; \ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + int tmpV = -56992; \ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4); \ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4); \ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4); \ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4); \ + VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \ + VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \ + VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \ + VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \ + \ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4); \ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4); \ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4); \ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4); \ + \ + ushort tmpG = 34784; \ + vxc_ushort8 tmpDstG, tmpDstG1; \ + VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); \ + VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8); \ + VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); \ + VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); \ + VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); \ + VXC_DP4x4(G, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); \ + \ + VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4); \ + VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4); \ + VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4); \ + VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4); \ + tmpV = -70688; \ + VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ + VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ + VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ + VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, 
VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ + \ + int4 result, temp1, temp2; \ + int4 tmpData0, tmpData1; \ + \ + VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \ + VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \ + temp1 = fx * tmpData0 + tmpData1; \ + \ + VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \ + VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \ + temp2 = fx * tmpData0 + tmpData1; \ + result = fy * temp2 + (temp1 << 10); \ + \ + tmpV = 1 << 19; \ + dst_type dst; \ + float4 tmpDst; \ + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \ + tmpDst = (tmpDst - bMean) * var; \ + dstPos.z = bOrder; \ + result = convert_int4_rte(tmpDst * output_scale + output_zp); \ + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \ + VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \ + temp1 = fx * tmpData0 + tmpData1; \ + VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \ + VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \ + temp2 = fx * tmpData0 + tmpData1; \ + result = fy * temp2 + (temp1 << 10); \ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \ + tmpDst = (tmpDst - gMean) * var; \ + dstPos.z = 1; \ + result = convert_int4_rte(tmpDst * output_scale + output_zp); \ + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \ + VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \ + temp1 = fx * tmpData0 + tmpData1; \ + VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \ + VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \ + temp2 = fx * tmpData0 + tmpData1; \ + result = fy * temp2 + (temp1 << 10); \ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \ + tmpDst = (tmpDst - rMean) * var; \ + dstPos.z = rOrder; \ + result = convert_int4_rte(tmpDst * output_scale + output_zp); \ + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +YUV420_SCALE_8BITS_SH_IMPL(U8toU8, vxc_uchar8) +YUV420_SCALE_8BITS_SH_IMPL(U8toI8, vxc_char8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_1.vx new file mode 100644 index 0000000..7bfa6d1 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_1.vx @@ -0,0 +1,245 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniCalculateR1st_4x4; 
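For reference, the folded fixed-point constants that recur in the kernel comments above (-56992, +34784, -70688) come from the BT.601 video-range conversion with C = Y - 16, D = U - 128, E = V - 128: expanding 298*C + 409*E + 128, 298*C - 100*D - 208*E + 128 and 298*C + 516*D + 128 and collecting the constant terms yields exactly those offsets. Below is a minimal scalar sketch of the same arithmetic, for illustration only; it is not part of this patch, and the clamp_u8 helper is a hypothetical stand-in for the saturation that the kernels get from the VXC_MODIFIER saturate flag.

#include <stdint.h>

/* Illustrative scalar version of the BT.601 integer YUV->RGB math used by
 * the pre_process_* kernels. clamp_u8 is a hypothetical helper standing in
 * for the hardware saturation; it is not part of the patch. */
static uint8_t clamp_u8(int v)
{
    return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static void yuv_to_rgb_bt601(uint8_t y, uint8_t u, uint8_t v,
                             uint8_t *r, uint8_t *g, uint8_t *b)
{
    /* R = (298*C + 409*E + 128) >> 8         == (298*Y + 409*V - 56992) >> 8 */
    /* G = (298*C - 100*D - 208*E + 128) >> 8 == (298*Y - 100*U - 208*V + 34784) >> 8 */
    /* B = (298*C + 516*D + 128) >> 8         == (298*Y + 516*U - 70688) >> 8 */
    *r = clamp_u8((298 * y + 409 * v - 56992) >> 8);
    *g = clamp_u8((298 * y - 100 * u - 208 * v + 34784) >> 8);
    *b = clamp_u8((298 * y + 516 * u - 70688) >> 8);
}

In the kernels these three expressions are evaluated four pixels at a time via the uniCalculateTmp*/uniCalculate* dot-product tables, and the mean/var/scale/zero-point normalization is applied afterwards.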
+_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8; +_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8; + +_viv_uniform VXC_512Bits uniCalculateB1st_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uniDescaleU8_4x4; + +_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4; +_viv_uniform VXC_512Bits uniCalculateGWise_4x4; +_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4; + +_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4; +_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4; + +_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4; +_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4; +_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4; +_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4; + +_viv_uniform int bOrder; +_viv_uniform int rOrder; +_viv_uniform float output_scale; +_viv_uniform float output_zp; + +#define YUV420_SCALE_16BITS_SH_IMPL(name, dst_type, conv_type) \ +__kernel void pre_process_yuv420_scale_##name \ + ( \ + __read_only image2d_array_t y_img, \ + __read_only image2d_array_t u_img, \ + __read_only image2d_array_t v_img, \ + __write_only image2d_array_t output, \ + global int * xRatio, \ + global int * yRatio, \ + global int * xOffset, \ + global int * yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float var, \ + int reverse_channel, \ + int trans \ + ) \ +{ \ + int4 gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + gidx += (int4)(0, 1, 2, 3); \ + \ + int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14); \ + int4 sx = fx & 0xffff8000; \ + int fy, sy; \ + fx -= sx; \ + sx = sx >> 15; \ + fx = (fx +(1 << 4)) >> 5; \ + \ + fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14); \ + sy = fy & 0xffff8000; \ + fy -= sy; \ + sy = sy >> 15; \ + \ + sy = sy < 0 ? 0 : sy; \ + fy = fy < 0 ? 
0 : fy; \ + \ + fy = (fy + (1<< 4)) >> 5; \ + sx += (*xOffset); \ + sy += (*yOffset); \ + int4 srcPos = (int4)(sx.x, sy, get_global_id(2), 0); \ + int4 srcPos1 = (int4)(sx.x >> 1, sy >> 1, get_global_id(2), 0); \ + int4 srcPos2 = (int4)(sx.x >> 1, (sy + 1) >> 1, get_global_id(2), 0); \ + \ + vxc_uchar16 Y, U, V; \ + vxc_int4 C0, C1, C2, C3; \ + vxc_uchar16 R, G, B; \ + \ + VXC_ReadImage(Y, y_img, srcPos, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + srcPos1.x = (sx.x + 1) >> 1; \ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \ + srcPos2.x = (sx.x + 1) >> 1; \ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + srcPos.x = sx.y; \ + srcPos1.x = sx.y >> 1; \ + srcPos2.x = sx.y >> 1; \ + VXC_ReadImage(Y, y_img, srcPos, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); \ + srcPos1.x = (sx.y + 1) >> 1; \ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); \ + srcPos2.x = (sx.y + 1) >> 1; \ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + srcPos.x = sx.z; \ + srcPos1.x = sx.z >> 1; \ + srcPos2.x = sx.z >> 1; \ + VXC_ReadImage(Y, y_img, srcPos, 0, VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0)); \ + srcPos1.x = (sx.z + 1) >> 1; \ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0)); \ + srcPos2.x = (sx.z + 1) >> 1; \ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0)); \ + \ + srcPos.x = sx.w; \ + srcPos1.x = sx.w >> 1; \ + srcPos2.x = sx.w >> 1; \ + VXC_ReadImage(Y, y_img, srcPos, 0, VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 
0)); \ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0)); \ + srcPos1.x = (sx.w + 1) >> 1; \ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0)); \ + srcPos2.x = (sx.w + 1) >> 1; \ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0)); \ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0)); \ + \ + int tmpV = -56992; \ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4); \ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4); \ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4); \ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4); \ + VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \ + VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \ + VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \ + VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \ + \ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4); \ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4); \ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4); \ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4); \ + \ + ushort tmpG = 34784; \ + vxc_ushort8 tmpDstG, tmpDstG1; \ + VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); \ + VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8); \ + VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); \ + VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); \ + VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); \ + VXC_DP4x4(G, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); \ + \ + VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4); \ + VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4); \ + VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4); \ + VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4); \ + tmpV = -70688; \ + VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ + VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ + VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ + VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, 
VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \ + \ + int4 result, temp1, temp2; \ + int4 tmpData0, tmpData1; \ + dst_type tmpResult; \ + conv_type tmpVal; \ + \ + VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \ + VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \ + temp1 = fx * tmpData0 + tmpData1; \ + \ + VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \ + VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \ + temp2 = fx * tmpData0 + tmpData1; \ + result = fy * temp2 + (temp1 << 10); \ + \ + tmpV = 1 << 19; \ + vxc_short8 dst; \ + float4 tmpDst; \ + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \ + tmpDst = (tmpDst - bMean) * var; \ + dstPos.z = bOrder; \ + tmpDst = tmpDst * output_scale + output_zp; \ + _viv_asm(CONV_RTE, tmpVal, tmpDst); \ + VXC_DP2x8(tmpResult, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, tmpResult, 8); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \ + VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \ + temp1 = fx * tmpData0 + tmpData1; \ + VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \ + VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \ + temp2 = fx * tmpData0 + tmpData1; \ + result = fy * temp2 + (temp1 << 10); \ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \ + tmpDst = (tmpDst - gMean) * var; \ + dstPos.z = 1; \ + tmpDst = tmpDst * output_scale + output_zp; \ + _viv_asm(CONV_RTE, tmpVal, tmpDst); \ + VXC_DP2x8(tmpResult, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, tmpResult, 8); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \ + VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \ + temp1 = fx * tmpData0 + tmpData1; \ + VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \ + VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \ + temp2 = fx * tmpData0 + tmpData1; \ + result = fy * temp2 + (temp1 << 10); \ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \ + tmpDst = (tmpDst - rMean) * var; \ + dstPos.z = rOrder; \ + tmpDst = tmpDst * output_scale + output_zp; \ + _viv_asm(CONV_RTE, tmpVal, tmpDst); \ + VXC_DP2x8(tmpResult, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, tmpResult, 8); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +YUV420_SCALE_16BITS_SH_IMPL(U8toF16, vxc_half8, half4) +YUV420_SCALE_16BITS_SH_IMPL(U8toI16, vxc_short8, int4) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_fp16.vx 
b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_fp16.vx deleted file mode 100644 index 9d4e331..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_fp16.vx +++ /dev/null @@ -1,232 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits uniCalculateR1st_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8; -_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8; - -_viv_uniform VXC_512Bits uniCalculateB1st_4x4; -_viv_uniform VXC_512Bits uniDescaleU8_4x4; - -_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4; -_viv_uniform VXC_512Bits uniCalculateGWise_4x4; -_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4; - -_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4; - -_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4; -_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4; -_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4; -_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4; - -_viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8; - -_viv_uniform int bOrder; -_viv_uniform int rOrder; - -__kernel void pre_process_yuv420_scale_U8toF16( - __read_only image2d_array_t y_img, __read_only image2d_array_t u_img, - __read_only image2d_array_t v_img, __write_only image2d_array_t output, - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) -{ - int4 gidx = get_global_id(0); - int gidy = get_global_id(1); - gidx += (int4)(0, 1, 2, 3); - - int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14); - int4 sx = fx & 0xffff8000; // Floor - int fy, sy; - fx -= sx; - sx = sx >> 15; - fx = (fx +(1 << 4)) >> 5; - - // for y - fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14); - sy = fy & 0xffff8000; // Floor - fy -= sy; - sy = sy >> 15; - - sy = sy < 0 ? 0 : sy; - fy = fy < 0 ? 
0 : fy; - - fy = (fy + (1<< 4)) >> 5; - sx += (*xOffset); - sy += (*yOffset); - int4 srcPos = (int4)(sx.x, sy, get_global_id(2), 0); - int4 srcPos1 = (int4)(sx.x >> 1, sy >> 1, get_global_id(2), 0); - int4 srcPos2 = (int4)(sx.x >> 1, (sy + 1) >> 1, get_global_id(2), 0); - - vxc_uchar16 Y, U, V; - vxc_int4 C0, C1, C2, C3; - vxc_uchar16 R, G, B; - - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - srcPos1.x = (sx.x + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - srcPos2.x = (sx.x + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - - srcPos.x = sx.y; - srcPos1.x = sx.y >> 1; - srcPos2.x = sx.y >> 1; - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); - srcPos1.x = (sx.y + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); - srcPos2.x = (sx.y + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); - - srcPos.x = sx.z; - srcPos1.x = sx.z >> 1; - srcPos2.x = sx.z >> 1; - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0)); - srcPos1.x = (sx.z + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, 
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0)); - srcPos2.x = (sx.z + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0)); - - srcPos.x = sx.w; - srcPos1.x = sx.w >> 1; - srcPos2.x = sx.w >> 1; - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0)); - srcPos1.x = (sx.w + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0)); - srcPos2.x = (sx.w + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0)); - - //C = Y - 16; D = U - 128; E = V - 128; - // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8] - int tmpV = -56992; - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4); - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4); - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4); - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4); - VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - - // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8] - // 298Y - 208V - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4); - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4); - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4); - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4); - // 34784 - 100U - ushort tmpG = 34784; - vxc_ushort8 tmpDstG, tmpDstG1; - VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); - VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8); - VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); - VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); - VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); - VXC_DP4x4(G, C3, tmpDstG1, 
VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); - - // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8] - VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4); - VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4); - VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4); - VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4); - tmpV = -70688; - VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - - int4 result, temp1, temp2; - int4 tmpData0, tmpData1; - - VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); - VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); - temp1 = fx * tmpData0 + tmpData1; - // temp2 - temp1 - VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); - VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); - temp2 = fx * tmpData0 + tmpData1; - result = fy * temp2 + (temp1 << 10); - - vxc_half8 tmpVal; - half4 hDst; - tmpV = 1 << 19; - vxc_short8 dst; - float4 tmpDst; - int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); - tmpDst = (tmpDst - bMean) * var; - dstPos.z = bOrder; - _viv_asm(CONV, hDst, tmpDst); - VXC_DP2x8(tmpVal, hDst, hDst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); - _viv_asm(COPY, dst, tmpVal, 16); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); - VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); - temp1 = fx * tmpData0 + tmpData1; - VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); - VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); - temp2 = fx * tmpData0 + tmpData1; - result = fy * temp2 + (temp1 << 10); - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); - tmpDst = (tmpDst - gMean) * var; - dstPos.z = 1; - _viv_asm(CONV, hDst, tmpDst); - VXC_DP2x8(tmpVal, hDst, hDst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); - _viv_asm(COPY, dst, tmpVal, 16); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); - VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); - temp1 = fx * tmpData0 + tmpData1; - VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); - VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); - temp2 = fx * tmpData0 + tmpData1; - result = fy * temp2 + (temp1 << 10); - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, 
VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); - tmpDst = (tmpDst - rMean) * var; - dstPos.z = rOrder; - _viv_asm(CONV, hDst, tmpDst); - VXC_DP2x8(tmpVal, hDst, hDst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8); - _viv_asm(COPY, dst, tmpVal, 16); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_i16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_i16.vx deleted file mode 100644 index 8bc4c0b..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_i16.vx +++ /dev/null @@ -1,227 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits uniCalculateR1st_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8; -_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8; - -_viv_uniform VXC_512Bits uniCalculateB1st_4x4; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; -_viv_uniform VXC_512Bits uniDescaleU8_4x4; - -_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4; -_viv_uniform VXC_512Bits uniCalculateGWise_4x4; -_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4; - -_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4; - -_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4; -_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4; -_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4; -_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4; - -_viv_uniform int bOrder; -_viv_uniform int rOrder; -_viv_uniform float outputScale; - -__kernel void pre_process_yuv420_scale_U8toI16( - __read_only image2d_array_t y_img, __read_only image2d_array_t u_img, - __read_only image2d_array_t v_img, __write_only image2d_array_t output, - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) -{ - int4 gidx = get_global_id(0); - int gidy = get_global_id(1); - gidx += (int4)(0, 1, 2, 3); - - int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14); - int4 sx = fx & 0xffff8000; // Floor - int fy, sy; - fx -= sx; - sx = sx >> 15; - fx = (fx +(1 << 4)) >> 5; - - // for y - fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14); - sy = fy & 0xffff8000; // Floor - fy -= sy; - sy = sy >> 15; - - sy = sy < 0 ? 0 : sy; - fy = fy < 0 ? 
0 : fy; - - fy = (fy + (1<< 4)) >> 5; - sx += (*xOffset); - sy += (*yOffset); - int4 srcPos = (int4)(sx.x, sy, get_global_id(2), 0); - int4 srcPos1 = (int4)(sx.x >> 1, sy >> 1, get_global_id(2), 0); - int4 srcPos2 = (int4)(sx.x >> 1, (sy + 1) >> 1, get_global_id(2), 0); - - vxc_uchar16 Y, U, V; - vxc_int4 C0, C1, C2, C3; - vxc_uchar16 R, G, B; - - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - srcPos1.x = (sx.x + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - srcPos2.x = (sx.x + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - - srcPos.x = sx.y; - srcPos1.x = sx.y >> 1; - srcPos2.x = sx.y >> 1; - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); - srcPos1.x = (sx.y + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); - srcPos2.x = (sx.y + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); - - srcPos.x = sx.z; - srcPos1.x = sx.z >> 1; - srcPos2.x = sx.z >> 1; - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0)); - srcPos1.x = (sx.z + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, 
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0)); - srcPos2.x = (sx.z + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0)); - - srcPos.x = sx.w; - srcPos1.x = sx.w >> 1; - srcPos2.x = sx.w >> 1; - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0)); - srcPos1.x = (sx.w + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0)); - srcPos2.x = (sx.w + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0)); - - //C = Y - 16; D = U - 128; E = V - 128; - // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8] - int tmpV = -56992; - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4); - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4); - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4); - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4); - VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - - // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8] - // 298Y - 208V - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4); - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4); - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4); - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4); - // 34784 - 100U - ushort tmpG = 34784; - vxc_ushort8 tmpDstG, tmpDstG1; - VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); - VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8); - VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); - VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); - VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); - VXC_DP4x4(G, C3, tmpDstG1, 
VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); - - // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8] - VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4); - VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4); - VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4); - VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4); - tmpV = -70688; - VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - - int4 result, temp1, temp2; - int4 tmpData0, tmpData1; - - VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); - VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); - temp1 = fx * tmpData0 + tmpData1; - // temp2 - temp1 - VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); - VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); - temp2 = fx * tmpData0 + tmpData1; - result = fy * temp2 + (temp1 << 10); - - tmpV = 1 << 19; - vxc_short8 dst; - float4 tmpDst; - int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); - tmpDst = (tmpDst - bMean) * var; - dstPos.z = bOrder; - result = convert_int4_rte(tmpDst * outputScale); - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); - VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); - temp1 = fx * tmpData0 + tmpData1; - VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); - VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); - temp2 = fx * tmpData0 + tmpData1; - result = fy * temp2 + (temp1 << 10); - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); - tmpDst = (tmpDst - gMean) * var; - dstPos.z = 1; - result = convert_int4_rte(tmpDst * outputScale); - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); - VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); - temp1 = fx * tmpData0 + tmpData1; - VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); - VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); - temp2 = fx * tmpData0 + tmpData1; - result = fy * temp2 + (temp1 << 10); - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); - tmpDst = 
(tmpDst - rMean) * var; - dstPos.z = rOrder; - result = convert_int4_rte(tmpDst * outputScale); - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_i8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_i8.vx deleted file mode 100644 index d3150b0..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_i8.vx +++ /dev/null @@ -1,227 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits uniCalculateR1st_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8; -_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8; - -_viv_uniform VXC_512Bits uniCalculateB1st_4x4; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; -_viv_uniform VXC_512Bits uniDescaleU8_4x4; - -_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4; -_viv_uniform VXC_512Bits uniCalculateGWise_4x4; -_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4; - -_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4; - -_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4; -_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4; -_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4; -_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4; - -_viv_uniform int bOrder; -_viv_uniform int rOrder; -_viv_uniform float outputScale; - -__kernel void pre_process_yuv420_scale_U8toI8( - __read_only image2d_array_t y_img, __read_only image2d_array_t u_img, - __read_only image2d_array_t v_img, __write_only image2d_array_t output, - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) -{ - int4 gidx = get_global_id(0); - int gidy = get_global_id(1); - gidx += (int4)(0, 1, 2, 3); - - int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14); - int4 sx = fx & 0xffff8000; // Floor - int fy, sy; - fx -= sx; - sx = sx >> 15; - fx = (fx +(1 << 4)) >> 5; - - // for y - fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14); - sy = fy & 0xffff8000; // Floor - fy -= sy; - sy = sy >> 15; - - sy = sy < 0 ? 0 : sy; - fy = fy < 0 ? 
0 : fy; - - fy = (fy + (1<< 4)) >> 5; - sx += (*xOffset); - sy += (*yOffset); - int4 srcPos = (int4)(sx.x, sy, get_global_id(2), 0); - int4 srcPos1 = (int4)(sx.x >> 1, sy >> 1, get_global_id(2), 0); - int4 srcPos2 = (int4)(sx.x >> 1, (sy + 1) >> 1, get_global_id(2), 0); - - vxc_uchar16 Y, U, V; - vxc_int4 C0, C1, C2, C3; - vxc_uchar16 R, G, B; - - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - srcPos1.x = (sx.x + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - srcPos2.x = (sx.x + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - - srcPos.x = sx.y; - srcPos1.x = sx.y >> 1; - srcPos2.x = sx.y >> 1; - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); - srcPos1.x = (sx.y + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); - srcPos2.x = (sx.y + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); - - srcPos.x = sx.z; - srcPos1.x = sx.z >> 1; - srcPos2.x = sx.z >> 1; - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0)); - srcPos1.x = (sx.z + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, 
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0)); - srcPos2.x = (sx.z + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0)); - - srcPos.x = sx.w; - srcPos1.x = sx.w >> 1; - srcPos2.x = sx.w >> 1; - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0)); - srcPos1.x = (sx.w + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0)); - srcPos2.x = (sx.w + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0)); - - //C = Y - 16; D = U - 128; E = V - 128; - // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8] - int tmpV = -56992; - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4); - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4); - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4); - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4); - VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - - // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8] - // 298Y - 208V - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4); - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4); - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4); - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4); - // 34784 - 100U - ushort tmpG = 34784; - vxc_ushort8 tmpDstG, tmpDstG1; - VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); - VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8); - VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); - VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); - VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); - VXC_DP4x4(G, C3, tmpDstG1, 
VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); - - // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8] - VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4); - VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4); - VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4); - VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4); - tmpV = -70688; - VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - - int4 result, temp1, temp2; - int4 tmpData0, tmpData1; - - VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); - VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); - temp1 = fx * tmpData0 + tmpData1; - // temp2 - temp1 - VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); - VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); - temp2 = fx * tmpData0 + tmpData1; - result = fy * temp2 + (temp1 << 10); - - tmpV = 1 << 19; - vxc_char8 dst; - float4 tmpDst; - int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); - tmpDst = (tmpDst - bMean) * var; - dstPos.z = bOrder; - result = convert_int4_rte(tmpDst * outputScale); - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); - VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); - temp1 = fx * tmpData0 + tmpData1; - VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); - VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); - temp2 = fx * tmpData0 + tmpData1; - result = fy * temp2 + (temp1 << 10); - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); - tmpDst = (tmpDst - gMean) * var; - dstPos.z = 1; - result = convert_int4_rte(tmpDst * outputScale); - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); - VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); - temp1 = fx * tmpData0 + tmpData1; - VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); - VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); - temp2 = fx * tmpData0 + tmpData1; - result = fy * temp2 + (temp1 << 10); - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); - tmpDst = 
(tmpDst - rMean) * var; - dstPos.z = rOrder; - result = convert_int4_rte(tmpDst * outputScale); - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_u8.vx deleted file mode 100644 index 6a0340b..0000000 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv420_scale_u8.vx +++ /dev/null @@ -1,228 +0,0 @@ -#include "cl_viv_vx_ext.h" - -_viv_uniform VXC_512Bits uniCalculateR1st_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8; -_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8; - -_viv_uniform VXC_512Bits uniCalculateB1st_4x4; -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; -_viv_uniform VXC_512Bits uniDescaleU8_4x4; - -_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4; -_viv_uniform VXC_512Bits uniCalculateGWise_4x4; -_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4; - -_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4; -_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4; - -_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4; -_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4; -_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4; -_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4; - -_viv_uniform int bOrder; -_viv_uniform int rOrder; -_viv_uniform int zp; -_viv_uniform float outputScale; - -__kernel void pre_process_yuv420_scale_U8toU8( - __read_only image2d_array_t y_img, __read_only image2d_array_t u_img, - __read_only image2d_array_t v_img, __write_only image2d_array_t output, - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset, - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans) -{ - int4 gidx = get_global_id(0); - int gidy = get_global_id(1); - gidx += (int4)(0, 1, 2, 3); - - int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14); - int4 sx = fx & 0xffff8000; // Floor - int fy, sy; - fx -= sx; - sx = sx >> 15; - fx = (fx +(1 << 4)) >> 5; - - // for y - fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14); - sy = fy & 0xffff8000; // Floor - fy -= sy; - sy = sy >> 15; - - sy = sy < 0 ? 0 : sy; - fy = fy < 0 ? 
0 : fy; - - fy = (fy + (1<< 4)) >> 5; - sx += (*xOffset); - sy += (*yOffset); - int4 srcPos = (int4)(sx.x, sy, get_global_id(2), 0); - int4 srcPos1 = (int4)(sx.x >> 1, sy >> 1, get_global_id(2), 0); - int4 srcPos2 = (int4)(sx.x >> 1, (sy + 1) >> 1, get_global_id(2), 0); - - vxc_uchar16 Y, U, V; - vxc_int4 C0, C1, C2, C3; - vxc_uchar16 R, G, B; - - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); - srcPos1.x = (sx.x + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); - srcPos2.x = (sx.x + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); - - srcPos.x = sx.y; - srcPos1.x = sx.y >> 1; - srcPos2.x = sx.y >> 1; - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); - srcPos1.x = (sx.y + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); - srcPos2.x = (sx.y + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); - - srcPos.x = sx.z; - srcPos1.x = sx.z >> 1; - srcPos2.x = sx.z >> 1; - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0)); - srcPos1.x = (sx.z + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, 
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0)); - srcPos2.x = (sx.z + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0)); - - srcPos.x = sx.w; - srcPos1.x = sx.w >> 1; - srcPos2.x = sx.w >> 1; - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0)); - srcPos1.x = (sx.w + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0)); - - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0)); - srcPos2.x = (sx.w + 1) >> 1; - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0)); - - //C = Y - 16; D = U - 128; E = V - 128; - // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8] - int tmpV = -56992; - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4); - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4); - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4); - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4); - VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); - - // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8] - // 298Y - 208V - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4); - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4); - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4); - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4); - // 34784 - 100U - ushort tmpG = 34784; - vxc_ushort8 tmpDstG, tmpDstG1; - VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); - VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8); - VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); - VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); - VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); - VXC_DP4x4(G, C3, tmpDstG1, 
VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); - - // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8] - VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4); - VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4); - VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4); - VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4); - tmpV = -70688; - VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); - - int4 result, temp1, temp2; - int4 tmpData0, tmpData1; - - VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); - VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); - temp1 = fx * tmpData0 + tmpData1; - // temp2 - temp1 - VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); - VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); - temp2 = fx * tmpData0 + tmpData1; - result = fy * temp2 + (temp1 << 10); - - tmpV = 1 << 19; - vxc_uchar8 dst; - float4 tmpDst; - int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); - tmpDst = (tmpDst - bMean) * var; - dstPos.z = bOrder; - result = convert_int4_rte(tmpDst * outputScale + zp); - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); - VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); - temp1 = fx * tmpData0 + tmpData1; - VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); - VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); - temp2 = fx * tmpData0 + tmpData1; - result = fy * temp2 + (temp1 << 10); - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); - tmpDst = (tmpDst - gMean) * var; - dstPos.z = 1; - result = convert_int4_rte(tmpDst * outputScale + zp); - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); - - VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); - VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); - temp1 = fx * tmpData0 + tmpData1; - VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); - VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); - temp2 = fx * tmpData0 + tmpData1; - result = fy * temp2 + (temp1 << 10); - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); - 
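For reference, the dot-product uniforms in this (removed) kernel encode the standard BT.601 expansion spelled out in the comments above (C = Y - 16, D = U - 128, E = V - 128). A plain-C sketch of those fixed-point formulas, illustrative only and not part of the patch (clamp8 is a hypothetical helper):

#include <stdint.h>

static uint8_t clamp8(int v) { return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v)); }

/* Reference-only restatement of the fixed-point YUV -> RGB math the uniforms encode. */
static void yuv_to_rgb_bt601(uint8_t Y, uint8_t U, uint8_t V,
                             uint8_t *r, uint8_t *g, uint8_t *b)
{
    int C = Y - 16, D = U - 128, E = V - 128;
    *r = clamp8((298 * C + 409 * E + 128) >> 8);            /* == (298*Y + 409*V - 56992) >> 8        */
    *g = clamp8((298 * C - 100 * D - 208 * E + 128) >> 8);  /* == (298*Y - 100*U - 208*V + 34784) >> 8 */
    *b = clamp8((298 * C + 516 * D + 128) >> 8);            /* == (298*Y + 516*U - 70688) >> 8        */
}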
tmpDst = (tmpDst - rMean) * var; - dstPos.z = rOrder; - result = convert_int4_rte(tmpDst * outputScale + zp); - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8); - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); -} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_copy.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_copy.vx new file mode 100644 index 0000000..f63e65c --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_copy.vx @@ -0,0 +1,88 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int bOrder; +_viv_uniform int rOrder; + +_viv_uniform float outputScaleVar; +_viv_uniform float bMeanScaleVarZp; +_viv_uniform float gMeanScaleVarZp; +_viv_uniform float rMeanScaleVarZp; + +_viv_uniform VXC_512Bits uniConvertYUV422toB_4x4; +_viv_uniform VXC_512Bits uniConvertYUV422toG_4x4; +_viv_uniform VXC_512Bits uniConvertYUV422toR_4x4; + +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uniExtractUVtoCharSub128_2x8; + +#define YUV422_COPY_SH_IMPL(name, dst_type, conv_type, save_type, copy_bytes) \ +__kernel void pre_process_yuv422_copy_##name \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + global int* xRatio, \ + global int* yRatio, \ + global int* xOffset, \ + global int* yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float var, \ + int reverse_channel, \ + int trans, \ + int yuv422_type \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + \ + int sy = gidy + (*yOffset); \ + int sx = gidx + (*xOffset * 2); \ + \ + vxc_uchar8 YUV; \ + vxc_short8 tmpYUV; \ + \ + VXC_ReadImage(YUV, input, (int2)(sx,sy), 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + if (yuv422_type == 1) \ + { \ + YUV.s01234567 = YUV.s10325476; \ + } \ +\ + short tmpVal = 128; \ + VXC_DP2x8(tmpYUV, YUV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractUVtoCharSub128_2x8); \ + \ + float4 tmpDstB, tmpDstG, tmpDstR; \ + VXC_DP4x4(tmpDstB, tmpYUV, tmpYUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toB_4x4); \ + VXC_DP4x4(tmpDstG, tmpYUV, tmpYUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toG_4x4); \ + VXC_DP4x4(tmpDstR, tmpYUV, tmpYUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toR_4x4); \ + \ + conv_type result; \ + dst_type dst0; \ + save_type dst; \ + int4 dstPos = (int4)(gidx, gidy, 0, 0); \ + tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \ + _viv_asm(CONV_RTE, result, tmpDstB); \ + dstPos.z = bOrder; \ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst0, copy_bytes); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \ + _viv_asm(CONV_RTE, result, tmpDstG); \ + dstPos.z = 1; \ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst0, copy_bytes); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \ + _viv_asm(CONV_RTE, result, tmpDstR); \ + dstPos.z = rOrder; \ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst0, 
copy_bytes); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +YUV422_COPY_SH_IMPL(U8toU8, vxc_uchar4, int4, vxc_uchar4, 4) +YUV422_COPY_SH_IMPL(U8toI8, vxc_char4, int4, vxc_char4, 4) +YUV422_COPY_SH_IMPL(U8toI16, vxc_short4, int4, vxc_short4, 8) +YUV422_COPY_SH_IMPL(U8toF16, vxc_half4, half4, vxc_short4, 8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_scale.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_scale.vx new file mode 100644 index 0000000..ff85f8e --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_yuv422_scale.vx @@ -0,0 +1,132 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform int bOrder; +_viv_uniform int rOrder; + +_viv_uniform float outputScaleVar; +_viv_uniform float bMeanScaleVarZp; +_viv_uniform float gMeanScaleVarZp; +_viv_uniform float rMeanScaleVarZp; + +_viv_uniform uint xrIntFloat_16; +_viv_uniform uint yrIntFloat_16; + +_viv_uniform VXC_512Bits uniConvertYUV422toB_4x4; +_viv_uniform VXC_512Bits uniConvertYUV422toG_4x4; +_viv_uniform VXC_512Bits uniConvertYUV422toR_4x4; + +_viv_uniform VXC_512Bits uniExtract8Data_2x8; +_viv_uniform VXC_512Bits uniExtractUVtoCharSub128_2x8; + +#define uyvy422 1 + +#define YUV422_SH_IMPL(name, dst_type, conv_type, save_type, copy_bytes) \ +__kernel void pre_process_yuv422_scale_##name \ + ( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + global int* xRatio, \ + global int* yRatio, \ + global int* xOffset, \ + global int* yOffset, \ + float rMean, \ + float gMean, \ + float bMean, \ + float var, \ + int reverse_channel, \ + int trans, \ + int yuv422_type \ + ) \ +{ \ + int4 gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + gidx += (int4)(0, 1, 2, 3); \ + \ + uint dy = (convert_uint(gidy) * yrIntFloat_16) >> 16; \ + uint4 dx = (convert_uint4(gidx) * xrIntFloat_16) >> 16; \ + int sy = convert_int(dy) + (*yOffset); \ + int4 sx = convert_int4(dx)+ (*xOffset * 2); \ + \ + vxc_uchar4 Y; \ + vxc_uchar8 UV; \ + vxc_char8 tmpUV; \ + short tmpVal = 128; \ + int y_offset = 0; \ + int u_offset = 1; \ + int v_offset = 3; \ +\ + if (yuv422_type == uyvy422) \ + { \ + y_offset = 1; \ + u_offset = 0; \ + v_offset = 2; \ + } \ +\ + int4 coord_Y = (int4)(sx.x * 2 + y_offset, sy, 0, 0); \ + int4 coord_U = (int4)((sx.x >> 1) * 4 + u_offset, sy, 0, 0); \ + int4 coord_V = (int4)((sx.x >> 1) * 4 + v_offset, sy, 0, 0); \ +\ + VXC_ReadImage2DArray(Y, input, coord_Y, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + coord_Y.x = sx.y * 2 + y_offset; \ + VXC_ReadImage2DArray(Y, input, coord_Y, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_Y.x = sx.z * 2 + y_offset; \ + VXC_ReadImage2DArray(Y, input, coord_Y, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \ + coord_Y.x = sx.w * 2 + y_offset; \ + VXC_ReadImage2DArray(Y, input, coord_Y, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + sx = (sx >> 1) * 4 + u_offset; \ + VXC_ReadImage2DArray(UV, input, coord_U, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \ + coord_U.x = sx.y; \ + VXC_ReadImage2DArray(UV, input, coord_U, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \ + coord_U.x = sx.z; \ + VXC_ReadImage2DArray(UV, input, coord_U, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \ + coord_U.x = sx.w; \ + VXC_ReadImage2DArray(UV, input, coord_U, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \ +\ + sx = sx - u_offset + v_offset; \ + VXC_ReadImage2DArray(UV, input, coord_V, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); \ + 
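The coordinate math above assumes the two packed YUV422 byte orders (YUYV: Y0 U Y1 V, UYVY: U Y0 V Y1), so Y sits at x*2 + y_offset and the shared U/V pair at (x>>1)*4 + u_offset / v_offset. A small illustrative helper in plain C, not part of the patch, that mirrors that indexing:

/* Illustrative only: fetch one pixel's Y/U/V bytes from a packed YUV422 row. */
static void yuv422_sample(const unsigned char *row, int x, int is_uyvy,
                          unsigned char *Y, unsigned char *U, unsigned char *V)
{
    int y_off = is_uyvy ? 1 : 0;
    int u_off = is_uyvy ? 0 : 1;
    int v_off = is_uyvy ? 2 : 3;
    int pair  = (x >> 1) * 4;   /* each 4-byte group carries the U/V pair for two pixels */
    *Y = row[x * 2 + y_off];
    *U = row[pair + u_off];
    *V = row[pair + v_off];
}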
coord_V.x = sx.y; \ + VXC_ReadImage2DArray(UV, input, coord_V, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); \ + coord_V.x = sx.z; \ + VXC_ReadImage2DArray(UV, input, coord_V, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); \ + coord_V.x = sx.w; \ + VXC_ReadImage2DArray(UV, input, coord_V, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractUVtoCharSub128_2x8); \ + vxc_uchar4 dst_test; \ + VXC_DP2x8(dst_test, dx, dx, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \ +\ + float4 tmpDstB, tmpDstG, tmpDstR; \ + VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toB_4x4); \ + VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toG_4x4); \ + VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toR_4x4); \ + \ + conv_type result; \ + dst_type dst0; \ + save_type dst; \ + int4 dstPos = (int4)(gidx.x, gidy, 0, 0); \ + tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \ + _viv_asm(CONV_RTE, result, tmpDstB); \ + dstPos.z = bOrder; \ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst0, copy_bytes); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \ + _viv_asm(CONV_RTE, result, tmpDstG); \ + dstPos.z = 1; \ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst0, copy_bytes); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + \ + tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \ + _viv_asm(CONV_RTE, result, tmpDstR); \ + dstPos.z = rOrder; \ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \ + _viv_asm(COPY, dst, dst0, copy_bytes); \ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} + +YUV422_SH_IMPL(U8toU8, vxc_uchar4, int4, vxc_uchar4, 4) +YUV422_SH_IMPL(U8toI8, vxc_char4, int4, vxc_char4, 4) +YUV422_SH_IMPL(U8toI16, vxc_short4, int4, vxc_short4, 8) +YUV422_SH_IMPL(U8toF16, vxc_half4, half4, vxc_short4, 8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/select.vx b/src/tim/vx/internal/src/libnnext/ops/vx/select.vx index ce788a4..5f72ad1 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/select.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/select.vx @@ -12,15 +12,15 @@ _viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp vxc_ushort8 mp0, mp1; \ _viv_asm(COPY, mp0, multAndoutZP0, 16); \ _viv_asm(COPY, mp1, multAndoutZP1, 16); \ - read_fun(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), \ + read_fun(src0, input0, coord, 0, \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ - read_fun(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), \ + read_fun(src1, input1, coord, 0, \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ uniU8MulAndPostShift0_Lo_2x8); \ VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ uniU8MulAndPostShift1_Lo_2x8); \ - read_fun(value_tmp, condition, coord, VXC_5BITOFFSET_XY(0, 0), \ + read_fun(value_tmp, condition, coord, 0, \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ VXC_DP2x8(value, value_tmp, value_tmp,\ VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 0), uniConvConditiontoDst_2x8); \ @@ -60,11 +60,11 @@ SELECT_INT_FUN_2D(I8, I16, I16, vxc_short8) #define SELECT_HALF(read_fun, write_fun) \ vxc_short8 src0, src1, dst, value; \ vxc_char8 value_tmp; \ - read_fun(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), \ + read_fun(src0, input0, coord, 0, \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ - read_fun(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), \ + read_fun(src1, input1, coord, 0, \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ - read_fun(value_tmp, condition, coord, VXC_5BITOFFSET_XY(0, 0), \ + read_fun(value_tmp, condition, coord, 0, \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ VXC_DP2x8(value, value_tmp, value_tmp,\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvConditiontoDst_2x8); \ @@ -91,37 +91,36 @@ __kernel void select_I8_F16_F16toF16_2D( SELECT_HALF(VXC_ReadImage, VXC_WriteImage) } -#define SELECT_HYBRIDTOF16(src0_type, copy0_type, src1_type, copy1_type, read_fun, write_fun) \ - vxc_short8 src0, src1, dst, value; \ - vxc_half8 value0, value1; \ - src0_type r0; \ - src1_type r1; \ +#define SELECT_HYBRID(src0_type, copy0_type, src1_type, copy1_type, dst_type, save_type, read_fun, write_fun) \ + save_type dst, value; \ + save_type dst0, dst1; \ + dst_type value0, value1; \ + src0_type src0; \ + src1_type src1; \ copy0_type v0; \ copy1_type v1; \ vxc_char8 value_tmp; \ vxc_ushort8 mp0, mp1; \ _viv_asm(COPY, mp0, multAndoutZP0, 16); \ _viv_asm(COPY, mp1, multAndoutZP1, 16); \ - read_fun(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + read_fun(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ _viv_asm(COPY, v0, src0, 16); \ - read_fun(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), \ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + read_fun(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ _viv_asm(COPY, v1, src1, 16); \ VXC_DP2x8(value0, v0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ uniU8MulAndPostShift0_Lo_2x8); \ - _viv_asm(COPY, src0, value0, 16); \ + _viv_asm(COPY, dst0, value0, 16); \ VXC_DP2x8(value1, v1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ uniU8MulAndPostShift1_Lo_2x8); \ - _viv_asm(COPY, src1, value1, 16); \ - read_fun(value_tmp, condition, coord, VXC_5BITOFFSET_XY(0, 0), \ + _viv_asm(COPY, dst1, value1, 16); \ + read_fun(value_tmp, condition, coord, 0, \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ VXC_DP2x8(value, value_tmp, value_tmp,\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvConditiontoDst_2x8); \ - dst = (value != 0 ? src0 : src1); \ + dst = (value != 0 ? 
dst0 : dst1); \ write_fun(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); -#define SELECT_HYBRID_TOF16_FUN(name, src0_type, copy0_type, src1_type, copy1_type) \ +#define SELECT_HYBRID_FUN(name, src0_type, copy0_type, src1_type, copy1_type, dst_type, save_type) \ __kernel void select_##name( \ __read_only image2d_array_t condition, \ __read_only image2d_array_t input0, \ @@ -129,44 +128,62 @@ __kernel void select_##name( \ __write_only image2d_array_t output) \ { \ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ - SELECT_HYBRIDTOF16(src0_type, copy0_type, src1_type, copy1_type, \ + SELECT_HYBRID(src0_type, copy0_type, src1_type, copy1_type, dst_type, save_type,\ VXC_ReadImage2DArray, VXC_WriteImage2DArray) \ } -SELECT_HYBRID_TOF16_FUN(I8_F16_U8toF16, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16) -SELECT_HYBRID_TOF16_FUN(I8_U8_F16toF16, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8) -SELECT_HYBRID_TOF16_FUN(I8_F16_I8toF16, vxc_short8, vxc_half8, vxc_char16, vxc_char16) -SELECT_HYBRID_TOF16_FUN(I8_I8_F16toF16, vxc_char16, vxc_char16, vxc_short8, vxc_half8) -SELECT_HYBRID_TOF16_FUN(I8_F16_I16toF16, vxc_short8, vxc_half8, vxc_short8, vxc_short8) -SELECT_HYBRID_TOF16_FUN(I8_I16_F16toF16, vxc_short8, vxc_short8, vxc_short8, vxc_half8) +SELECT_HYBRID_FUN(I8_F16_U8toF16, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16, vxc_half8, vxc_short8) +SELECT_HYBRID_FUN(I8_U8_F16toF16, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8, vxc_half8, vxc_short8) +SELECT_HYBRID_FUN(I8_F16_I8toF16, vxc_short8, vxc_half8, vxc_char16, vxc_char16, vxc_half8, vxc_short8) +SELECT_HYBRID_FUN(I8_I8_F16toF16, vxc_char16, vxc_char16, vxc_short8, vxc_half8, vxc_half8, vxc_short8) +SELECT_HYBRID_FUN(I8_F16_I16toF16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, vxc_half8, vxc_short8) +SELECT_HYBRID_FUN(I8_I16_F16toF16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, vxc_half8, vxc_short8) +SELECT_HYBRID_FUN(I8_F16_U8toU8, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16, vxc_uchar8, vxc_uchar8) +SELECT_HYBRID_FUN(I8_U8_F16toU8, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8, vxc_uchar8, vxc_uchar8) +SELECT_HYBRID_FUN(I8_F16_I8toI8, vxc_short8, vxc_half8, vxc_char16, vxc_char16, vxc_char8, vxc_char8) +SELECT_HYBRID_FUN(I8_I8_F16toI8, vxc_char16, vxc_char16, vxc_short8, vxc_half8, vxc_char8, vxc_char8) +SELECT_HYBRID_FUN(I8_F16_I16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, vxc_short8, vxc_short8) +SELECT_HYBRID_FUN(I8_I16_F16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, vxc_short8, vxc_short8) +SELECT_HYBRID_FUN(I8_U8_U8toF16, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_half8, vxc_short8) +SELECT_HYBRID_FUN(I8_I8_I8toF16, vxc_char8, vxc_char8, vxc_char8, vxc_char8, vxc_half8, vxc_short8) +SELECT_HYBRID_FUN(I8_I16_I16toF16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, vxc_half8, vxc_short8) -#define SELECT_HYBRID_TOF16_FUN_2D(name, src0_type, copy0_type, src1_type, copy1_type) \ -__kernel void select_##name( \ +#define SELECT_HYBRID_FUN_2D(name, src0_type, copy0_type, src1_type, copy1_type, dst_type, save_type) \ +__kernel void select_##name##_2D( \ __read_only image2d_array_t condition, \ __read_only image2d_array_t input0, \ __read_only image2d_array_t input1, \ __write_only image2d_array_t output) \ { \ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ - SELECT_HYBRIDTOF16(src0_type, copy0_type, src1_type, copy1_type, \ + SELECT_HYBRID(src0_type, copy0_type, src1_type, copy1_type, dst_type, save_type, \ VXC_ReadImage, VXC_WriteImage) 
\ } -SELECT_HYBRID_TOF16_FUN_2D(I8_F16_U8toF16_2D, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16) -SELECT_HYBRID_TOF16_FUN_2D(I8_U8_F16toF16_2D, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8) -SELECT_HYBRID_TOF16_FUN_2D(I8_F16_I8toF16_2D, vxc_short8, vxc_half8, vxc_char16, vxc_char16) -SELECT_HYBRID_TOF16_FUN_2D(I8_I8_F16toF16_2D, vxc_char16, vxc_char16, vxc_short8, vxc_half8) -SELECT_HYBRID_TOF16_FUN_2D(I8_F16_I16toF16_2D, vxc_short8, vxc_half8, vxc_short8, vxc_short8) -SELECT_HYBRID_TOF16_FUN_2D(I8_I16_F16toF16_2D, vxc_short8, vxc_short8, vxc_short8, vxc_half8) +SELECT_HYBRID_FUN_2D(I8_F16_U8toF16, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16, vxc_half8, vxc_short8) +SELECT_HYBRID_FUN_2D(I8_U8_F16toF16, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8, vxc_half8, vxc_short8) +SELECT_HYBRID_FUN_2D(I8_F16_I8toF16, vxc_short8, vxc_half8, vxc_char16, vxc_char16, vxc_half8, vxc_short8) +SELECT_HYBRID_FUN_2D(I8_I8_F16toF16, vxc_char16, vxc_char16, vxc_short8, vxc_half8, vxc_half8, vxc_short8) +SELECT_HYBRID_FUN_2D(I8_F16_I16toF16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, vxc_half8, vxc_short8) +SELECT_HYBRID_FUN_2D(I8_I16_F16toF16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, vxc_half8, vxc_short8) +SELECT_HYBRID_FUN_2D(I8_F16_U8toU8, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16, vxc_uchar8, vxc_uchar8) +SELECT_HYBRID_FUN_2D(I8_U8_F16toU8, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8, vxc_uchar8, vxc_uchar8) +SELECT_HYBRID_FUN_2D(I8_F16_I8toI8, vxc_short8, vxc_half8, vxc_char16, vxc_char16, vxc_char8, vxc_char8) +SELECT_HYBRID_FUN_2D(I8_I8_F16toI8, vxc_char16, vxc_char16, vxc_short8, vxc_half8, vxc_char8, vxc_char8) +SELECT_HYBRID_FUN_2D(I8_F16_I16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, vxc_short8, vxc_short8) +SELECT_HYBRID_FUN_2D(I8_I16_F16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, vxc_short8, vxc_short8) +SELECT_HYBRID_FUN_2D(I8_U8_U8toF16, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_half8, vxc_short8) +SELECT_HYBRID_FUN_2D(I8_I8_I8toF16, vxc_char8, vxc_char8, vxc_char8, vxc_char8, vxc_half8, vxc_short8) +SELECT_HYBRID_FUN_2D(I8_I16_I16toF16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, vxc_half8, vxc_short8) #define SELECT_HALF_TO_QINT(read_fun, write_fun, dst_type) \ vxc_short8 src0, src1, tmp_dst, value; \ vxc_half8 data; \ dst_type dst; \ vxc_char8 value_tmp; \ - read_fun(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), \ + read_fun(src0, input0, coord, 0, \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ - read_fun(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), \ + read_fun(src1, input1, coord, 0, \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ - read_fun(value_tmp, condition, coord, VXC_5BITOFFSET_XY(0, 0), \ + read_fun(value_tmp, condition, coord, 0, \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ VXC_DP2x8(value, value_tmp, value_tmp,\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvConditiontoDst_2x8); \ diff --git a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c index 2aedbce..fe52f46 100644 --- a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c +++ b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c @@ -2139,6 +2139,184 @@ BATCH_NORM_SH_IMPL_AXIS1_2D(I8, I8, vxc_char16, vxc_char16, int4, vxc_char1 BATCH_NORM_SH_IMPL_AXIS1_2D(I8, F16, vxc_char16, vxc_char16, half4, vxc_half8, vxc_ushort8)\n\ "; /* end of batchnorm_single_f32_vx*/ +static const char bucketize_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits 
uniDataConvert_0_4x4;\n\ +_viv_uniform VXC_512Bits uniDataConvert_1_4x4;\n\ +_viv_uniform int boundaries_size_x8;\n\ +_viv_uniform int boundaries_size;\n\ +\n\ +#define BUCKETIZE_16BITS_SH_IMPL(name, copy_type) \\\n\ +__kernel void bucketize_right_##name \\\n\ + ( \\\n\ + __read_only image2d_t input, \\\n\ + __read_only image2d_t boundaries, \\\n\ + __write_only image2d_t output \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \\\n\ + \\\n\ + vxc_short8 data0, data1; \\\n\ + copy_type src0, src1, dst0, dst1; \\\n\ + vxc_ushort8 v0, v1, v2, v3, result = 0; \\\n\ + VXC_ReadImage(data0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src0, data0, 16); \\\n\ + \\\n\ + for (; coord.z < boundaries_size_x8; ) \\\n\ + { \\\n\ + VXC_ReadImage(data1, boundaries, coord.zw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src1, data1.s00000000, 16); \\\n\ + coord.z += 8; \\\n\ + \\\n\ + VXC_Clamp(dst0, src0, src1, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + _viv_asm(COPY, v0, dst0, 16); \\\n\ + v2 = sub_sat(v0, 0xFFFE); \\\n\ + _viv_asm(COPY, src1, data1.s11111111, 16); \\\n\ + VXC_Clamp(dst1, src0, src1, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + _viv_asm(COPY, v1, dst1, 16); \\\n\ + v3 = sub_sat(v1, 0xFFFE); \\\n\ + \\\n\ + result = result + v2 + v3; \\\n\ + \\\n\ + _viv_asm(COPY, src1, data1.s22222222, 16); \\\n\ + VXC_Clamp(dst0, src0, src1, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + _viv_asm(COPY, v0, dst0, 16); \\\n\ + v2 = sub_sat(v0, 0xFFFE); \\\n\ + _viv_asm(COPY, src1, data1.s33333333, 16); \\\n\ + VXC_Clamp(dst1, src0, src1, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + _viv_asm(COPY, v1, dst1, 16); \\\n\ + v3 = sub_sat(v1, 0xFFFE); \\\n\ + \\\n\ + result = result + v2 + v3; \\\n\ + \\\n\ + _viv_asm(COPY, src1, data1.s44444444, 16); \\\n\ + VXC_Clamp(dst0, src0, src1, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + _viv_asm(COPY, v0, dst0, 16); \\\n\ + v2 = sub_sat(v0, 0xFFFE); \\\n\ + _viv_asm(COPY, src1, data1.s55555555, 16); \\\n\ + VXC_Clamp(dst1, src0, src1, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + _viv_asm(COPY, v1, dst1, 16); \\\n\ + v3 = sub_sat(v1, 0xFFFE); \\\n\ + \\\n\ + result = result + v2 + v3; \\\n\ + \\\n\ + _viv_asm(COPY, src1, data1.s66666666, 16); \\\n\ + VXC_Clamp(dst0, src0, src1, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + _viv_asm(COPY, v0, dst0, 16); \\\n\ + v2 = sub_sat(v0, 0xFFFE); \\\n\ + _viv_asm(COPY, src1, data1.s77777777, 16); \\\n\ + VXC_Clamp(dst1, src0, src1, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + _viv_asm(COPY, v1, dst1, 16); \\\n\ + v3 = sub_sat(v1, 0xFFFE); \\\n\ + \\\n\ + result = result + v2 + v3; \\\n\ + } \\\n\ + \\\n\ + for (; coord.z < boundaries_size; ) \\\n\ + { \\\n\ + VXC_ReadImage(data1, boundaries, coord.zw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src1, data1.s00000000, 16); \\\n\ + coord.z ++; \\\n\ + \\\n\ + VXC_Clamp(dst0, src0, src1, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + _viv_asm(COPY, v0, dst0, 16); \\\n\ + v2 = sub_sat(v0, 0xFFFE); \\\n\ + \\\n\ + result = result + v2; \\\n\ + } \\\n\ + \\\n\ + int4 d0, d1; \\\n\ + VXC_DP4x4(d0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniDataConvert_0_4x4); \\\n\ + VXC_DP4x4(d1, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniDataConvert_1_4x4); \\\n\ + coord.z = coord.x + 4; \\\n\ + \\\n\ + write_imagei(output, coord.xy, d0); \\\n\ + write_imagei(output, coord.zy, d1); \\\n\ +}\n\ 
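If the VXC_Clamp / sub_sat sequence above is read as a per-lane "input >= boundary" test yielding 0 or 1, the whole kernel reduces to counting how many boundary values each element meets, which is what bucketize_right returns. A scalar reference in plain C, assuming that reading and not part of the patch:

/* Scalar sketch of bucketize_right: index = number of boundaries <= x. */
static int bucketize_right_ref(float x, const float *boundaries, int n)
{
    int count = 0;
    for (int i = 0; i < n; ++i)
    {
        count += (x >= boundaries[i]) ? 1 : 0;
    }
    return count;
}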
+BUCKETIZE_16BITS_SH_IMPL(F16_F16toI32_2D, vxc_half8)\n\ +BUCKETIZE_16BITS_SH_IMPL(I16_I16toI32_2D, vxc_short8)\n\ +\n\ +#define BUCKETIZE_8BITS_SH_IMPL(name, src_type) \\\n\ +__kernel void bucketize_right_##name \\\n\ + ( \\\n\ + __read_only image2d_t input, \\\n\ + __read_only image2d_t boundaries, \\\n\ + __write_only image2d_t output \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \\\n\ + \\\n\ + src_type src0, src1, src2; \\\n\ + vxc_uchar8 dst0, dst1, result = 0; \\\n\ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src1, boundaries, coord.zw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + for (; coord.z < boundaries_size_x8; ) \\\n\ + { \\\n\ + VXC_ReadImage(src1, boundaries, coord.zw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.z += 8; \\\n\ + \\\n\ + VXC_Clamp(src2, src0, src1.s00000000, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + _viv_asm(COPY, dst0, src2, 8); \\\n\ + dst0 = sub_sat(dst0, 0xFE); \\\n\ + VXC_Clamp(src2, src0, src1.s11111111, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + _viv_asm(COPY, dst1, src2, 8); \\\n\ + dst1 = sub_sat(dst1, 0xFE); \\\n\ + \\\n\ + result = result + dst0 + dst1; \\\n\ + \\\n\ + VXC_Clamp(src2, src0, src1.s22222222, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + _viv_asm(COPY, dst0, src2, 8); \\\n\ + dst0 = sub_sat(dst0, 0xFE); \\\n\ + VXC_Clamp(src2, src0, src1.s33333333, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + _viv_asm(COPY, dst1, src2, 8); \\\n\ + dst1 = sub_sat(dst1, 0xFE); \\\n\ + \\\n\ + result = result + dst0 + dst1; \\\n\ + \\\n\ + VXC_Clamp(src2, src0, src1.s44444444, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + _viv_asm(COPY, dst0, src2, 8); \\\n\ + dst0 = sub_sat(dst0, 0xFE); \\\n\ + VXC_Clamp(src2, src0, src1.s55555555, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + _viv_asm(COPY, dst1, src2, 8); \\\n\ + dst1 = sub_sat(dst1, 0xFE); \\\n\ + \\\n\ + result = result + dst0 + dst1; \\\n\ + \\\n\ + VXC_Clamp(src2, src0, src1.s66666666, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + _viv_asm(COPY, dst0, src2, 8); \\\n\ + dst0 = sub_sat(dst0, 0xFE); \\\n\ + VXC_Clamp(src2, src0, src1.s77777777, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + _viv_asm(COPY, dst1, src2, 8); \\\n\ + dst1 = sub_sat(dst1, 0xFE); \\\n\ + \\\n\ + result = result + dst0 + dst1; \\\n\ + } \\\n\ + \\\n\ + for (; coord.z < boundaries_size; ) \\\n\ + { \\\n\ + VXC_ReadImage(src1, boundaries, coord.zw, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.z ++; \\\n\ + \\\n\ + VXC_Clamp(src2, src0, src1.s00000000, src0, VXC_MODIFIER_CLAMP(0, 7, 0, 1)); \\\n\ + _viv_asm(COPY, dst0, src2, 8); \\\n\ + dst0 = sub_sat(dst0, 0xFE); \\\n\ + \\\n\ + result = result + dst0; \\\n\ + } \\\n\ + \\\n\ + int4 d0, d1; \\\n\ + VXC_DP4x4(d0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniDataConvert_0_4x4); \\\n\ + VXC_DP4x4(d1, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniDataConvert_1_4x4); \\\n\ + coord.z = coord.x + 4; \\\n\ + \\\n\ + write_imagei(output, coord.xy, d0); \\\n\ + write_imagei(output, coord.zy, d1); \\\n\ +}\n\ +BUCKETIZE_8BITS_SH_IMPL(U8_U8toI32_2D, vxc_uchar8)\n\ +BUCKETIZE_8BITS_SH_IMPL(I8_I8toI32_2D, vxc_char8)\n\ +"; /* end of bucketize_vx*/ + static const char cast_vx[] = "\n\ #include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -9461,7 +9639,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_##name# vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, 
norm; \\\n\ half4 tmpVal0, tmpVal1; \\\n\ float alpha = scale_vari; \\\n\ - float alpha = scale_vari * input_scale; \\\n\ + alpha = scale_vari * input_scale; \\\n\ bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); \\\n\ bias_val = bias_val - input_zp * alpha; \\\n\ \\\n\ @@ -11438,20 +11616,14 @@ __kernel void hswish_BF16toBF16_2D(\n\ static const char instance_normalization_0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform int width;\n\ -_viv_uniform int height;\n\ -_viv_uniform float inv_multiplier;\n\ -_viv_uniform int group_num;\n\ \n\ _viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ _viv_uniform VXC_512Bits uniSum_X_X2_16x2;\n\ _viv_uniform float input_scale;\n\ _viv_uniform float input_scale2;\n\ -_viv_uniform float input_zp;\n\ _viv_uniform float sum_x_tail;\n\ _viv_uniform float sum_x2_tail0;\n\ _viv_uniform float sum_x2_tail1;\n\ -_viv_uniform float output_scale;\n\ -_viv_uniform float output_zp;\n\ \n\ _viv_uniform VXC_512Bits uniSumX_16x1;\n\ _viv_uniform VXC_512Bits uniSumX2_16x1;\n\ @@ -11460,7 +11632,7 @@ _viv_uniform VXC_512Bits uniSumX2_16x1;\n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_##name( \\\n\ __read_only image2d_array_t input, \\\n\ __write_only image2d_array_t output, \\\n\ - float eps, int rs_flag) \\\n\ + float eps, int height) \\\n\ { \\\n\ int gidx = get_global_id(0) << 4; \\\n\ int lidx = get_local_id(0); \\\n\ @@ -11518,7 +11690,7 @@ INSTANCE_NORM_SUMS_8BITS_IMPL(I8, vxc_char16)\n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_##name##_2D( \\\n\ __read_only image2d_array_t input, \\\n\ __write_only image2d_array_t output, \\\n\ - float eps, int rs_flag) \\\n\ + float eps, int height) \\\n\ { \\\n\ int gidx = get_global_id(0) << 4; \\\n\ int lidx = get_local_id(0); \\\n\ @@ -11571,18 +11743,62 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums INSTANCE_NORM_SUMS_8BITS_IMPL_2D(U8, vxc_uchar16)\n\ INSTANCE_NORM_SUMS_8BITS_IMPL_2D(I8, vxc_char16)\n\ \n\ +__kernel void instance_norm_means\n\ +(\n\ + __read_only image2d_t sums,\n\ + __read_only image2d_t bias,\n\ + __read_only image2d_t scale,\n\ + __write_only image2d_t means,\n\ + float eps,\n\ + float in_time_out_scale,\n\ + float input_zp,\n\ + float output_scale,\n\ + float output_zp,\n\ + float inv_multiplier,\n\ + int group_num\n\ +)\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + Image sums_img = create_image_from_image2d(sums, 4);\n\ + float4 *sums_ptr = (float4 *)get_image_ptr_from_coord(sums_img, coord);\n\ +\n\ + float alpha = read_imagef(scale, coord).x;\n\ + float beta = read_imagef(bias, coord).x;\n\ +\n\ + float4 mean_var = sums_ptr[0];\n\ + for(int i = 1; i < group_num;)\n\ + {\n\ + mean_var += sums_ptr[i];\n\ + i ++;\n\ + }\n\ +\n\ + mean_var *= inv_multiplier;\n\ + mean_var.s1 = mean_var.s1 - mean_var.s0 * mean_var.s0 + eps;\n\ + mean_var.s1 = rsqrt(mean_var.s1);\n\ +\n\ + alpha = alpha * mean_var.y;\n\ +\n\ + float4 dst;\n\ + dst.x = in_time_out_scale * alpha;\n\ + beta = (beta - alpha * mean_var.x) * output_scale + output_zp;\n\ + dst.y = beta - input_zp * dst.x;\n\ +\n\ + Image means_img = create_image_from_image2d(means, 4);\n\ + float4 *means_ptr = (float4 *)get_image_ptr_from_coord(means_img, coord);\n\ + means_ptr[0] = dst.xyxy;\n\ +}\n\ +\n\ _viv_uniform VXC_512Bits uniDataToFP32_0_4x4;\n\ _viv_uniform VXC_512Bits uniDataToFP32_1_4x4;\n\ _viv_uniform VXC_512Bits uniDataToFP32_2_4x4;\n\ _viv_uniform VXC_512Bits uniDataToFP32_3_4x4;\n\ #define 
INSTANCE_NORM_8BITS_IMPL(name, src_type, dst_type) \\\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name( \\\n\ +__kernel void instance_norm_##name( \\\n\ __read_only image2d_array_t input, \\\n\ - __read_only image2d_t bias, \\\n\ - __read_only image2d_t scale, \\\n\ - __read_only image2d_t meanVari, \\\n\ + __read_only image2d_t means, \\\n\ __write_only image2d_array_t output, \\\n\ - float eps, int rs_flag) \\\n\ + int height) \\\n\ { \\\n\ int gidz = get_global_id(1); \\\n\ int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \\\n\ @@ -11590,26 +11806,11 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##na int2 coord_para = (int2)(0, gidz); \\\n\ src_type src0; \\\n\ dst_type dst; \\\n\ - float scale_vari, bias_val; \\\n\ - float4 bias_f, scale_f, mean_vari = (float4)(0); \\\n\ + float4 coef; \\\n\ \\\n\ - scale_f = read_imagef(scale, coord_para); \\\n\ - bias_f = read_imagef(bias, coord_para); \\\n\ - for(int i = 0; i < group_num; i++) \\\n\ - { \\\n\ - mean_vari += read_imagef(meanVari, coord_para); \\\n\ - coord_para.x += 4; \\\n\ - } \\\n\ - mean_vari *= inv_multiplier; \\\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \\\n\ - mean_vari.s1 = rsqrt(mean_vari.s1); \\\n\ - \\\n\ - scale_vari = scale_f.s0 * mean_vari.s1; \\\n\ - vxc_int4 tmpVal0, tmpVal1; \\\n\ + coef = read_imagef(means, coord_para); \\\n\ + int4 tmpVal0, tmpVal1; \\\n\ float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \\\n\ - float alpha = input_scale * output_scale * scale_vari; \\\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \\\n\ - bias_val = bias_val - input_zp * alpha; \\\n\ \\\n\ int8 input_desc, output_desc; \\\n\ _viv_asm(COPY, input_desc, input, sizeof(input_desc)); \\\n\ @@ -11628,14 +11829,14 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##na VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \\\n\ VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \\\n\ VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); \\\n\ - norm = tmpData0 * alpha + bias_val; \\\n\ + norm = tmpData0 * coef.x + coef.y; \\\n\ tmpVal0 = convert_int4_rte(norm); \\\n\ - norm = tmpData1 * alpha + bias_val; \\\n\ + norm = tmpData1 * coef.x + coef.y; \\\n\ tmpVal1 = convert_int4_rte(norm); \\\n\ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ - norm = tmpData2 * alpha + bias_val; \\\n\ + norm = tmpData2 * coef.x + coef.y; \\\n\ tmpVal0 = convert_int4_rte(norm); \\\n\ - norm = tmpData3 * alpha + bias_val; \\\n\ + norm = tmpData3 * coef.x + coef.y; \\\n\ tmpVal1 = convert_int4_rte(norm); \\\n\ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ VXC_OP4_NoDest(img_store_3d, output, coord, dst, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); \\\n\ @@ -11645,60 +11846,46 @@ INSTANCE_NORM_8BITS_IMPL(U8_F32toU8, vxc_uchar16, vxc_uchar16)\n\ INSTANCE_NORM_8BITS_IMPL(I8_F32toI8, vxc_char16, vxc_char16)\n\ \n\ #define INSTANCE_NORM_8BITS_IMPL_2D(name, src_type, dst_type) \\\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name##_2D( \\\n\ +__kernel void instance_norm_##name##_2D( \\\n\ __read_only image2d_array_t input, \\\n\ - __read_only image2d_t bias, \\\n\ - __read_only image2d_t scale, \\\n\ - 
__read_only image2d_t meanVari, \\\n\ + __read_only image2d_t means, \\\n\ __write_only image2d_array_t output, \\\n\ - float eps, int rs_flag) \\\n\ + int height) \\\n\ { \\\n\ int gidz = get_global_id(1); \\\n\ int gidy = gidz * height; \\\n\ - int2 coord = (int2)(get_global_id(0), gidy); \\\n\ + int4 coord; \\\n\ int2 coord_para = (int2)(0, gidz); \\\n\ int endH = gidy + height; \\\n\ src_type src0; \\\n\ dst_type dst; \\\n\ - float scale_vari, bias_val; \\\n\ - float4 bias_f, scale_f, mean_vari = (float4)(0); \\\n\ + float4 coef; \\\n\ \\\n\ - scale_f = read_imagef(scale, coord_para); \\\n\ - bias_f = read_imagef(bias, coord_para); \\\n\ - for(int i = 0; i < group_num; i++) \\\n\ - { \\\n\ - mean_vari += read_imagef(meanVari, coord_para); \\\n\ - coord_para.x += 4; \\\n\ - } \\\n\ - mean_vari *= inv_multiplier; \\\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \\\n\ - mean_vari.s1 = rsqrt(mean_vari.s1); \\\n\ - \\\n\ - scale_vari = scale_f.s0 * mean_vari.s1; \\\n\ - vxc_int4 tmpVal0, tmpVal1; \\\n\ + coef = read_imagef(means, coord_para); \\\n\ + int4 tmpVal0, tmpVal1; \\\n\ float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \\\n\ - float alpha = input_scale * output_scale * scale_vari; \\\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \\\n\ - bias_val = bias_val - input_zp * alpha; \\\n\ \\\n\ - for(; coord.y < endH; coord.y++) \\\n\ + coord = (int4)(get_global_id(0), gidy, gidy - 1, gidy - 1); \\\n\ + \\\n\ + for(; coord.y < endH; ) \\\n\ { \\\n\ - VXC_ReadImage(src0, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.yz++; \\\n\ VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \\\n\ VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \\\n\ VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \\\n\ VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); \\\n\ - norm = tmpData0 * alpha + bias_val; \\\n\ + norm = tmpData0 * coef.x + coef.y; \\\n\ tmpVal0 = convert_int4_rte(norm); \\\n\ - norm = tmpData1 * alpha + bias_val; \\\n\ + norm = tmpData1 * coef.x + coef.y; \\\n\ tmpVal1 = convert_int4_rte(norm); \\\n\ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ - norm = tmpData2 * alpha + bias_val; \\\n\ + norm = tmpData2 * coef.x + coef.y; \\\n\ tmpVal0 = convert_int4_rte(norm); \\\n\ - norm = tmpData3 * alpha + bias_val; \\\n\ + norm = tmpData3 * coef.x + coef.y; \\\n\ tmpVal1 = convert_int4_rte(norm); \\\n\ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(output, coord.xz, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ } \\\n\ }\n\ INSTANCE_NORM_8BITS_IMPL_2D(U8_F32toU8, vxc_uchar16, vxc_uchar16)\n\ @@ -11706,12 +11893,6 @@ INSTANCE_NORM_8BITS_IMPL_2D(I8_F32toI8, vxc_char16, vxc_char16)"; /* end of ins static const char instance_normalization_1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ -_viv_uniform int width;\n\ -_viv_uniform int height;\n\ -_viv_uniform float inv_multiplier;\n\ -_viv_uniform int group_num;\n\ -_viv_uniform float input_scale;\n\ -_viv_uniform float input_zp;\n\ _viv_uniform 
VXC_512Bits uniExtract8Data_2x8;\n\ \n\ _viv_uniform VXC_512Bits uniDataToFP32_0_4x4;\n\ @@ -11720,13 +11901,11 @@ _viv_uniform VXC_512Bits uniDataToFP32_2_4x4;\n\ _viv_uniform VXC_512Bits uniDataToFP32_3_4x4;\n\ \n\ #define INSTANCE_NORM_8_TO_F16_IMPL(name, src_type) \\\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name( \\\n\ +__kernel void instance_norm_##name( \\\n\ __read_only image2d_array_t input, \\\n\ - __read_only image2d_t bias, \\\n\ - __read_only image2d_t scale, \\\n\ - __read_only image2d_t meanVari, \\\n\ + __read_only image2d_t means, \\\n\ __write_only image2d_array_t output, \\\n\ - float eps, int rs_flag) \\\n\ + int height) \\\n\ { \\\n\ int gidz = get_global_id(1); \\\n\ int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \\\n\ @@ -11734,25 +11913,11 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##na src_type src0; \\\n\ vxc_short8 outval; \\\n\ vxc_half8 dst; \\\n\ - float scale_vari, bias_val; \\\n\ - float4 bias_f, scale_f, mean_vari = (float4)(0); \\\n\ + float4 coef; \\\n\ \\\n\ - scale_f = read_imagef(scale, coord_para.xy); \\\n\ - bias_f = read_imagef(bias, coord_para.xy); \\\n\ - for(int i = 0; i < group_num; i++) \\\n\ - { \\\n\ - mean_vari += read_imagef(meanVari, coord_para.xy); \\\n\ - coord_para.x += 4; \\\n\ - } \\\n\ - mean_vari *= inv_multiplier; \\\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \\\n\ - mean_vari.s1 = rsqrt(mean_vari.s1); \\\n\ - scale_vari = scale_f.s0 * mean_vari.s1; \\\n\ + coef = read_imagef(means, coord_para.xy); \\\n\ float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \\\n\ half4 tmpVal0, tmpVal1; \\\n\ - float alpha = scale_vari * input_scale; \\\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); \\\n\ - bias_val = bias_val - input_zp * alpha; \\\n\ \\\n\ coord_para = coord; \\\n\ int8 input_desc, output_desc; \\\n\ @@ -11773,17 +11938,17 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##na VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \\\n\ VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \\\n\ VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); \\\n\ - norm = alpha * tmpData0 + bias_val; \\\n\ + norm = tmpData0 * coef.x + coef.y; \\\n\ _viv_asm(CONV, tmpVal0, norm); \\\n\ - norm = alpha * tmpData1 + bias_val; \\\n\ + norm = tmpData1 * coef.x + coef.y; \\\n\ _viv_asm(CONV, tmpVal1, norm); \\\n\ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ _viv_asm(COPY, outval, dst, 16); \\\n\ VXC_OP4_NoDest(img_store_3d, output, coord_para, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); \\\n\ coord_para.x += 8; \\\n\ - norm = alpha * tmpData2 + bias_val; \\\n\ + norm = tmpData2 * coef.x + coef.y; \\\n\ _viv_asm(CONV, tmpVal0, norm); \\\n\ - norm = alpha * tmpData3 + bias_val; \\\n\ + norm = tmpData3 * coef.x + coef.y; \\\n\ _viv_asm(CONV, tmpVal1, norm); \\\n\ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ _viv_asm(COPY, outval, dst, 16); \\\n\ @@ -11794,13 +11959,11 @@ INSTANCE_NORM_8_TO_F16_IMPL(U8_F32toF16, vxc_uchar16)\n\ INSTANCE_NORM_8_TO_F16_IMPL(I8_F32toF16, vxc_char16)\n\ \n\ #define INSTANCE_NORM_8_TO_F16_IMPL_2D(name, src_type) \\\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name##_2D( \\\n\ 
+__kernel void instance_norm_##name##_2D( \\\n\ __read_only image2d_array_t input, \\\n\ - __read_only image2d_t bias, \\\n\ - __read_only image2d_t scale, \\\n\ - __read_only image2d_t meanVari, \\\n\ + __read_only image2d_t means, \\\n\ __write_only image2d_array_t output, \\\n\ - float eps, int rs_flag) \\\n\ + int height) \\\n\ { \\\n\ int gidz = get_global_id(1); \\\n\ int gidy = gidz * height; \\\n\ @@ -11810,26 +11973,11 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##na src_type src0; \\\n\ vxc_short8 outval; \\\n\ vxc_half8 dst; \\\n\ - float scale_vari, bias_val; \\\n\ - float4 bias_f, scale_f, mean_vari = (float4)(0); \\\n\ + float4 coef; \\\n\ \\\n\ - scale_f = read_imagef(scale, coord_para.xy); \\\n\ - bias_f = read_imagef(bias, coord_para.xy); \\\n\ - for(int i = 0; i < group_num; i++) \\\n\ - { \\\n\ - mean_vari += read_imagef(meanVari, coord_para.xy); \\\n\ - coord_para.x += 4; \\\n\ - } \\\n\ - mean_vari *= inv_multiplier; \\\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \\\n\ - mean_vari.s1 = rsqrt(mean_vari.s1); \\\n\ - \\\n\ - scale_vari = scale_f.s0 * mean_vari.s1; \\\n\ + coef = read_imagef(means, coord_para.xy); \\\n\ float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; \\\n\ half4 tmpVal0, tmpVal1; \\\n\ - float alpha = scale_vari * input_scale; \\\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); \\\n\ - bias_val = bias_val - input_zp * alpha; \\\n\ for(; coord.y < endH;) \\\n\ { \\\n\ VXC_ReadImage(src0, input, coord.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ @@ -11839,17 +11987,17 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##na VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \\\n\ VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_2_4x4); \\\n\ VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_3_4x4); \\\n\ - norm = alpha * tmpData0 + bias_val; \\\n\ + norm = tmpData0 * coef.x + coef.y; \\\n\ _viv_asm(CONV, tmpVal0, norm); \\\n\ - norm = alpha * tmpData1 + bias_val; \\\n\ + norm = tmpData1 * coef.x + coef.y; \\\n\ _viv_asm(CONV, tmpVal1, norm); \\\n\ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ _viv_asm(COPY, outval, dst, 16); \\\n\ VXC_WriteImage(output, coord_para.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ coord_para.x += 8; \\\n\ - norm = alpha * tmpData2 + bias_val; \\\n\ + norm = tmpData2 * coef.x + coef.y; \\\n\ _viv_asm(CONV, tmpVal0, norm); \\\n\ - norm = alpha * tmpData3 + bias_val; \\\n\ + norm = tmpData3 * coef.x + coef.y; \\\n\ _viv_asm(CONV, tmpVal1, norm); \\\n\ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ _viv_asm(COPY, outval, dst, 16); \\\n\ @@ -11863,28 +12011,21 @@ INSTANCE_NORM_8_TO_F16_IMPL_2D(I8_F32toF16, vxc_char16)\n\ static const char instance_normalization_2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform int width;\n\ -_viv_uniform int height;\n\ -_viv_uniform float inv_multiplier;\n\ -_viv_uniform int group_num;\n\ _viv_uniform VXC_512Bits uniDataToFP32_0_4x4;\n\ _viv_uniform VXC_512Bits uniDataToFP32_1_4x4;\n\ _viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ _viv_uniform VXC_512Bits uniSum_X_X2_8x2;\n\ _viv_uniform float input_scale;\n\ _viv_uniform float input_scale2;\n\ -_viv_uniform float input_zp;\n\ _viv_uniform float sum_x_tail;\n\ 
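The instance-norm rework above moves the per-group reduction out of every normalize kernel: the new instance_norm_means pass folds mean, variance, eps, the affine scale/bias, and the input/output quantization terms into one per-channel pair coef = (coef.x, coef.y), so the normalize kernels only apply a fused multiply-add, out = x * coef.x + coef.y. A plain-C sketch of that precomputation, not part of the patch; the sums layout and float types are assumptions made for illustration:

#include <math.h>

typedef struct { float x; float y; } norm_coef;

/* Hypothetical restatement of the instance_norm_means pass for one channel.
   sums[i] holds the group-wise (sum of x, sum of x^2) pair. */
static norm_coef instance_norm_coef(const float sums[][2], int group_num,
                                    float inv_multiplier, float eps,
                                    float scale, float bias,
                                    float in_time_out_scale, float input_zp,
                                    float output_scale, float output_zp)
{
    float mean = 0.0f, sqr = 0.0f;
    for (int i = 0; i < group_num; ++i)
    {
        mean += sums[i][0];
        sqr  += sums[i][1];
    }
    mean *= inv_multiplier;
    sqr  *= inv_multiplier;

    float inv_std = 1.0f / sqrtf(sqr - mean * mean + eps);
    float alpha   = scale * inv_std;

    norm_coef coef;
    coef.x = in_time_out_scale * alpha;
    coef.y = (bias - alpha * mean) * output_scale + output_zp - input_zp * coef.x;
    return coef;   /* normalize kernels then do: out = in * coef.x + coef.y */
}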
_viv_uniform float sum_x2_tail0;\n\ _viv_uniform float sum_x2_tail1;\n\ \n\ -_viv_uniform float output_scale;\n\ -_viv_uniform float output_zp;\n\ -\n\ #define INSTANCE_NORM_SUMS_16BITS_IMPL(name, src_type) \\\n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_##name( \\\n\ __read_only image2d_array_t input, \\\n\ __write_only image2d_array_t output, \\\n\ - float eps, int rs_flag) \\\n\ + float eps, int height) \\\n\ { \\\n\ int gidx = get_global_id(0) << 3; \\\n\ int lidx = get_local_id(0); \\\n\ @@ -11949,7 +12090,7 @@ INSTANCE_NORM_SUMS_16BITS_IMPL(I16, vxc_short8)\n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_##name##_2D( \\\n\ __read_only image2d_array_t input, \\\n\ __write_only image2d_array_t output, \\\n\ - float eps, int rs_flag) \\\n\ + float eps, int height) \\\n\ { \\\n\ int gidx = get_global_id(0) << 3; \\\n\ int lidx = get_local_id(0); \\\n\ @@ -12008,13 +12149,11 @@ INSTANCE_NORM_SUMS_16BITS_IMPL_2D(F16, vxc_half8)\n\ INSTANCE_NORM_SUMS_16BITS_IMPL_2D(I16, vxc_short8)\n\ \n\ #define INSTANCE_NORM_16BITS_IMPL(name, src_type, dst_type, copy_type, conv_type) \\\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name( \\\n\ +__kernel void instance_norm_##name( \\\n\ __read_only image2d_array_t input, \\\n\ - __read_only image2d_t bias, \\\n\ - __read_only image2d_t scale, \\\n\ - __read_only image2d_t meanVari, \\\n\ + __read_only image2d_t means, \\\n\ __write_only image2d_array_t output, \\\n\ - float eps, int rs_flag) \\\n\ + int height) \\\n\ { \\\n\ int gidz = get_global_id(1); \\\n\ int4 coord = (int4)(get_global_id(0), 0, gidz, gidz); \\\n\ @@ -12022,28 +12161,13 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##na int4 coord_para = (int4)(0, gidz, 0, 0); \\\n\ vxc_short8 src0; \\\n\ src_type in_h; \\\n\ - float scale_vari, bias_val; \\\n\ - float4 bias_f, scale_f, mean_vari = (float4)(0); \\\n\ + float4 coef; \\\n\ \\\n\ - scale_f = read_imagef(scale, coord_para.xy); \\\n\ - bias_f = read_imagef(bias, coord_para.xy); \\\n\ + coef = read_imagef(means, coord_para.xy); \\\n\ \\\n\ - for(int i = 0; i < group_num; i++) \\\n\ - { \\\n\ - mean_vari += read_imagef(meanVari, coord_para.xy); \\\n\ - coord_para.x += 4; \\\n\ - } \\\n\ - mean_vari *= inv_multiplier; \\\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \\\n\ - mean_vari.s1 = rsqrt(mean_vari.s1); \\\n\ - \\\n\ - scale_vari = scale_f.s0 * mean_vari.s1; \\\n\ - float alpha = input_scale * output_scale * scale_vari; \\\n\ float4 tmpData0, tmpData1; \\\n\ copy_type outval; \\\n\ conv_type tmpVal0, tmpVal1; \\\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \\\n\ - bias_val = bias_val - input_zp * alpha; \\\n\ dst_type dst; \\\n\ \\\n\ int8 input_desc, output_desc; \\\n\ @@ -12066,9 +12190,9 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##na VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \\\n\ \\\n\ float4 norm; \\\n\ - norm = alpha * tmpData0 + bias_val; \\\n\ + norm = tmpData0 * coef.x + coef.y; \\\n\ _viv_asm(CONV_RTE, tmpVal0, norm); \\\n\ - norm = alpha * tmpData1 + bias_val; \\\n\ + norm = tmpData1 * coef.x + coef.y; \\\n\ _viv_asm(CONV_RTE, tmpVal1, norm); \\\n\ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ _viv_asm(COPY, outval, dst, 16); \\\n\ @@ -12083,13 +12207,11 @@ 
INSTANCE_NORM_16BITS_IMPL(I16_F32toI16, vxc_short8, vxc_short8, vxc_short8, int4 INSTANCE_NORM_16BITS_IMPL(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, half4)\n\ \n\ #define INSTANCE_NORM_16BITS_IMPL_2D(name, src_type, dst_type, copy_type, conv_type) \\\n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##name##_2D( \\\n\ +__kernel void instance_norm_##name##_2D( \\\n\ __read_only image2d_array_t input, \\\n\ - __read_only image2d_t bias, \\\n\ - __read_only image2d_t scale, \\\n\ - __read_only image2d_t meanVari, \\\n\ + __read_only image2d_t means, \\\n\ __write_only image2d_array_t output, \\\n\ - float eps, int rs_flag) \\\n\ + int height) \\\n\ { \\\n\ int gidz = get_global_id(1); \\\n\ int gidy = gidz * height; \\\n\ @@ -12098,28 +12220,13 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##na int endH = gidy + height; \\\n\ vxc_short8 src0; \\\n\ src_type in_h; \\\n\ - float scale_vari, bias_val; \\\n\ - float4 bias_f, scale_f, mean_vari = (float4)(0); \\\n\ + float4 coef; \\\n\ \\\n\ - scale_f = read_imagef(scale, coord_para.xy); \\\n\ - bias_f = read_imagef(bias, coord_para.xy); \\\n\ + coef = read_imagef(means, coord_para.xy); \\\n\ \\\n\ - for(int i = 0; i < group_num; i++) \\\n\ - { \\\n\ - mean_vari += read_imagef(meanVari, coord_para.xy); \\\n\ - coord_para.x += 4; \\\n\ - } \\\n\ - mean_vari *= inv_multiplier; \\\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps; \\\n\ - mean_vari.s1 = rsqrt(mean_vari.s1); \\\n\ - \\\n\ - scale_vari = scale_f.s0 * mean_vari.s1; \\\n\ - float alpha = input_scale * output_scale * scale_vari; \\\n\ float4 tmpData0, tmpData1; \\\n\ copy_type outval; \\\n\ conv_type tmpVal0, tmpVal1; \\\n\ - bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_scale + output_zp; \\\n\ - bias_val = bias_val - input_zp * alpha; \\\n\ dst_type dst; \\\n\ \\\n\ for(; coord.y < endH; coord.y++) \\\n\ @@ -12130,9 +12237,9 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_##na VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_0_4x4); \\\n\ VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDataToFP32_1_4x4); \\\n\ float4 norm; \\\n\ - norm = alpha * tmpData0 + bias_val; \\\n\ + norm = tmpData0 * coef.x + coef.y; \\\n\ _viv_asm(CONV, tmpVal0, norm); \\\n\ - norm = alpha * tmpData1 + bias_val; \\\n\ + norm = tmpData1 * coef.x + coef.y; \\\n\ _viv_asm(CONV, tmpVal1, norm); \\\n\ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtract8Data_2x8); \\\n\ _viv_asm(COPY, outval, dst, 16); \\\n\ @@ -12150,15 +12257,13 @@ INSTANCE_NORM_16BITS_IMPL_2D(I16_F32toF16, vxc_short8, vxc_half8, vxc_short8, h static const char instance_normalization_3_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform int width;\n\ -_viv_uniform int height;\n\ -_viv_uniform float inv_multiplier;\n\ -_viv_uniform int group_num;\n\ +\n\ _viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ _viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ _viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ \n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_BF16(\n\ - image2d_array_t input, image2d_array_t output, float eps, int rsFlg)\n\ + image2d_array_t input, image2d_array_t output, float eps, int height)\n\ {\n\ int gidx = get_global_id(0) << 3;\n\ int lidx = get_local_id(0);\n\ @@ -12219,7 +12324,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) 
void instance_norm_sums }\n\ \n\ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums_BF16_2D(\n\ - image2d_array_t input, image2d_array_t output, float eps, int rsFlg)\n\ + image2d_array_t input, image2d_array_t output, float eps, int height)\n\ {\n\ int gidx = get_global_id(0) << 3;\n\ int lidx = get_local_id(0);\n\ @@ -12278,36 +12383,21 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_sums }\n\ }\n\ \n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16_F32toBF16(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int rsFlg)\n\ +__kernel void instance_norm_BF16_F32toBF16(\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_t means,\n\ + __write_only image2d_array_t output,\n\ + int height)\n\ {\n\ int gidz = get_global_id(1);\n\ int4 coord = (int4)(get_global_id(0), 0, gidz, gidz);\n\ int4 coord_in = (int4)(get_global_id(0), 0, gidz, gidz);\n\ vxc_short8 src0, src1, src2;\n\ - float scale_vari, bias_val;\n\ - float4 mean_vari = (float4)(0);\n\ + float4 coef;\n\ \n\ - Image img3 = create_image_from_image2d(meanVari, 4);\n\ - __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, (int2)(0, gidz));\n\ - __global float4* vari_ptr = (__global float4*)sumVari_ptr;\n\ + coef = read_imagef(means, coord.yz);\n\ \n\ - float sval = read_imagef(scale, coord.yz).x;\n\ - float bval = read_imagef(bias, coord.yz).x;\n\ -\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += vari_ptr[i];\n\ - }\n\ -\n\ - mean_vari *= inv_multiplier;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - scale_vari = sval * mean_vari.s1;\n\ float4 tmpData0, tmpData1;\n\ - bias_val = (bval - scale_vari * mean_vari.s0);\n\ \n\ int8 input_desc, output_desc;\n\ _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ @@ -12320,6 +12410,7 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16 \n\ for(coord.y = 0; coord.y < height; coord.y++)\n\ {\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ VXC_OP4(img_load_3d, src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ coord_in.y ++;\n\ @@ -12331,9 +12422,9 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16 _viv_asm(COPY, tmpData1, src2, 16);\n\ \n\ float4 norm;\n\ - norm = scale_vari * tmpData0 + bias_val;\n\ + norm = tmpData0 * coef.x + coef.y;\n\ _viv_asm(COPY, src0, norm, 16);\n\ - norm = scale_vari * tmpData1 + bias_val;\n\ + norm = tmpData1 * coef.x + coef.y;\n\ _viv_asm(COPY, src1, norm, 16);\n\ VXC_DP2x8(src2, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ VXC_OP4_NoDest(img_store_3d, output, coord, src2, \\\n\ @@ -12341,41 +12432,27 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16 }\n\ }\n\ \n\ -__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16F32toBF16_2D(\n\ - image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ - image2d_array_t output, float eps, int rsFlg)\n\ +__kernel void instance_norm_BF16_F32toBF16_2D(\n\ + __read_only image2d_array_t input,\n\ + __read_only image2d_t means,\n\ + __write_only image2d_array_t output,\n\ + int height)\n\ {\n\ int gidz = get_global_id(1);\n\ int gidy = gidz * height;\n\ int2 coord = (int2)(get_global_id(0), 
gidy);\n\ - int2 coord_para = (int2)(gidz, 0);\n\ + int2 coord_para = (int2)(0, gidz);\n\ int endH = gidy + height;\n\ vxc_short8 src0, src1, src2;\n\ - float scale_vari, bias_val;\n\ - float4 mean_vari = (float4)(0);\n\ + float4 coef;\n\ \n\ - Image img3 = create_image_from_image2d(meanVari, 4);\n\ - __global uchar* sumVari_ptr = (__global uchar*)get_image_ptr_from_coord(img3, coord_para.yx);\n\ - __global float4* vari_ptr = (__global float4*)sumVari_ptr;\n\ + coef = read_imagef(means, coord_para);\n\ \n\ - float sval = read_imagef(scale, coord_para.yx).x;\n\ - float bval = read_imagef(bias, coord_para.yx).x;\n\ -\n\ - for(int i = 0; i < group_num; i++)\n\ - {\n\ - mean_vari += vari_ptr[i];\n\ - }\n\ -\n\ - mean_vari *= inv_multiplier;\n\ - mean_vari.s1 = mean_vari.s1 - mean_vari.s0 * mean_vari.s0 + eps;\n\ - mean_vari.s1 = rsqrt(mean_vari.s1);\n\ -\n\ - scale_vari = sval * mean_vari.s1;\n\ float4 tmpData0, tmpData1;\n\ - bias_val = (bval - scale_vari * mean_vari.s0);\n\ \n\ for(; coord.y < endH; coord.y++)\n\ {\n\ + vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ VXC_DP2x8(src1, src0, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ @@ -12386,9 +12463,9 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void instance_norm_BF16 _viv_asm(COPY, tmpData1, src2, 16);\n\ \n\ float4 norm;\n\ - norm = scale_vari * tmpData0 + bias_val;\n\ + norm = tmpData0 * coef.x + coef.y;\n\ _viv_asm(COPY, src0, norm, 16);\n\ - norm = scale_vari * tmpData1 + bias_val;\n\ + norm = tmpData1 * coef.x + coef.y;\n\ _viv_asm(COPY, src1, norm, 16);\n\ VXC_DP2x8(src2, src0, src1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ @@ -12547,7 +12624,7 @@ _viv_uniform int inputZP;\n\ VXC_Vstore3(dst_ptr, 0, dst.s012); \\\n\ break; \\\n\ case 4: \\\n\ - VXC_Vstore4(dst_ptr, 0, dst.0123); \\\n\ + VXC_Vstore4(dst_ptr, 0, dst.s0123); \\\n\ break; \\\n\ case 5: \\\n\ VXC_Vstore2(dst_ptr, 0, dst.s01); \\\n\ @@ -12562,7 +12639,7 @@ _viv_uniform int inputZP;\n\ VXC_Vstore3(dst_ptr, 0, dst.s012); \\\n\ break; \\\n\ case 7: \\\n\ - VXC_Vstore4(dst_ptr, 0, dst.0123); \\\n\ + VXC_Vstore4(dst_ptr, 0, dst.s0123); \\\n\ dst.s012 = dst.s456; \\\n\ dst_ptr += 4; \\\n\ VXC_Vstore3(dst_ptr, 0, dst.s012); \\\n\ @@ -20912,6 +20989,11 @@ _viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ _viv_uniform int ac2zero;\n\ _viv_uniform int bc2zero;\n\ \n\ +_viv_uniform VXC_512Bits uniI16MulI16SumtoI32_16x1;\n\ +_viv_uniform VXC_512Bits uniI16MulI16SumtoI32B_16x1;\n\ +_viv_uniform float inout_beta;\n\ +_viv_uniform float inout_scale;\n\ +\n\ #define GEMM_QINT_TO_QINT(src0_type_name, read_type) \\\n\ __kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name( \\\n\ image2d_array_t inputA, image2d_array_t inputB, image2d_array_t output, \\\n\ @@ -21004,6 +21086,142 @@ __kernel void gemm_##src0_type_name##src0_type_name##to##src0_type_name( \\\n\ VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \\\n\ }\n\ GEMM_QINT_TO_QINT(I16, vxc_short8)\n\ +\n\ +__kernel void gemm_transb_I16I16toI16(image2d_array_t inputA,\n\ + image2d_array_t inputB, image2d_array_t output,\n\ + int transposeA, int transposeB, int adjointA, int adjointB,\n\ + uint M, uint K, uint N)\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ + int4 coord_a = (int4)(0, 
coord_out.y, (ac2zero ? 0 : get_global_id(2)), 0);\n\ + int4 coord_b = (int4)(0, coord_out.x, (bc2zero ? 0 : get_global_id(2)), 0);\n\ +\n\ + vxc_float4 sum0 = (vxc_float4)(0);\n\ + vxc_float4 sum1 = (vxc_float4)(0);\n\ + vxc_float4 sum2 = (vxc_float4)(0);\n\ + vxc_float4 sum3 = (vxc_float4)(0);\n\ +\n\ + int8 inputA_desc, inputB_desc, output_desc;\n\ + _viv_asm(COPY, inputA_desc, inputA, sizeof(inputA_desc));\n\ + int baseAddr_a = (int)coord_a.z * inputA_desc.s4 + inputA_desc.s0;\n\ + _viv_asm(MOV, coord_a.w, baseAddr_a);\n\ + _viv_asm(COPY, inputB_desc, inputB, sizeof(inputB_desc));\n\ + int baseAddr_b = (int)coord_b.z * inputB_desc.s4 + inputB_desc.s0;\n\ + _viv_asm(MOV, coord_b.w, baseAddr_b);\n\ +\n\ + for(coord_a.x = 0, coord_b.x = 0; coord_a.x < K;)\n\ + {\n\ + vxc_short8 srcA0,srcA1,srcA2,srcA3;\n\ + vxc_short8 srcB0,srcB1,srcB2,srcB3;\n\ + VXC_OP4(img_load_3d, srcA0, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA1, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA2, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcA3, inputA, coord_a.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB0, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB1, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB2, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 2),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, srcB3, inputB, coord_b.xywz, VXC_5BITOFFSET_XY(0, 3),\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_a.x += 8;\n\ + coord_b.x += 8;\n\ +\n\ + vxc_int4 iVal;\n\ + vxc_float4 fpVal;\n\ + VXC_DP16x1(iVal, srcA0, srcB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32_16x1);\n\ + VXC_DP16x1(fpVal, iVal, srcB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32B_16x1);\n\ + VXC_DP16x1(iVal, srcA0, srcB1, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32_16x1);\n\ + VXC_DP16x1(fpVal, iVal, srcB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32B_16x1);\n\ + VXC_DP16x1(iVal, srcA0, srcB2, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32_16x1);\n\ + VXC_DP16x1(fpVal, iVal, srcB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32B_16x1);\n\ + VXC_DP16x1(iVal, srcA0, srcB3, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32_16x1);\n\ + VXC_DP16x1(fpVal, iVal, srcB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32B_16x1);\n\ + sum0 = sum0 + fpVal * inout_scale + inout_beta;\n\ +\n\ + VXC_DP16x1(iVal, srcA1, srcB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32_16x1);\n\ + VXC_DP16x1(fpVal, iVal, srcB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32B_16x1);\n\ + VXC_DP16x1(iVal, srcA1, srcB1, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32_16x1);\n\ + VXC_DP16x1(fpVal, iVal, srcB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32B_16x1);\n\ + VXC_DP16x1(iVal, srcA1, srcB2, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32_16x1);\n\ + 
VXC_DP16x1(fpVal, iVal, srcB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32B_16x1);\n\ + VXC_DP16x1(iVal, srcA1, srcB3, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32_16x1);\n\ + VXC_DP16x1(fpVal, iVal, srcB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32B_16x1);\n\ + sum1 = sum1 + fpVal * inout_scale + inout_beta;\n\ +\n\ + VXC_DP16x1(iVal, srcA2, srcB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32_16x1);\n\ + VXC_DP16x1(fpVal, iVal, srcB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32B_16x1);\n\ + VXC_DP16x1(iVal, srcA2, srcB1, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32_16x1);\n\ + VXC_DP16x1(fpVal, iVal, srcB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32B_16x1);\n\ + VXC_DP16x1(iVal, srcA2, srcB2, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32_16x1);\n\ + VXC_DP16x1(fpVal, iVal, srcB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32B_16x1);\n\ + VXC_DP16x1(iVal, srcA2, srcB3, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32_16x1);\n\ + VXC_DP16x1(fpVal, iVal, srcB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32B_16x1);\n\ + sum2 = sum2 + fpVal * inout_scale + inout_beta;\n\ +\n\ + VXC_DP16x1(iVal, srcA3, srcB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32_16x1);\n\ + VXC_DP16x1(fpVal, iVal, srcB0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32B_16x1);\n\ + VXC_DP16x1(iVal, srcA3, srcB1, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32_16x1);\n\ + VXC_DP16x1(fpVal, iVal, srcB1, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32B_16x1);\n\ + VXC_DP16x1(iVal, srcA3, srcB2, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32_16x1);\n\ + VXC_DP16x1(fpVal, iVal, srcB2, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32B_16x1);\n\ + VXC_DP16x1(iVal, srcA3, srcB3, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32_16x1);\n\ + VXC_DP16x1(fpVal, iVal, srcB3, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0),\n\ + uniI16MulI16SumtoI32B_16x1);\n\ + sum3 = sum3 + fpVal * inout_scale + inout_beta;\n\ + }\n\ + vxc_int4 tmpOut0, tmpOut1;\n\ + vxc_short8 valDst;\n\ + tmpOut0 = convert_int4_rte(sum0);\n\ + tmpOut1 = convert_int4_rte(sum1);\n\ + VXC_DP2x8(valDst, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord_out, valDst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_WriteImage2DArray(output, coord_out, valDst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + tmpOut0 = convert_int4_rte(sum2);\n\ + tmpOut1 = convert_int4_rte(sum3);\n\ + VXC_DP2x8(valDst, tmpOut0, tmpOut1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord_out, valDst.s0123, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_WriteImage2DArray(output, coord_out, valDst.s4567, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ "; /* end of matrixmul_i16_vx*/ static const char matrixmul_transA_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -27810,6 +28028,94 @@ PRE_PROCESS_GRAY_COPY_8BITS(U8, vxc_uchar16)\n\ PRE_PROCESS_GRAY_COPY_8BITS(I8, vxc_char16)\n\ "; /* end of pre_process_gray_copy_vx*/ 
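For reference, the reworked instance_norm_* kernels earlier in this hunk no longer reduce mean/variance or apply eps/rsqrt per pixel; they read one per-channel coefficient pair from the new "means" image and apply a single fused multiply-add, norm = x * coef.x + coef.y. The short host-side sketch below shows how such a pair could be precomputed from the sums produced by the instance_norm_sums_* pass, assuming coef.x / coef.y are the usual affine form of instance normalization that the removed per-kernel code computed; the type and function names here (in_coef_t, precompute_in_coef) are illustrative only and not part of the library.

#include <math.h>

typedef struct { float alpha; float beta; } in_coef_t;  /* maps to coef.x / coef.y */

/* sum and sqsum are the per-channel accumulations from the sums pass; count is the
 * number of elements per channel (1.0f / count played the role of inv_multiplier). */
static in_coef_t precompute_in_coef(float sum, float sqsum, float count,
                                    float gamma, float bias, float eps)
{
    in_coef_t c;
    float mean = sum / count;
    float var  = sqsum / count - mean * mean;   /* E[x^2] - E[x]^2 */
    c.alpha = gamma / sqrtf(var + eps);         /* coef.x */
    c.beta  = bias - mean * c.alpha;            /* coef.y */
    return c;
}

Quantized variants would additionally fold input_scale/output_scale and the zero points into alpha and beta, as the deleted in-kernel code did before this refactor.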
+static const char pre_process_nv12_copy_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int bOrder;\n\ +_viv_uniform int rOrder;\n\ +\n\ +_viv_uniform float outputScaleVar;\n\ +_viv_uniform float bMeanScaleVarZp;\n\ +_viv_uniform float gMeanScaleVarZp;\n\ +_viv_uniform float rMeanScaleVarZp;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertNV12toB_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertNV12toG_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertNV12toR_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractUVtoCharSub128_2x8;\n\ +\n\ +#define NV12_COPY_SH_IMPL(name, dst_type, conv_type, save_type, copy_bytes) \\\n\ +__kernel void pre_process_nv12_copy_##name \\\n\ + ( \\\n\ + __read_only image2d_array_t y_img, \\\n\ + __read_only image2d_array_t uv_img, \\\n\ + __write_only image2d_array_t output, \\\n\ + global int* xRatio, \\\n\ + global int* yRatio, \\\n\ + global int* xOffset, \\\n\ + global int* yOffset, \\\n\ + float rMean, \\\n\ + float gMean, \\\n\ + float bMean, \\\n\ + float var, \\\n\ + int reverse_channel, \\\n\ + int trans \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + \\\n\ + int sy = gidy + (*yOffset); \\\n\ + int sx = gidx + (*xOffset); \\\n\ + int uvX = sx & 0xfffffffe; \\\n\ + int uvY = sy >> 1; \\\n\ + \\\n\ + vxc_uchar16 Y, UV; \\\n\ + \\\n\ + VXC_ReadImage(Y, y_img, (int2)(sx,sy), 0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(UV, uv_img,(int2)(uvX,uvY), 0,VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + vxc_char16 tmpUV; \\\n\ + short tmpVal = 128; \\\n\ + VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractUVtoCharSub128_2x8); \\\n\ + \\\n\ + float4 tmpDstB, tmpDstG, tmpDstR; \\\n\ + VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); \\\n\ + VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); \\\n\ + VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); \\\n\ + \\\n\ + conv_type result; \\\n\ + dst_type dst0; \\\n\ + save_type dst; \\\n\ + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \\\n\ + tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, tmpDstB); \\\n\ + dstPos.z = bOrder; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, tmpDstG); \\\n\ + dstPos.z = 1; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, tmpDstR); \\\n\ + dstPos.z = rOrder; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +NV12_COPY_SH_IMPL(U8toU8, vxc_uchar8, int4, vxc_uchar8, 8)\n\ +NV12_COPY_SH_IMPL(U8toI8, vxc_char8, int4, 
vxc_char8, 8)\n\ +NV12_COPY_SH_IMPL(U8toI16, vxc_short8, int4, vxc_short8, 16)\n\ +NV12_COPY_SH_IMPL(U8toF16, vxc_half8, half4, vxc_short8, 16)\n\ +"; /* end of pre_process_nv12_copy_vx*/ + static const char pre_process_nv12_scale_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform int bOrder;\n\ @@ -27820,363 +28126,6 @@ _viv_uniform float bMeanScaleVarZp;\n\ _viv_uniform float gMeanScaleVarZp;\n\ _viv_uniform float rMeanScaleVarZp;\n\ \n\ -_viv_uniform uint xrIntFloat_16;\n\ -_viv_uniform uint yrIntFloat_16;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertNV12toB_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertNV12toG_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertNV12toR_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -_viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8;\n\ -_viv_uniform VXC_512Bits uniConvertUVtoCharSub128_2x8;\n\ -\n\ -__kernel void pre_process_nv12_scale_U8toI16(\n\ - __read_only image2d_t y_img, __read_only image2d_t uv_img,\n\ - __write_only image2d_array_t output,\n\ - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\ - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)\n\ -{\n\ - uint4 gidx = get_global_id(0);\n\ - uint gidy = get_global_id(1);\n\ - gidx += (uint4)(0, 1, 2, 3);\n\ -\n\ - uint dy = (gidy * yrIntFloat_16) >> 16;\n\ - uint4 dx = (gidx * xrIntFloat_16) >> 16;\n\ - int sy = convert_int(dy) + (*yOffset);\n\ - int4 sx = convert_int4(dx) + (*xOffset);\n\ - int4 uvX = sx & 0xfffffffe;\n\ - int uvY = sy >> 1;\n\ -\n\ - vxc_uchar16 Y, UV;\n\ - int2 coord = (int2)(sx.x, sy);\n\ - int2 coord_uv = (int2)(uvX.x, uvY);\n\ -\n\ - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord.x = sx.y;\n\ - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord.x = sx.z;\n\ - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - coord.x = sx.w;\n\ - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_uv.x = uvX.y;\n\ - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ - coord_uv.x = uvX.z;\n\ - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ - coord_uv.x = uvX.w;\n\ - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - vxc_char16 tmpUV;\n\ - short tmpVal = 128;\n\ - VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8);\n\ -\n\ - float4 tmpDstB, tmpDstG, tmpDstR;\n\ - VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4);\n\ - VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4);\n\ - VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4);\n\ -\n\ - int4 result;\n\ - vxc_short8 dst;\n\ - int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0);\n\ - result = convert_int4_rte(tmpDstB * outputScaleVar + bMeanScaleVarZp);\n\ - dstPos.z = bOrder;\n\ - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - result = convert_int4_rte(tmpDstG * outputScaleVar + gMeanScaleVarZp);\n\ - dstPos.z = 1;\n\ - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - result = convert_int4_rte(tmpDstR * outputScaleVar + rMeanScaleVarZp);\n\ - dstPos.z = rOrder;\n\ - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pre_process_nv12_scale_U8toF16(\n\ - __read_only image2d_t y_img, __read_only image2d_t uv_img,\n\ - __write_only image2d_array_t output,\n\ - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\ - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)\n\ -{\n\ - uint4 gidx = get_global_id(0);\n\ - uint gidy = get_global_id(1);\n\ - gidx += (uint4)(0, 1, 2, 3);\n\ -\n\ - uint dy = (gidy * yrIntFloat_16) >> 16;\n\ - uint4 dx = (gidx * xrIntFloat_16) >> 16;\n\ - int sy = convert_int(dy) + (*yOffset);\n\ - int4 sx = convert_int4(dx) + (*xOffset);\n\ - int4 uvX = sx & 0xfffffffe;\n\ - int uvY = sy >> 1;\n\ -\n\ - vxc_uchar16 Y, UV;\n\ - int2 coord = (int2)(sx.x, sy);\n\ - int2 coord_uv = (int2)(uvX.x, uvY);\n\ -\n\ - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord.x = sx.y;\n\ - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord.x = sx.z;\n\ - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - coord.x = sx.w;\n\ - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_uv.x = uvX.y;\n\ - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ - coord_uv.x = uvX.z;\n\ - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ - coord_uv.x = uvX.w;\n\ - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - vxc_char16 tmpUV;\n\ - short tmpVal = 128;\n\ - VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8);\n\ -\n\ - float4 tmpDstB, tmpDstG, tmpDstR;\n\ - VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4);\n\ - VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4);\n\ - VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4);\n\ -\n\ - tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp;\n\ - tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp;\n\ - tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp;\n\ -\n\ - half4 result;\n\ - vxc_half8 tmpdst;\n\ - vxc_short8 dst;\n\ - int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0);\n\ - _viv_asm(CONV, result, tmpDstB);\n\ - dstPos.z = bOrder;\n\ - VXC_DP2x8(tmpdst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ - _viv_asm(COPY, dst, tmpdst, 16);\n\ - VXC_WriteImage2DArray(output, 
dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - _viv_asm(CONV, result, tmpDstG);\n\ - dstPos.z = 1;\n\ - VXC_DP2x8(tmpdst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ - _viv_asm(COPY, dst, tmpdst, 16);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - _viv_asm(CONV, result, tmpDstR);\n\ - dstPos.z = rOrder;\n\ - VXC_DP2x8(tmpdst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ - _viv_asm(COPY, dst, tmpdst, 16);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -}"; /* end of pre_process_nv12_scale_vx*/ - -static const char pre_process_nv12_scale_8bits_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform int bOrder;\n\ -_viv_uniform int rOrder;\n\ -\n\ -_viv_uniform float outputScaleVar;\n\ -_viv_uniform float bMeanScaleVarZp;\n\ -_viv_uniform float gMeanScaleVarZp;\n\ -_viv_uniform float rMeanScaleVarZp;\n\ -\n\ -_viv_uniform uint xrIntFloat_16;\n\ -_viv_uniform uint yrIntFloat_16;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertNV12toB_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertNV12toG_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertNV12toR_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -_viv_uniform VXC_512Bits uniConvertUVtoCharSub128_2x8;\n\ -_viv_uniform VXC_512Bits uniExtractUVtoCharSub128_2x8;\n\ -\n\ -__kernel void pre_process_nv12_scale_U8toU8(\n\ - __read_only image2d_t y_img, __read_only image2d_t uv_img,\n\ - __write_only image2d_array_t output,\n\ - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\ - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)\n\ -{\n\ - uint4 gidx = get_global_id(0);\n\ - uint gidy = get_global_id(1);\n\ - gidx += (uint4)(0, 1, 2, 3);\n\ -\n\ - uint dy = (gidy * yrIntFloat_16) >> 16;\n\ - uint4 dx = (gidx * xrIntFloat_16) >> 16;\n\ - int sy = convert_int(dy) + (*yOffset);\n\ - int4 sx = convert_int4(dx) + (*xOffset);\n\ - int4 uvX = sx & 0xfffffffe;\n\ - int uvY = sy >> 1;\n\ -\n\ - vxc_uchar16 Y, UV;\n\ - int2 coord = (int2)(sx.x, sy);\n\ - int2 coord_uv = (int2)(uvX.x, uvY);\n\ -\n\ - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord.x = sx.y;\n\ - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord.x = sx.z;\n\ - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - coord.x = sx.w;\n\ - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_uv.x = uvX.y;\n\ - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ - coord_uv.x = uvX.z;\n\ - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ - coord_uv.x = uvX.w;\n\ - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - vxc_char16 tmpUV;\n\ - short tmpVal = 128;\n\ - VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8);\n\ -\n\ - float4 tmpDstB, tmpDstG, tmpDstR;\n\ - VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniConvertNV12toB_4x4);\n\ - VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4);\n\ - VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4);\n\ -\n\ - int4 result;\n\ - vxc_uchar8 dst;\n\ - int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0);\n\ - result = convert_int4_rte(tmpDstB * outputScaleVar + bMeanScaleVarZp);\n\ - dstPos.z = bOrder;\n\ - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - result = convert_int4_rte(tmpDstG * outputScaleVar + gMeanScaleVarZp);\n\ - dstPos.z = 1;\n\ - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - result = convert_int4_rte(tmpDstR * outputScaleVar + rMeanScaleVarZp);\n\ - dstPos.z = rOrder;\n\ - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pre_process_nv12_copy_U8toU8(\n\ - __read_only image2d_t y_img, __read_only image2d_t uv_img,\n\ - __write_only image2d_array_t output,\n\ - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\ - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)\n\ -{\n\ - int gidx = get_global_id(0);\n\ - int gidy = get_global_id(1);\n\ -\n\ - int sy = gidy + (*yOffset);\n\ - int sx = gidx + (*xOffset);\n\ - int uvX = sx & 0xfffffffe;\n\ - int uvY = sy >> 1;\n\ -\n\ - vxc_uchar16 Y, UV;\n\ -\n\ - VXC_ReadImage(Y, y_img, (int2)(sx,sy), VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(UV, uv_img,(int2)(uvX,uvY), VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - vxc_char16 tmpUV;\n\ - short tmpVal = 128;\n\ - VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractUVtoCharSub128_2x8);\n\ -\n\ - float4 tmpDstB, tmpDstG, tmpDstR;\n\ - VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4);\n\ - VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4);\n\ - VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4);\n\ -\n\ - int4 result;\n\ - vxc_uchar8 dst;\n\ - int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0);\n\ - result = convert_int4_rte(tmpDstB * outputScaleVar + bMeanScaleVarZp);\n\ - dstPos.z = bOrder;\n\ - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - result = convert_int4_rte(tmpDstG * outputScaleVar + gMeanScaleVarZp);\n\ - dstPos.z = 1;\n\ - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - result = convert_int4_rte(tmpDstR * outputScaleVar + rMeanScaleVarZp);\n\ - dstPos.z = rOrder;\n\ - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -__kernel void pre_process_nv12_scale_U8toI8(\n\ - __read_only image2d_t y_img, __read_only image2d_t uv_img,\n\ - __write_only image2d_array_t output,\n\ - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\ - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)\n\ -{\n\ - uint4 gidx = get_global_id(0);\n\ - uint gidy = get_global_id(1);\n\ - gidx += (uint4)(0, 1, 2, 3);\n\ -\n\ - uint dy = (gidy * yrIntFloat_16) >> 16;\n\ - uint4 dx = (gidx * xrIntFloat_16) >> 16;\n\ - int sy = convert_int(dy) + (*yOffset);\n\ - int4 sx = convert_int4(dx) + (*xOffset);\n\ - int4 uvX = sx & 0xfffffffe;\n\ - int uvY = sy >> 1;\n\ -\n\ - vxc_uchar16 Y, UV;\n\ - int2 coord = (int2)(sx.x, sy);\n\ - int2 coord_uv = (int2)(uvX.x, uvY);\n\ -\n\ - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord.x = sx.y;\n\ - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord.x = sx.z;\n\ - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - coord.x = sx.w;\n\ - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_uv.x = uvX.y;\n\ - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ - coord_uv.x = uvX.z;\n\ - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ - coord_uv.x = uvX.w;\n\ - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - vxc_char16 tmpUV;\n\ - short tmpVal = 128;\n\ - VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8);\n\ -\n\ - float4 tmpDstB, tmpDstG, tmpDstR;\n\ - VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4);\n\ - VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4);\n\ - VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4);\n\ -\n\ - int4 result;\n\ - vxc_char8 dst;\n\ - int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0);\n\ - result = convert_int4_rte(tmpDstB * outputScaleVar + bMeanScaleVarZp);\n\ - dstPos.z = bOrder;\n\ - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - result = convert_int4_rte(tmpDstG * outputScaleVar + gMeanScaleVarZp);\n\ - dstPos.z = 1;\n\ - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - result = convert_int4_rte(tmpDstR * outputScaleVar + rMeanScaleVarZp);\n\ - dstPos.z = rOrder;\n\ - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -}"; /* end of pre_process_nv12_scale_8bits_vx*/ - -static const char pre_process_nv12_scale_mix_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ 
-_viv_uniform int bOrder;\n\ -_viv_uniform int rOrder;\n\ -\n\ -_viv_uniform float outputScaleVar;\n\ -_viv_uniform float bMeanScaleVarZp;\n\ -_viv_uniform float gMeanScaleVarZp;\n\ -_viv_uniform float rMeanScaleVarZp;\n\ -\n\ _viv_uniform uint xrIntFloat_16;\n\ _viv_uniform uint yrIntFloat_16;\n\ \n\ @@ -28186,149 +28135,190 @@ _viv_uniform VXC_512Bits uniConvertNV12toR_4x4;\n\ \n\ _viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8;\n\ \n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ _viv_uniform VXC_512Bits uniConvertUVtoCharSub128_2x8;\n\ \n\ _viv_uniform VXC_512Bits uniCalculateYShift_2x8;\n\ _viv_uniform VXC_512Bits uniCalculateUVShift_2x8;\n\ \n\ -__kernel void pre_process_nv12_scale_U8toU8_gq(\n\ - __read_only image2d_t y_img, __read_only image2d_t uv_img,\n\ - __write_only image2d_array_t output,\n\ - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\ - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)\n\ -{\n\ - uint4 gidx = get_global_id(0);\n\ - uint gidy = get_global_id(1);\n\ - gidx += (uint4)(0, 1, 2, 3);\n\ -\n\ - uint dy = (gidy * yrIntFloat_16) >> 16;\n\ - uint4 dx = (gidx * xrIntFloat_16) >> 16;\n\ - int sy = convert_int(dy) + (*yOffset);\n\ - int4 sx = convert_int4(dx) + (*xOffset);\n\ - int4 uvX = sx & 0xfffffffe;\n\ - int uvY = sy >> 1;\n\ -\n\ - vxc_uchar16 Y, UV;\n\ - int2 coord = (int2)(sx.x, sy);\n\ - int2 coord_uv = (int2)(uvX.x, uvY);\n\ - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};\n\ - vxc_uchar16 maskShiftUv = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};\n\ - int4 offsetUV = uvX - uvX.x;\n\ -\n\ - vxc_ushort8 diffY, diffUV;\n\ - _viv_asm(COPY, diffY, sx, 16);\n\ - _viv_asm(COPY, diffUV, offsetUV, 16);\n\ -\n\ - vxc_ushort8 constData = 8;\n\ - VXC_DP2x8(maskShift, diffY, constData, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniCalculateYShift_2x8);\n\ - VXC_DP2x8(maskShiftUv, diffUV, constData, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniCalculateUVShift_2x8);\n\ - VXC_BitExtract(Y, Y, Y, maskShift, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_BitExtract(UV, UV, UV, maskShiftUv, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - vxc_char16 tmpUV;\n\ - short tmpVal = 128;\n\ - VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8);\n\ -\n\ - float4 tmpDstB, tmpDstG, tmpDstR;\n\ - VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4);\n\ - VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4);\n\ - VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4);\n\ -\n\ - int4 result;\n\ - vxc_uchar8 dst;\n\ - int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0);\n\ - result = convert_int4_rte(tmpDstB * outputScaleVar + bMeanScaleVarZp);\n\ - dstPos.z = bOrder;\n\ - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - result = convert_int4_rte(tmpDstG * outputScaleVar + gMeanScaleVarZp);\n\ - dstPos.z = 1;\n\ - VXC_DP2x8(dst, result, result, 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - result = convert_int4_rte(tmpDstR * outputScaleVar + rMeanScaleVarZp);\n\ - dstPos.z = rOrder;\n\ - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +#define NV12_OPT_SH_IMPL(name, dst_type, conv_type, save_type, copy_bytes) \\\n\ +__kernel void pre_process_nv12_scale_##name##_gq \\\n\ + ( \\\n\ + __read_only image2d_array_t y_img, \\\n\ + __read_only image2d_array_t uv_img, \\\n\ + __write_only image2d_array_t output, \\\n\ + global int* xRatio, \\\n\ + global int* yRatio, \\\n\ + global int* xOffset, \\\n\ + global int* yOffset, \\\n\ + float rMean, \\\n\ + float gMean, \\\n\ + float bMean, \\\n\ + float var, \\\n\ + int reverse_channel, \\\n\ + int trans \\\n\ + ) \\\n\ +{ \\\n\ + uint4 gidx = get_global_id(0); \\\n\ + uint gidy = get_global_id(1); \\\n\ + gidx += (uint4)(0, 1, 2, 3); \\\n\ + \\\n\ + uint dy = (gidy * yrIntFloat_16) >> 16; \\\n\ + uint4 dx = (gidx * xrIntFloat_16) >> 16; \\\n\ + int sy = convert_int(dy) + (*yOffset); \\\n\ + int4 sx = convert_int4(dx) + (*xOffset); \\\n\ + int4 uvX = sx & 0xfffffffe; \\\n\ + int uvY = sy >> 1; \\\n\ + \\\n\ + vxc_uchar16 Y, UV; \\\n\ + int2 coord = (int2)(sx.x, sy); \\\n\ + int2 coord_uv = (int2)(uvX.x, uvY); \\\n\ + VXC_ReadImage(Y, y_img, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; \\\n\ + vxc_uchar16 maskShiftUv = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8}; \\\n\ + int4 offsetUV = uvX - uvX.x; \\\n\ + \\\n\ + vxc_ushort8 diffY, diffUV; \\\n\ + _viv_asm(COPY, diffY, sx, 16); \\\n\ + _viv_asm(COPY, diffUV, offsetUV, 16); \\\n\ + \\\n\ + vxc_ushort8 constData = 8; \\\n\ + VXC_DP2x8(maskShift, diffY, constData, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), \\\n\ + uniCalculateYShift_2x8); \\\n\ + VXC_DP2x8(maskShiftUv, diffUV, constData, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniCalculateUVShift_2x8); \\\n\ + VXC_BitExtract(Y, Y, Y, maskShift, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_BitExtract(UV, UV, UV, maskShiftUv, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + vxc_char16 tmpUV; \\\n\ + short tmpVal = 128; \\\n\ + VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); \\\n\ + \\\n\ + float4 tmpDstB, tmpDstG, tmpDstR; \\\n\ + VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); \\\n\ + VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); \\\n\ + VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); \\\n\ + \\\n\ + conv_type result; \\\n\ + dst_type dst0; \\\n\ + save_type dst; \\\n\ + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \\\n\ + tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, tmpDstB); \\\n\ + dstPos.z = bOrder; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 
0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, tmpDstG); \\\n\ + dstPos.z = 1; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, tmpDstR); \\\n\ + dstPos.z = rOrder; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ +NV12_OPT_SH_IMPL(U8toU8, vxc_uchar8, int4, vxc_uchar8, 8)\n\ +NV12_OPT_SH_IMPL(U8toI8, vxc_char8, int4, vxc_char8, 8)\n\ +NV12_OPT_SH_IMPL(U8toI16, vxc_short8, int4, vxc_short8, 16)\n\ +NV12_OPT_SH_IMPL(U8toF16, vxc_half8, half4, vxc_short8, 16)\n\ \n\ -__kernel void pre_process_nv12_scale_U8toF16_gq(\n\ - __read_only image2d_t y_img, __read_only image2d_t uv_img,\n\ - __write_only image2d_array_t output,\n\ - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\ - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)\n\ -{\n\ - uint4 gidx = get_global_id(0);\n\ - uint gidy = get_global_id(1);\n\ - gidx += (uint4)(0, 1, 2, 3);\n\ -\n\ - uint dy = (gidy * yrIntFloat_16) >> 16;\n\ - uint4 dx = (gidx * xrIntFloat_16) >> 16;\n\ - int sy = convert_int(dy) + (*yOffset);\n\ - int4 sx = convert_int4(dx) + (*xOffset);\n\ - int4 uvX = sx & 0xfffffffe;\n\ - int uvY = sy >> 1;\n\ -\n\ - vxc_uchar16 Y, UV;\n\ - int2 coord = (int2)(sx.x, sy);\n\ - int2 coord_uv = (int2)(uvX.x, uvY);\n\ - VXC_ReadImage(Y, y_img, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(UV, uv_img,coord_uv, VXC_5BITOFFSET_XY(0,0),VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - vxc_uchar16 maskShift = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};\n\ - vxc_uchar16 maskShiftUv = {8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};\n\ - int4 offsetUV = uvX - uvX.x;\n\ -\n\ - vxc_ushort8 diffY, diffUV;\n\ - _viv_asm(COPY, diffY, sx, 16);\n\ - _viv_asm(COPY, diffUV, offsetUV, 16);\n\ -\n\ - vxc_ushort8 constData = 8;\n\ - VXC_DP2x8(maskShift, diffY, constData, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniCalculateYShift_2x8);\n\ - VXC_DP2x8(maskShiftUv, diffUV, constData, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniCalculateUVShift_2x8);\n\ - VXC_BitExtract(Y, Y, Y, maskShift, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_BitExtract(UV, UV, UV, maskShiftUv, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - vxc_char16 tmpUV;\n\ - short tmpVal = 128;\n\ - VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8);\n\ -\n\ - float4 tmpDstB, tmpDstG, tmpDstR;\n\ - VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4);\n\ - VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4);\n\ - VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4);\n\ -\n\ - tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp;\n\ - tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp;\n\ - tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp;\n\ -\n\ - 
half4 result;\n\ - vxc_half8 tmpdst;\n\ - vxc_short8 dst;\n\ - int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0);\n\ - _viv_asm(CONV, result, tmpDstB);\n\ - dstPos.z = bOrder;\n\ - VXC_DP2x8(tmpdst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ - _viv_asm(COPY, dst, tmpdst, 16);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - _viv_asm(CONV, result, tmpDstG);\n\ - dstPos.z = 1;\n\ - VXC_DP2x8(tmpdst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ - _viv_asm(COPY, dst, tmpdst, 16);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - _viv_asm(CONV, result, tmpDstR);\n\ - dstPos.z = rOrder;\n\ - VXC_DP2x8(tmpdst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ - _viv_asm(COPY, dst, tmpdst, 16);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -}"; /* end of pre_process_nv12_scale_mix_vx*/ +#define NV12_SH_IMPL(name, dst_type, conv_type, save_type, copy_bytes) \\\n\ +__kernel void pre_process_nv12_scale_##name \\\n\ + ( \\\n\ + __read_only image2d_array_t y_img, \\\n\ + __read_only image2d_array_t uv_img, \\\n\ + __write_only image2d_array_t output, \\\n\ + global int* xRatio, \\\n\ + global int* yRatio, \\\n\ + global int* xOffset, \\\n\ + global int* yOffset, \\\n\ + float rMean, \\\n\ + float gMean, \\\n\ + float bMean, \\\n\ + float var, \\\n\ + int reverse_channel, \\\n\ + int trans \\\n\ + ) \\\n\ +{ \\\n\ + uint4 gidx = get_global_id(0); \\\n\ + uint gidy = get_global_id(1); \\\n\ + gidx += (uint4)(0, 1, 2, 3); \\\n\ + \\\n\ + uint dy = (gidy * yrIntFloat_16) >> 16; \\\n\ + uint4 dx = (gidx * xrIntFloat_16) >> 16; \\\n\ + int sy = convert_int(dy) + (*yOffset); \\\n\ + int4 sx = convert_int4(dx) + (*xOffset); \\\n\ + int4 uvX = sx & 0xfffffffe; \\\n\ + int uvY = sy >> 1; \\\n\ + \\\n\ + vxc_uchar16 Y, UV; \\\n\ + int2 coord = (int2)(sx.x, sy); \\\n\ + int2 coord_uv = (int2)(uvX.x, uvY); \\\n\ + \\\n\ + VXC_ReadImage(Y, y_img, coord, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x = sx.y; \\\n\ + VXC_ReadImage(Y, y_img, coord, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x = sx.z; \\\n\ + VXC_ReadImage(Y, y_img, coord, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.x = sx.w; \\\n\ + VXC_ReadImage(Y, y_img, coord, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_uv.x = uvX.y; \\\n\ + VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_uv.x = uvX.z; \\\n\ + VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_uv.x = uvX.w; \\\n\ + VXC_ReadImage(UV, uv_img,coord_uv, 0,VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + vxc_char16 tmpUV; \\\n\ + short tmpVal = 128; \\\n\ + VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertUVtoCharSub128_2x8); \\\n\ + \\\n\ + float4 tmpDstB, tmpDstG, tmpDstR; \\\n\ + VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toB_4x4); \\\n\ + VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toG_4x4); \\\n\ + VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertNV12toR_4x4); \\\n\ + \\\n\ + 
conv_type result; \\\n\ + dst_type dst0; \\\n\ + save_type dst; \\\n\ + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \\\n\ + tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, tmpDstB); \\\n\ + dstPos.z = bOrder; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, tmpDstG); \\\n\ + dstPos.z = 1; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, tmpDstR); \\\n\ + dstPos.z = rOrder; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +NV12_SH_IMPL(U8toU8, vxc_uchar8, int4, vxc_uchar8, 8)\n\ +NV12_SH_IMPL(U8toI8, vxc_char8, int4, vxc_char8, 8)\n\ +NV12_SH_IMPL(U8toI16, vxc_short8, int4, vxc_short8, 16)\n\ +NV12_SH_IMPL(U8toF16, vxc_half8, half4, vxc_short8, 16)\n\ +"; /* end of pre_process_nv12_scale_vx*/ static const char pre_process_rgb_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -29947,7 +29937,7 @@ IMAGE_PRE_PROCESS_COPY_8BITS(U8, vxc_uchar16)\n\ IMAGE_PRE_PROCESS_COPY_8BITS(I8, vxc_char16)\n\ "; /* end of pre_process_rgb_copy_vx*/ -static const char pre_process_yuv420_copy_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +static const char pre_process_yuv420_copy_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniCalculateTmpR1st_4x4;\n\ _viv_uniform VXC_512Bits uniCalculateTmpR2nd_4x4;\n\ @@ -29981,1131 +29971,921 @@ _viv_uniform VXC_512Bits uniQuantU8toU8HiR_2x8;\n\ \n\ _viv_uniform int bOrder;\n\ _viv_uniform int rOrder;\n\ -_viv_uniform int zp;\n\ -_viv_uniform float outputScale;\n\ +_viv_uniform float output_zp;\n\ +_viv_uniform float output_scale;\n\ \n\ -__kernel void pre_process_yuv420_copy_U8toU8(\n\ - __read_only image2d_t y_img,\n\ - __read_only image2d_t u_img,\n\ - __read_only image2d_t v_img,\n\ - __write_only image2d_array_t output,\n\ - global int * xRatio,\n\ - global int * yRatio,\n\ - global int * xOffset,\n\ - global int * yOffset,\n\ - float rMean,\n\ - float gMean,\n\ - float bMean,\n\ - float var,\n\ - int reverse_channel,\n\ - int trans\n\ - )\n\ -{\n\ - int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0);\n\ - int2 pos1 = (int2)((get_global_id(0) + (*xOffset)) >> 1, (get_global_id(1) + (*yOffset)) >> 1);\n\ - vxc_uchar16 Y;\n\ - vxc_uchar8 U, V;\n\ - vxc_int4 C0, C1, C2, C3;\n\ - vxc_uchar16 R, G, B;\n\ - vxc_uchar16 dst0, dst1, dst2;\n\ +#define YUV420_COPY_SH_IMPL(name, dst_type) \\\n\ +__kernel void pre_process_yuv420_copy_##name \\\n\ + ( \\\n\ + __read_only image2d_array_t y_img, \\\n\ + __read_only image2d_array_t u_img, \\\n\ + __read_only image2d_array_t v_img, \\\n\ + __write_only image2d_array_t output, \\\n\ + global int * xRatio, \\\n\ + global int * yRatio, \\\n\ + global int * xOffset, \\\n\ + global int * yOffset, \\\n\ + float rMean, \\\n\ + float 
gMean, \\\n\ + float bMean, \\\n\ + float var, \\\n\ + int reverse_channel, \\\n\ + int trans \\\n\ + ) \\\n\ +{ \\\n\ + int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0); \\\n\ + int2 pos1 = (int2)((get_global_id(0) + (*xOffset)) >> 1, (get_global_id(1) + (*yOffset)) >> 1); \\\n\ + vxc_uchar16 Y; \\\n\ + vxc_uchar8 U, V; \\\n\ + vxc_int4 C0, C1, C2, C3; \\\n\ + vxc_uchar16 R, G, B; \\\n\ + dst_type dst0, dst1, dst2; \\\n\ + \\\n\ + VXC_ReadImage(Y, y_img, pos.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(U, u_img, pos1.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, pos1.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + /*C = Y - 16;*/ \\\n\ + /*D = U - 128;*/ \\\n\ + /*E = V - 128;*/ \\\n\ + /* calculate R*/ \\\n\ + /* ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8]*/ \\\n\ + int tmpV = -56992; \\\n\ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR1st_4x4); \\\n\ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR2nd_4x4); \\\n\ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR3rd_4x4); \\\n\ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR4th_4x4); \\\n\ + \\\n\ + VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \\\n\ + VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \\\n\ + VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \\\n\ + VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \\\n\ + \\\n\ + /* calculate G*/ \\\n\ + /* ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8]*/ \\\n\ + /* 298Y - 208V*/ \\\n\ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG1st_4x4); \\\n\ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG2nd_4x4); \\\n\ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG3rd_4x4); \\\n\ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG4th_4x4); \\\n\ + /* 34784 - 100U*/ \\\n\ + ushort tmpG = 34784; \\\n\ + vxc_ushort8 tmpDstG; \\\n\ + VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); \\\n\ + VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4); \\\n\ + VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4); \\\n\ + VXC_DP4x4(G, C2, tmpDstG, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateG3rd_4x4); \\\n\ + VXC_DP4x4(G, C3, tmpDstG, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateG4th_4x4); \\\n\ + \\\n\ + /* calculate B*/ \\\n\ + /* ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8]*/ \\\n\ + VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB1st_4x4); \\\n\ + VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB2nd_4x4); \\\n\ + VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB3rd_4x4); \\\n\ + VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB4th_4x4); \\\n\ + tmpV = -70688; \\\n\ + VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), 
uniCalculateB1st_4x4); \\\n\ + VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ + VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ + VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ + \\\n\ + var *= output_scale; \\\n\ + float4 paramData = (float4)(bMean * var - output_zp, gMean * var - output_zp, \\\n\ + rMean * var - output_zp, var); \\\n\ + half4 paramData_f16; \\\n\ + _viv_asm(CONV, paramData_f16, paramData); \\\n\ + \\\n\ + VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8); \\\n\ + VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8); \\\n\ + \\\n\ + VXC_DP2x8(dst1, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8); \\\n\ + VXC_DP2x8(dst1, G, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8); \\\n\ + \\\n\ + VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8); \\\n\ + VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8); \\\n\ + \\\n\ + pos = (int4)(get_global_id(0), get_global_id(1), 0, 0); \\\n\ + pos.z = bOrder; \\\n\ + VXC_WriteImage2DArray(output, pos, dst0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + pos.z = 1; \\\n\ + VXC_WriteImage2DArray(output, pos, dst1, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + pos.z = rOrder; \\\n\ + VXC_WriteImage2DArray(output, pos, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +YUV420_COPY_SH_IMPL(U8toU8, vxc_uchar16)\n\ +YUV420_COPY_SH_IMPL(U8toI8, vxc_char16)\n\ \n\ - VXC_ReadImage(Y, y_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(U, u_img, pos1.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, pos1.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +#define YUV420_COPY_16BITS_SH_IMPL(name, dst_type) \\\n\ +__kernel void pre_process_yuv420_copy_##name \\\n\ + ( \\\n\ + __read_only image2d_array_t y_img, \\\n\ + __read_only image2d_array_t u_img, \\\n\ + __read_only image2d_array_t v_img, \\\n\ + __write_only image2d_array_t output, \\\n\ + global int * xRatio, \\\n\ + global int * yRatio, \\\n\ + global int * xOffset, \\\n\ + global int * yOffset, \\\n\ + float rMean, \\\n\ + float gMean, \\\n\ + float bMean, \\\n\ + float var, \\\n\ + int reverse_channel, \\\n\ + int trans \\\n\ + ) \\\n\ +{ \\\n\ + int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0); \\\n\ + int2 pos1 = (int2)((get_global_id(0) + (*xOffset)) >> 1, (get_global_id(1) + (*yOffset)) >> 1); \\\n\ + vxc_uchar16 Y; \\\n\ + vxc_uchar8 U, V; \\\n\ + vxc_int4 C0, C1, C2, C3; \\\n\ + vxc_uchar16 R, G, B; \\\n\ + dst_type dst0, dst1, dst2, dst3, dst4, dst5; \\\n\ + vxc_short8 out0, out1, out2, out3, out4, out5; \\\n\ + \\\n\ + VXC_ReadImage(Y, y_img, pos.xy, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(U, u_img, pos1.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, pos1.xy, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + int tmpV = -56992; \\\n\ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR1st_4x4); \\\n\ + VXC_DP4x4(C1, Y, V, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR2nd_4x4); \\\n\ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR3rd_4x4); \\\n\ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR4th_4x4); \\\n\ + \\\n\ + VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \\\n\ + VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \\\n\ + VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \\\n\ + VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \\\n\ + \\\n\ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG1st_4x4); \\\n\ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG2nd_4x4); \\\n\ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG3rd_4x4); \\\n\ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG4th_4x4); \\\n\ + \\\n\ + ushort tmpG = 34784; \\\n\ + vxc_ushort8 tmpDstG; \\\n\ + VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); \\\n\ + VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4); \\\n\ + VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4); \\\n\ + VXC_DP4x4(G, C2, tmpDstG, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateG3rd_4x4); \\\n\ + VXC_DP4x4(G, C3, tmpDstG, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateG4th_4x4); \\\n\ + \\\n\ + VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB1st_4x4); \\\n\ + VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB2nd_4x4); \\\n\ + VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB3rd_4x4); \\\n\ + VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB4th_4x4); \\\n\ + tmpV = -70688; \\\n\ + VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ + VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ + VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ + VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ + \\\n\ + var *= output_scale; \\\n\ + float4 paramData = (float4)(bMean * var - output_zp, gMean * var - output_zp, \\\n\ + rMean * var - output_zp, var); \\\n\ + half4 paramData_f16; \\\n\ + _viv_asm(CONV, paramData_f16, paramData); \\\n\ + \\\n\ + VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8); \\\n\ + VXC_DP2x8(dst1, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8); \\\n\ + \\\n\ + VXC_DP2x8(dst2, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8); \\\n\ + VXC_DP2x8(dst3, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8); \\\n\ + \\\n\ + VXC_DP2x8(dst4, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8); \\\n\ + VXC_DP2x8(dst5, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8); \\\n\ + \\\n\ + _viv_asm(COPY, out0, dst0, 16); \\\n\ + _viv_asm(COPY, out1, 
dst1, 16); \\\n\ + _viv_asm(COPY, out2, dst2, 16); \\\n\ + _viv_asm(COPY, out3, dst3, 16); \\\n\ + _viv_asm(COPY, out4, dst4, 16); \\\n\ + _viv_asm(COPY, out5, dst5, 16); \\\n\ + \\\n\ + pos = (int4)(get_global_id(0), get_global_id(1), bOrder, get_global_id(0) + 8); \\\n\ + VXC_WriteImage2DArray(output, pos.xyzz, out0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage2DArray(output, pos.wyzz, out1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + pos.z = 1; \\\n\ + VXC_WriteImage2DArray(output, pos.xyzz, out2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage2DArray(output, pos.wyzz, out3, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + pos.z = rOrder; \\\n\ + VXC_WriteImage2DArray(output, pos.xyzz, out4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage2DArray(output, pos.wyzz, out5, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +YUV420_COPY_16BITS_SH_IMPL(U8toF16, vxc_half8)\n\ +YUV420_COPY_16BITS_SH_IMPL(U8toI16, vxc_short8)\n\ +"; /* end of pre_process_yuv420_copy_vx*/ + +static const char pre_process_yuv420_scale_0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ - //C = Y - 16;\n\ - //D = U - 128;\n\ - //E = V - 128;\n\ - // calculate R\n\ - // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8]\n\ - int tmpV = -56992;\n\ - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR1st_4x4);\n\ - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR2nd_4x4);\n\ - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR3rd_4x4);\n\ - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR4th_4x4);\n\ +_viv_uniform VXC_512Bits uniCalculateR1st_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8;\n\ \n\ - VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ +_viv_uniform VXC_512Bits uniCalculateB1st_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uniDescaleU8_4x4;\n\ \n\ - // calculate G\n\ - // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8]\n\ - // 298Y - 208V\n\ - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG1st_4x4);\n\ - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG2nd_4x4);\n\ - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG3rd_4x4);\n\ - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG4th_4x4);\n\ - // 34784 - 100U\n\ - ushort tmpG = 34784;\n\ - vxc_ushort8 tmpDstG;\n\ - VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8);\n\ - VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4);\n\ - VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4);\n\ - VXC_DP4x4(G, C2, tmpDstG, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateG3rd_4x4);\n\ - VXC_DP4x4(G, C3, tmpDstG, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateG4th_4x4);\n\ +_viv_uniform 
VXC_512Bits uniCalculateTmpRWise_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateGWise_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4;\n\ \n\ - // calculate B\n\ - // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8]\n\ - VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB1st_4x4);\n\ - VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB2nd_4x4);\n\ - VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB3rd_4x4);\n\ - VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB4th_4x4);\n\ - tmpV = -70688;\n\ - VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ +_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4;\n\ \n\ - var *= outputScale;\n\ - float4 paramData = (float4)(bMean * var - zp, gMean * var - zp,\\\n\ - rMean * var - zp, var);\n\ - half4 paramData_f16;\n\ - _viv_asm(CONV, paramData_f16, paramData);\n\ +_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4;\n\ \n\ - VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8);\n\ - VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8);\n\ +_viv_uniform int bOrder;\n\ +_viv_uniform int rOrder;\n\ +_viv_uniform float output_zp;\n\ +_viv_uniform float output_scale;\n\ \n\ - VXC_DP2x8(dst1, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8);\n\ - VXC_DP2x8(dst1, G, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8);\n\ +#define YUV420_SCALE_8BITS_SH_IMPL(name, dst_type) \\\n\ +__kernel void pre_process_yuv420_scale_##name \\\n\ + ( \\\n\ + __read_only image2d_array_t y_img, \\\n\ + __read_only image2d_array_t u_img, \\\n\ + __read_only image2d_array_t v_img, \\\n\ + __write_only image2d_array_t output, \\\n\ + global int * xRatio, \\\n\ + global int * yRatio, \\\n\ + global int * xOffset, \\\n\ + global int * yOffset, \\\n\ + float rMean, \\\n\ + float gMean, \\\n\ + float bMean, \\\n\ + float var, \\\n\ + int reverse_channel, \\\n\ + int trans \\\n\ + ) \\\n\ +{ \\\n\ + int4 gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + gidx += (int4)(0, 1, 2, 3); \\\n\ + \\\n\ + int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14); \\\n\ + int4 sx = fx & 0xffff8000; \\\n\ + int fy, sy; \\\n\ + fx -= sx; \\\n\ + sx = sx >> 15; \\\n\ + fx = (fx +(1 << 4)) >> 5; \\\n\ + 
\\\n\ + fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14); \\\n\ + sy = fy & 0xffff8000; \\\n\ + fy -= sy; \\\n\ + sy = sy >> 15; \\\n\ + \\\n\ + sy = sy < 0 ? 0 : sy; \\\n\ + fy = fy < 0 ? 0 : fy; \\\n\ + \\\n\ + fy = (fy + (1<< 4)) >> 5; \\\n\ + sx += (*xOffset); \\\n\ + sy += (*yOffset); \\\n\ + int4 srcPos = (int4)(sx.x, sy, get_global_id(2), 0); \\\n\ + int4 srcPos1 = (int4)(sx.x >> 1, sy >> 1, get_global_id(2), 0); \\\n\ + int4 srcPos2 = (int4)(sx.x >> 1, (sy + 1) >> 1, get_global_id(2), 0); \\\n\ + \\\n\ + vxc_uchar16 Y, U, V; \\\n\ + vxc_int4 C0, C1, C2, C3; \\\n\ + vxc_uchar16 R, G, B; \\\n\ + \\\n\ + VXC_ReadImage(Y, y_img, srcPos, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + srcPos1.x = (sx.x + 1) >> 1; \\\n\ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ + srcPos2.x = (sx.x + 1) >> 1; \\\n\ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + srcPos.x = sx.y; \\\n\ + srcPos1.x = sx.y >> 1; \\\n\ + srcPos2.x = sx.y >> 1; \\\n\ + VXC_ReadImage(Y, y_img, srcPos, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); \\\n\ + srcPos1.x = (sx.y + 1) >> 1; \\\n\ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); \\\n\ + srcPos2.x = (sx.y + 1) >> 1; \\\n\ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + srcPos.x = sx.z; \\\n\ + srcPos1.x = sx.z >> 1; \\\n\ + srcPos2.x = sx.z >> 1; \\\n\ + VXC_ReadImage(Y, y_img, srcPos, 0, VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0)); \\\n\ + srcPos1.x = (sx.z + 1) >> 1; \\\n\ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos2, 0, 
VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0)); \\\n\ + srcPos2.x = (sx.z + 1) >> 1; \\\n\ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + srcPos.x = sx.w; \\\n\ + srcPos1.x = sx.w >> 1; \\\n\ + srcPos2.x = sx.w >> 1; \\\n\ + VXC_ReadImage(Y, y_img, srcPos, 0, VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0)); \\\n\ + srcPos1.x = (sx.w + 1) >> 1; \\\n\ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0)); \\\n\ + srcPos2.x = (sx.w + 1) >> 1; \\\n\ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + int tmpV = -56992; \\\n\ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4); \\\n\ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4); \\\n\ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4); \\\n\ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4); \\\n\ + VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \\\n\ + VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \\\n\ + VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \\\n\ + VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \\\n\ + \\\n\ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4); \\\n\ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4); \\\n\ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4); \\\n\ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4); \\\n\ + \\\n\ + ushort tmpG = 34784; \\\n\ + vxc_ushort8 tmpDstG, tmpDstG1; \\\n\ + VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); \\\n\ + VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8); \\\n\ + VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); \\\n\ + VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); \\\n\ + VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); \\\n\ + VXC_DP4x4(G, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); \\\n\ + \\\n\ + VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4); \\\n\ + VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4); \\\n\ + VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4); \\\n\ + VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4); \\\n\ + tmpV = -70688; \\\n\ + VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ + VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ + VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ + VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ + \\\n\ + int4 result, temp1, temp2; \\\n\ + int4 tmpData0, tmpData1; \\\n\ + \\\n\ + VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \\\n\ + VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \\\n\ + temp1 = fx * tmpData0 + tmpData1; \\\n\ + \\\n\ + VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \\\n\ + VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \\\n\ + temp2 = fx * tmpData0 + tmpData1; \\\n\ + result = fy * temp2 + (temp1 << 10); \\\n\ + \\\n\ + tmpV = 1 << 19; \\\n\ + dst_type dst; \\\n\ + float4 tmpDst; \\\n\ + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \\\n\ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \\\n\ + tmpDst = (tmpDst - bMean) * var; \\\n\ + dstPos.z = bOrder; \\\n\ + result = convert_int4_rte(tmpDst * output_scale + output_zp); \\\n\ + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \\\n\ + VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \\\n\ + temp1 = fx * tmpData0 + tmpData1; \\\n\ + VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \\\n\ + VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \\\n\ + temp2 = fx * tmpData0 + tmpData1; \\\n\ + result = fy * temp2 + (temp1 << 10); \\\n\ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \\\n\ + tmpDst = (tmpDst - gMean) * var; \\\n\ + dstPos.z = 1; \\\n\ + result = convert_int4_rte(tmpDst * output_scale + output_zp); \\\n\ + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \\\n\ + VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \\\n\ + temp1 = fx * tmpData0 + tmpData1; \\\n\ + VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \\\n\ + VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \\\n\ + temp2 = fx * tmpData0 + tmpData1; \\\n\ + result = fy * temp2 + (temp1 << 10); \\\n\ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), 
uniDescaleU8_4x4); \\\n\ + tmpDst = (tmpDst - rMean) * var; \\\n\ + dstPos.z = rOrder; \\\n\ + result = convert_int4_rte(tmpDst * output_scale + output_zp); \\\n\ + VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +YUV420_SCALE_8BITS_SH_IMPL(U8toU8, vxc_uchar8)\n\ +YUV420_SCALE_8BITS_SH_IMPL(U8toI8, vxc_char8)\n\ +"; /* end of pre_process_yuv420_scale_0_vx*/ + +static const char pre_process_yuv420_scale_1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ - VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8);\n\ - VXC_DP2x8(dst2, R, paramData_f16, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8);\n\ +_viv_uniform VXC_512Bits uniCalculateR1st_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8;\n\ \n\ - pos = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ - pos.z = bOrder;\n\ - VXC_WriteImage2DArray(output, pos, dst0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - pos.z = 1;\n\ - VXC_WriteImage2DArray(output, pos, dst1, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - pos.z = rOrder;\n\ - VXC_WriteImage2DArray(output, pos, dst2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +_viv_uniform VXC_512Bits uniCalculateB1st_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uniDescaleU8_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateGWise_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4;\n\ +\n\ +_viv_uniform int bOrder;\n\ +_viv_uniform int rOrder;\n\ +_viv_uniform float output_scale;\n\ +_viv_uniform float output_zp;\n\ +\n\ +#define YUV420_SCALE_16BITS_SH_IMPL(name, dst_type, conv_type) \\\n\ +__kernel void pre_process_yuv420_scale_##name \\\n\ + ( \\\n\ + __read_only image2d_array_t y_img, \\\n\ + __read_only image2d_array_t u_img, \\\n\ + __read_only image2d_array_t v_img, \\\n\ + __write_only image2d_array_t output, \\\n\ + global int * xRatio, \\\n\ + global int * yRatio, \\\n\ + global int * xOffset, \\\n\ + global int * yOffset, \\\n\ + float rMean, \\\n\ + float gMean, \\\n\ + float bMean, \\\n\ + float var, \\\n\ + int reverse_channel, \\\n\ + int trans \\\n\ + ) \\\n\ +{ \\\n\ + int4 gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + gidx += (int4)(0, 1, 2, 3); \\\n\ + \\\n\ + int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14); \\\n\ + int4 sx = fx & 0xffff8000; \\\n\ + int fy, sy; \\\n\ + fx -= sx; \\\n\ + sx = sx >> 
15; \\\n\ + fx = (fx +(1 << 4)) >> 5; \\\n\ + \\\n\ + fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14); \\\n\ + sy = fy & 0xffff8000; \\\n\ + fy -= sy; \\\n\ + sy = sy >> 15; \\\n\ + \\\n\ + sy = sy < 0 ? 0 : sy; \\\n\ + fy = fy < 0 ? 0 : fy; \\\n\ + \\\n\ + fy = (fy + (1<< 4)) >> 5; \\\n\ + sx += (*xOffset); \\\n\ + sy += (*yOffset); \\\n\ + int4 srcPos = (int4)(sx.x, sy, get_global_id(2), 0); \\\n\ + int4 srcPos1 = (int4)(sx.x >> 1, sy >> 1, get_global_id(2), 0); \\\n\ + int4 srcPos2 = (int4)(sx.x >> 1, (sy + 1) >> 1, get_global_id(2), 0); \\\n\ + \\\n\ + vxc_uchar16 Y, U, V; \\\n\ + vxc_int4 C0, C1, C2, C3; \\\n\ + vxc_uchar16 R, G, B; \\\n\ + \\\n\ + VXC_ReadImage(Y, y_img, srcPos, 0, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + srcPos1.x = (sx.x + 1) >> 1; \\\n\ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ + srcPos2.x = (sx.x + 1) >> 1; \\\n\ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + srcPos.x = sx.y; \\\n\ + srcPos1.x = sx.y >> 1; \\\n\ + srcPos2.x = sx.y >> 1; \\\n\ + VXC_ReadImage(Y, y_img, srcPos, 0, VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); \\\n\ + srcPos1.x = (sx.y + 1) >> 1; \\\n\ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); \\\n\ + srcPos2.x = (sx.y + 1) >> 1; \\\n\ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + srcPos.x = sx.z; \\\n\ + srcPos1.x = sx.z >> 1; \\\n\ + srcPos2.x = sx.z >> 1; \\\n\ + VXC_ReadImage(Y, y_img, srcPos, 0, VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0)); \\\n\ + srcPos1.x = (sx.z + 1) >> 1; \\\n\ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0)); 
\\\n\ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0)); \\\n\ + srcPos2.x = (sx.z + 1) >> 1; \\\n\ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + srcPos.x = sx.w; \\\n\ + srcPos1.x = sx.w >> 1; \\\n\ + srcPos2.x = sx.w >> 1; \\\n\ + VXC_ReadImage(Y, y_img, srcPos, 0, VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0)); \\\n\ + srcPos1.x = (sx.w + 1) >> 1; \\\n\ + VXC_ReadImage(U, u_img, srcPos1, 0, VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos1, 0, VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0)); \\\n\ + srcPos2.x = (sx.w + 1) >> 1; \\\n\ + VXC_ReadImage(U, u_img, srcPos2, 0, VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_ReadImage(V, v_img, srcPos2, 0, VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + int tmpV = -56992; \\\n\ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4); \\\n\ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4); \\\n\ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4); \\\n\ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4); \\\n\ + VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \\\n\ + VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \\\n\ + VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \\\n\ + VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4); \\\n\ + \\\n\ + VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4); \\\n\ + VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4); \\\n\ + VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4); \\\n\ + VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4); \\\n\ + \\\n\ + ushort tmpG = 34784; \\\n\ + vxc_ushort8 tmpDstG, tmpDstG1; \\\n\ + VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8); \\\n\ + VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8); \\\n\ + VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); \\\n\ + VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); \\\n\ + VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4); \\\n\ + VXC_DP4x4(G, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4); \\\n\ + \\\n\ + VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4); \\\n\ + 
VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4); \\\n\ + VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4); \\\n\ + VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4); \\\n\ + tmpV = -70688; \\\n\ + VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ + VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ + VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ + VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4); \\\n\ + \\\n\ + int4 result, temp1, temp2; \\\n\ + int4 tmpData0, tmpData1; \\\n\ + dst_type tmpResult; \\\n\ + conv_type tmpVal; \\\n\ + \\\n\ + VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \\\n\ + VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \\\n\ + temp1 = fx * tmpData0 + tmpData1; \\\n\ + \\\n\ + VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \\\n\ + VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \\\n\ + temp2 = fx * tmpData0 + tmpData1; \\\n\ + result = fy * temp2 + (temp1 << 10); \\\n\ + \\\n\ + tmpV = 1 << 19; \\\n\ + vxc_short8 dst; \\\n\ + float4 tmpDst; \\\n\ + int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0); \\\n\ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \\\n\ + tmpDst = (tmpDst - bMean) * var; \\\n\ + dstPos.z = bOrder; \\\n\ + tmpDst = tmpDst * output_scale + output_zp; \\\n\ + _viv_asm(CONV_RTE, tmpVal, tmpDst); \\\n\ + VXC_DP2x8(tmpResult, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, tmpResult, 8); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \\\n\ + VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \\\n\ + temp1 = fx * tmpData0 + tmpData1; \\\n\ + VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \\\n\ + VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \\\n\ + temp2 = fx * tmpData0 + tmpData1; \\\n\ + result = fy * temp2 + (temp1 << 10); \\\n\ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \\\n\ + tmpDst = (tmpDst - gMean) * var; \\\n\ + dstPos.z = 1; \\\n\ + tmpDst = tmpDst * output_scale + output_zp; \\\n\ + _viv_asm(CONV_RTE, tmpVal, tmpDst); \\\n\ + VXC_DP2x8(tmpResult, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, tmpResult, 8); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4); \\\n\ + VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4); \\\n\ + temp1 = fx * tmpData0 + tmpData1; \\\n\ + VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4); \\\n\ + VXC_DP4x4(tmpData1, R, R, 
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4); \\\n\ + temp2 = fx * tmpData0 + tmpData1; \\\n\ + result = fy * temp2 + (temp1 << 10); \\\n\ + VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4); \\\n\ + tmpDst = (tmpDst - rMean) * var; \\\n\ + dstPos.z = rOrder; \\\n\ + tmpDst = tmpDst * output_scale + output_zp; \\\n\ + _viv_asm(CONV_RTE, tmpVal, tmpDst); \\\n\ + VXC_DP2x8(tmpResult, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, tmpResult, 8); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +YUV420_SCALE_16BITS_SH_IMPL(U8toF16, vxc_half8, half4)\n\ +YUV420_SCALE_16BITS_SH_IMPL(U8toI16, vxc_short8, int4)\n\ +"; /* end of pre_process_yuv420_scale_1_vx*/ + +static const char pre_process_yuv422_copy_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int bOrder;\n\ +_viv_uniform int rOrder;\n\ +\n\ +_viv_uniform float outputScaleVar;\n\ +_viv_uniform float bMeanScaleVarZp;\n\ +_viv_uniform float gMeanScaleVarZp;\n\ +_viv_uniform float rMeanScaleVarZp;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertYUV422toB_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertYUV422toG_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertYUV422toR_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractUVtoCharSub128_2x8;\n\ +\n\ +#define YUV422_COPY_SH_IMPL(name, dst_type, conv_type, save_type, copy_bytes) \\\n\ +__kernel void pre_process_yuv422_copy_##name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + global int* xRatio, \\\n\ + global int* yRatio, \\\n\ + global int* xOffset, \\\n\ + global int* yOffset, \\\n\ + float rMean, \\\n\ + float gMean, \\\n\ + float bMean, \\\n\ + float var, \\\n\ + int reverse_channel, \\\n\ + int trans, \\\n\ + int yuv422_type \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + \\\n\ + int sy = gidy + (*yOffset); \\\n\ + int sx = gidx + (*xOffset * 2); \\\n\ + \\\n\ + vxc_uchar8 YUV; \\\n\ + vxc_short8 tmpYUV; \\\n\ + \\\n\ + VXC_ReadImage(YUV, input, (int2)(sx,sy), 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + if (yuv422_type == 1) \\\n\ + { \\\n\ + YUV.s01234567 = YUV.s10325476; \\\n\ + } \\\n\ +\\\n\ + short tmpVal = 128; \\\n\ + VXC_DP2x8(tmpYUV, YUV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractUVtoCharSub128_2x8); \\\n\ + \\\n\ + float4 tmpDstB, tmpDstG, tmpDstR; \\\n\ + VXC_DP4x4(tmpDstB, tmpYUV, tmpYUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toB_4x4); \\\n\ + VXC_DP4x4(tmpDstG, tmpYUV, tmpYUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toG_4x4); \\\n\ + VXC_DP4x4(tmpDstR, tmpYUV, tmpYUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toR_4x4); \\\n\ + \\\n\ + conv_type result; \\\n\ + dst_type dst0; \\\n\ + save_type dst; \\\n\ + int4 dstPos = (int4)(gidx, gidy, 0, 0); \\\n\ + tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, tmpDstB); \\\n\ + dstPos.z = bOrder; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, 
tmpDstG); \\\n\ + dstPos.z = 1; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, tmpDstR); \\\n\ + dstPos.z = rOrder; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +YUV422_COPY_SH_IMPL(U8toU8, vxc_uchar4, int4, vxc_uchar4, 4)\n\ +YUV422_COPY_SH_IMPL(U8toI8, vxc_char4, int4, vxc_char4, 4)\n\ +YUV422_COPY_SH_IMPL(U8toI16, vxc_short4, int4, vxc_short4, 8)\n\ +YUV422_COPY_SH_IMPL(U8toF16, vxc_half4, half4, vxc_short4, 8)\n\ +"; /* end of pre_process_yuv422_copy_vx*/ + +static const char pre_process_yuv422_scale_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int bOrder;\n\ +_viv_uniform int rOrder;\n\ +\n\ +_viv_uniform float outputScaleVar;\n\ +_viv_uniform float bMeanScaleVarZp;\n\ +_viv_uniform float gMeanScaleVarZp;\n\ +_viv_uniform float rMeanScaleVarZp;\n\ +\n\ +_viv_uniform uint xrIntFloat_16;\n\ +_viv_uniform uint yrIntFloat_16;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertYUV422toB_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertYUV422toG_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertYUV422toR_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractUVtoCharSub128_2x8;\n\ +\n\ +#define uyvy422 1\n\ +\n\ +#define YUV422_SH_IMPL(name, dst_type, conv_type, save_type, copy_bytes) \\\n\ +__kernel void pre_process_yuv422_scale_##name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + global int* xRatio, \\\n\ + global int* yRatio, \\\n\ + global int* xOffset, \\\n\ + global int* yOffset, \\\n\ + float rMean, \\\n\ + float gMean, \\\n\ + float bMean, \\\n\ + float var, \\\n\ + int reverse_channel, \\\n\ + int trans, \\\n\ + int yuv422_type \\\n\ + ) \\\n\ +{ \\\n\ + int4 gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + gidx += (int4)(0, 1, 2, 3); \\\n\ + \\\n\ + uint dy = (convert_uint(gidy) * yrIntFloat_16) >> 16; \\\n\ + uint4 dx = (convert_uint4(gidx) * xrIntFloat_16) >> 16; \\\n\ + int sy = convert_int(dy) + (*yOffset); \\\n\ + int4 sx = convert_int4(dx)+ (*xOffset * 2); \\\n\ + \\\n\ + vxc_uchar4 Y; \\\n\ + vxc_uchar8 UV; \\\n\ + vxc_char8 tmpUV; \\\n\ + short tmpVal = 128; \\\n\ + int y_offset = 0; \\\n\ + int u_offset = 1; \\\n\ + int v_offset = 3; \\\n\ +\\\n\ + if (yuv422_type == uyvy422) \\\n\ + { \\\n\ + y_offset = 1; \\\n\ + u_offset = 0; \\\n\ + v_offset = 2; \\\n\ + } \\\n\ +\\\n\ + int4 coord_Y = (int4)(sx.x * 2 + y_offset, sy, 0, 0); \\\n\ + int4 coord_U = (int4)((sx.x >> 1) * 4 + u_offset, sy, 0, 0); \\\n\ + int4 coord_V = (int4)((sx.x >> 1) * 4 + v_offset, sy, 0, 0); \\\n\ +\\\n\ + VXC_ReadImage2DArray(Y, input, coord_Y, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_Y.x = sx.y * 2 + y_offset; \\\n\ + VXC_ReadImage2DArray(Y, input, coord_Y, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_Y.x = sx.z * 2 + y_offset; \\\n\ + VXC_ReadImage2DArray(Y, input, coord_Y, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_Y.x = sx.w * 2 + y_offset; \\\n\ + VXC_ReadImage2DArray(Y, input, coord_Y, 0, 
VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + sx = (sx >> 1) * 4 + u_offset; \\\n\ + VXC_ReadImage2DArray(UV, input, coord_U, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_U.x = sx.y; \\\n\ + VXC_ReadImage2DArray(UV, input, coord_U, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_U.x = sx.z; \\\n\ + VXC_ReadImage2DArray(UV, input, coord_U, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_U.x = sx.w; \\\n\ + VXC_ReadImage2DArray(UV, input, coord_U, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +\\\n\ + sx = sx - u_offset + v_offset; \\\n\ + VXC_ReadImage2DArray(UV, input, coord_V, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_V.x = sx.y; \\\n\ + VXC_ReadImage2DArray(UV, input, coord_V, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_V.x = sx.z; \\\n\ + VXC_ReadImage2DArray(UV, input, coord_V, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_V.x = sx.w; \\\n\ + VXC_ReadImage2DArray(UV, input, coord_V, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP2x8(tmpUV, UV, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractUVtoCharSub128_2x8); \\\n\ + vxc_uchar4 dst_test; \\\n\ + VXC_DP2x8(dst_test, dx, dx, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \\\n\ +\\\n\ + float4 tmpDstB, tmpDstG, tmpDstR; \\\n\ + VXC_DP4x4(tmpDstB, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toB_4x4); \\\n\ + VXC_DP4x4(tmpDstG, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toG_4x4); \\\n\ + VXC_DP4x4(tmpDstR, Y, tmpUV, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertYUV422toR_4x4); \\\n\ + \\\n\ + conv_type result; \\\n\ + dst_type dst0; \\\n\ + save_type dst; \\\n\ + int4 dstPos = (int4)(gidx.x, gidy, 0, 0); \\\n\ + tmpDstB = tmpDstB * outputScaleVar + bMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, tmpDstB); \\\n\ + dstPos.z = bOrder; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + tmpDstG = tmpDstG * outputScaleVar + gMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, tmpDstG); \\\n\ + dstPos.z = 1; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + tmpDstR = tmpDstR * outputScaleVar + rMeanScaleVarZp; \\\n\ + _viv_asm(CONV_RTE, result, tmpDstR); \\\n\ + dstPos.z = rOrder; \\\n\ + VXC_DP2x8(dst0, result, result, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 0), uniExtract8Data_2x8); \\\n\ + _viv_asm(COPY, dst, dst0, copy_bytes); \\\n\ + VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ }\n\ \n\ -__kernel void pre_process_yuv420_copy_U8toF16(\n\ - __read_only image2d_t y_img,\n\ - __read_only image2d_t u_img,\n\ - __read_only image2d_t v_img,\n\ - __write_only image2d_array_t output,\n\ - global int * xRatio,\n\ - global int * yRatio,\n\ - global int * xOffset,\n\ - global int * yOffset,\n\ - float rMean,\n\ - float gMean,\n\ - float bMean,\n\ - float var,\n\ - int reverse_channel,\n\ - int trans\n\ - )\n\ -{\n\ - int4 pos = (int4)(get_global_id(0) + (*xOffset), get_global_id(1) + (*yOffset), 0, 0);\n\ - int2 pos1 = 
(int2)((get_global_id(0) + (*xOffset)) >> 1, (get_global_id(1) + (*yOffset)) >> 1);\n\ - vxc_uchar16 Y;\n\ - vxc_uchar8 U, V;\n\ - vxc_int4 C0, C1, C2, C3;\n\ - vxc_uchar16 R, G, B;\n\ - vxc_half8 dst0, dst1, dst2, dst3, dst4, dst5;\n\ - vxc_short8 out0, out1, out2, out3, out4, out5;\n\ -\n\ - VXC_ReadImage(Y, y_img, pos.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(U, u_img, pos1.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, pos1.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - //C = Y - 16;\n\ - //D = U - 128;\n\ - //E = V - 128;\n\ - // calculate R\n\ - // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8]\n\ - int tmpV = -56992;\n\ - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR1st_4x4);\n\ - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR2nd_4x4);\n\ - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR3rd_4x4);\n\ - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpR4th_4x4);\n\ -\n\ - VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ -\n\ - // calculate G\n\ - // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8]\n\ - // 298Y - 208V\n\ - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG1st_4x4);\n\ - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG2nd_4x4);\n\ - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG3rd_4x4);\n\ - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpG4th_4x4);\n\ - // 34784 - 100U\n\ - ushort tmpG = 34784;\n\ - vxc_ushort8 tmpDstG;\n\ - VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8);\n\ - VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateG1st_4x4);\n\ - VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateG2nd_4x4);\n\ - VXC_DP4x4(G, C2, tmpDstG, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateG3rd_4x4);\n\ - VXC_DP4x4(G, C3, tmpDstG, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateG4th_4x4);\n\ -\n\ - // calculate B\n\ - // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8]\n\ - VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB1st_4x4);\n\ - VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB2nd_4x4);\n\ - VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB3rd_4x4);\n\ - VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpB4th_4x4);\n\ - tmpV = -70688;\n\ - VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, 
VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ -\n\ - float4 paramData = (float4)(bMean * var, gMean * var,\\\n\ - rMean * var, var);\n\ - half4 paramData_f16;\n\ - _viv_asm(CONV, paramData_f16, paramData);\n\ -\n\ - VXC_DP2x8(dst0, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoB_2x8);\n\ - VXC_DP2x8(dst1, B, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiB_2x8);\n\ -\n\ - VXC_DP2x8(dst2, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoG_2x8);\n\ - VXC_DP2x8(dst3, G, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiG_2x8);\n\ -\n\ - VXC_DP2x8(dst4, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8LoR_2x8);\n\ - VXC_DP2x8(dst5, R, paramData_f16, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniQuantU8toU8HiR_2x8);\n\ -\n\ - _viv_asm(COPY, out0, dst0, 16);\n\ - _viv_asm(COPY, out1, dst1, 16);\n\ - _viv_asm(COPY, out2, dst2, 16);\n\ - _viv_asm(COPY, out3, dst3, 16);\n\ - _viv_asm(COPY, out4, dst4, 16);\n\ - _viv_asm(COPY, out5, dst5, 16);\n\ -\n\ - pos = (int4)(get_global_id(0), get_global_id(1), bOrder, get_global_id(0) + 8);\n\ - VXC_WriteImage2DArray(output, pos.xyzz, out0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage2DArray(output, pos.wyzz, out1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - pos.z = 1;\n\ - VXC_WriteImage2DArray(output, pos.xyzz, out2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage2DArray(output, pos.wyzz, out3, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - pos.z = rOrder;\n\ - VXC_WriteImage2DArray(output, pos.xyzz, out4, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage2DArray(output, pos.wyzz, out5, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -}\n\ -\n\ -"; /* end of pre_process_yuv420_copy_u8_vx*/ - -static const char pre_process_yuv420_scale_fp16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform VXC_512Bits uniCalculateR1st_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8;\n\ -\n\ -_viv_uniform VXC_512Bits uniCalculateB1st_4x4;\n\ -_viv_uniform VXC_512Bits uniDescaleU8_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateGWise_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits uniConvertHalftoFp16_2x8;\n\ -\n\ -_viv_uniform int bOrder;\n\ -_viv_uniform int rOrder;\n\ -\n\ -__kernel void pre_process_yuv420_scale_U8toF16(\n\ - __read_only image2d_array_t y_img, __read_only image2d_array_t u_img,\n\ - __read_only image2d_array_t 
v_img, __write_only image2d_array_t output,\n\ - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\ - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)\n\ -{\n\ - int4 gidx = get_global_id(0);\n\ - int gidy = get_global_id(1);\n\ - gidx += (int4)(0, 1, 2, 3);\n\ -\n\ - int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14);\n\ - int4 sx = fx & 0xffff8000; // Floor\n\ - int fy, sy;\n\ - fx -= sx;\n\ - sx = sx >> 15;\n\ - fx = (fx +(1 << 4)) >> 5;\n\ -\n\ - // for y\n\ - fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14);\n\ - sy = fy & 0xffff8000; // Floor\n\ - fy -= sy;\n\ - sy = sy >> 15;\n\ -\n\ - sy = sy < 0 ? 0 : sy;\n\ - fy = fy < 0 ? 0 : fy;\n\ -\n\ - fy = (fy + (1<< 4)) >> 5;\n\ - sx += (*xOffset);\n\ - sy += (*yOffset);\n\ - int4 srcPos = (int4)(sx.x, sy, get_global_id(2), 0);\n\ - int4 srcPos1 = (int4)(sx.x >> 1, sy >> 1, get_global_id(2), 0);\n\ - int4 srcPos2 = (int4)(sx.x >> 1, (sy + 1) >> 1, get_global_id(2), 0);\n\ -\n\ - vxc_uchar16 Y, U, V;\n\ - vxc_int4 C0, C1, C2, C3;\n\ - vxc_uchar16 R, G, B;\n\ -\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - srcPos1.x = (sx.x + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - srcPos2.x = (sx.x + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - srcPos.x = sx.y;\n\ - srcPos1.x = sx.y >> 1;\n\ - srcPos2.x = sx.y >> 1;\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ - srcPos1.x = (sx.y + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ - srcPos2.x = (sx.y + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - srcPos.x = sx.z;\n\ - 
srcPos1.x = sx.z >> 1;\n\ - srcPos2.x = sx.z >> 1;\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0));\n\ - srcPos1.x = (sx.z + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0));\n\ - srcPos2.x = (sx.z + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - srcPos.x = sx.w;\n\ - srcPos1.x = sx.w >> 1;\n\ - srcPos2.x = sx.w >> 1;\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0));\n\ - srcPos1.x = (sx.w + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0));\n\ - srcPos2.x = (sx.w + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - //C = Y - 16; D = U - 128; E = V - 128;\n\ - // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8]\n\ - int tmpV = -56992;\n\ - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4);\n\ - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4);\n\ - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4);\n\ - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4);\n\ - VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ -\n\ - // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8]\n\ - // 298Y - 208V\n\ - 
VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4);\n\ - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4);\n\ - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4);\n\ - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4);\n\ - // 34784 - 100U\n\ - ushort tmpG = 34784;\n\ - vxc_ushort8 tmpDstG, tmpDstG1;\n\ - VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8);\n\ - VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8);\n\ - VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4);\n\ - VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4);\n\ - VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4);\n\ - VXC_DP4x4(G, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4);\n\ -\n\ - // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8]\n\ - VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4);\n\ - VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4);\n\ - VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4);\n\ - VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4);\n\ - tmpV = -70688;\n\ - VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ -\n\ - int4 result, temp1, temp2;\n\ - int4 tmpData0, tmpData1;\n\ -\n\ - VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);\n\ - VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);\n\ - temp1 = fx * tmpData0 + tmpData1;\n\ - // temp2 - temp1\n\ - VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);\n\ - VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);\n\ - temp2 = fx * tmpData0 + tmpData1;\n\ - result = fy * temp2 + (temp1 << 10);\n\ -\n\ - vxc_half8 tmpVal;\n\ - half4 hDst;\n\ - tmpV = 1 << 19;\n\ - vxc_short8 dst;\n\ - float4 tmpDst;\n\ - int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0);\n\ - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ - tmpDst = (tmpDst - bMean) * var;\n\ - dstPos.z = bOrder;\n\ - _viv_asm(CONV, hDst, tmpDst);\n\ - VXC_DP2x8(tmpVal, hDst, hDst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ - _viv_asm(COPY, dst, tmpVal, 16);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);\n\ - VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);\n\ - temp1 = fx * tmpData0 + tmpData1;\n\ - VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniBilinearTmp3rd_4x4);\n\ - VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);\n\ - temp2 = fx * tmpData0 + tmpData1;\n\ - result = fy * temp2 + (temp1 << 10);\n\ - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ - tmpDst = (tmpDst - gMean) * var;\n\ - dstPos.z = 1;\n\ - _viv_asm(CONV, hDst, tmpDst);\n\ - VXC_DP2x8(tmpVal, hDst, hDst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ - _viv_asm(COPY, dst, tmpVal, 16);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);\n\ - VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);\n\ - temp1 = fx * tmpData0 + tmpData1;\n\ - VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);\n\ - VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);\n\ - temp2 = fx * tmpData0 + tmpData1;\n\ - result = fy * temp2 + (temp1 << 10);\n\ - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ - tmpDst = (tmpDst - rMean) * var;\n\ - dstPos.z = rOrder;\n\ - _viv_asm(CONV, hDst, tmpDst);\n\ - VXC_DP2x8(tmpVal, hDst, hDst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalftoFp16_2x8);\n\ - _viv_asm(COPY, dst, tmpVal, 16);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -}"; /* end of pre_process_yuv420_scale_fp16_vx*/ - -static const char pre_process_yuv420_scale_i16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform VXC_512Bits uniCalculateR1st_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8;\n\ -\n\ -_viv_uniform VXC_512Bits uniCalculateB1st_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -_viv_uniform VXC_512Bits uniDescaleU8_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateGWise_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4;\n\ -\n\ -_viv_uniform int bOrder;\n\ -_viv_uniform int rOrder;\n\ -_viv_uniform float outputScale;\n\ -\n\ -__kernel void pre_process_yuv420_scale_U8toI16(\n\ - __read_only image2d_array_t y_img, __read_only image2d_array_t u_img,\n\ - __read_only image2d_array_t v_img, __write_only image2d_array_t output,\n\ - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\ - float rMean, float gMean, float bMean, float var, int 
reverse_channel, int trans)\n\ -{\n\ - int4 gidx = get_global_id(0);\n\ - int gidy = get_global_id(1);\n\ - gidx += (int4)(0, 1, 2, 3);\n\ -\n\ - int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14);\n\ - int4 sx = fx & 0xffff8000; // Floor\n\ - int fy, sy;\n\ - fx -= sx;\n\ - sx = sx >> 15;\n\ - fx = (fx +(1 << 4)) >> 5;\n\ -\n\ - // for y\n\ - fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14);\n\ - sy = fy & 0xffff8000; // Floor\n\ - fy -= sy;\n\ - sy = sy >> 15;\n\ -\n\ - sy = sy < 0 ? 0 : sy;\n\ - fy = fy < 0 ? 0 : fy;\n\ -\n\ - fy = (fy + (1<< 4)) >> 5;\n\ - sx += (*xOffset);\n\ - sy += (*yOffset);\n\ - int4 srcPos = (int4)(sx.x, sy, get_global_id(2), 0);\n\ - int4 srcPos1 = (int4)(sx.x >> 1, sy >> 1, get_global_id(2), 0);\n\ - int4 srcPos2 = (int4)(sx.x >> 1, (sy + 1) >> 1, get_global_id(2), 0);\n\ -\n\ - vxc_uchar16 Y, U, V;\n\ - vxc_int4 C0, C1, C2, C3;\n\ - vxc_uchar16 R, G, B;\n\ -\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - srcPos1.x = (sx.x + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - srcPos2.x = (sx.x + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - srcPos.x = sx.y;\n\ - srcPos1.x = sx.y >> 1;\n\ - srcPos2.x = sx.y >> 1;\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ - srcPos1.x = (sx.y + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ - srcPos2.x = (sx.y + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - srcPos.x = sx.z;\n\ - srcPos1.x = sx.z >> 1;\n\ - srcPos2.x = sx.z >> 1;\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(Y, y_img, srcPos, 
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0));\n\ - srcPos1.x = (sx.z + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0));\n\ - srcPos2.x = (sx.z + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - srcPos.x = sx.w;\n\ - srcPos1.x = sx.w >> 1;\n\ - srcPos2.x = sx.w >> 1;\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0));\n\ - srcPos1.x = (sx.w + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0));\n\ - srcPos2.x = (sx.w + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - //C = Y - 16; D = U - 128; E = V - 128;\n\ - // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8]\n\ - int tmpV = -56992;\n\ - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4);\n\ - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4);\n\ - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4);\n\ - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4);\n\ - VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ -\n\ - // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8]\n\ - // 298Y - 208V\n\ - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4);\n\ - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4);\n\ - 
VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4);\n\ - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4);\n\ - // 34784 - 100U\n\ - ushort tmpG = 34784;\n\ - vxc_ushort8 tmpDstG, tmpDstG1;\n\ - VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8);\n\ - VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8);\n\ - VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4);\n\ - VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4);\n\ - VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4);\n\ - VXC_DP4x4(G, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4);\n\ -\n\ - // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8]\n\ - VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4);\n\ - VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4);\n\ - VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4);\n\ - VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4);\n\ - tmpV = -70688;\n\ - VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ -\n\ - int4 result, temp1, temp2;\n\ - int4 tmpData0, tmpData1;\n\ -\n\ - VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);\n\ - VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);\n\ - temp1 = fx * tmpData0 + tmpData1;\n\ - // temp2 - temp1\n\ - VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);\n\ - VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);\n\ - temp2 = fx * tmpData0 + tmpData1;\n\ - result = fy * temp2 + (temp1 << 10);\n\ -\n\ - tmpV = 1 << 19;\n\ - vxc_short8 dst;\n\ - float4 tmpDst;\n\ - int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0);\n\ - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ - tmpDst = (tmpDst - bMean) * var;\n\ - dstPos.z = bOrder;\n\ - result = convert_int4_rte(tmpDst * outputScale);\n\ - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);\n\ - VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);\n\ - temp1 = fx * tmpData0 + tmpData1;\n\ - VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);\n\ - VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);\n\ - temp2 = fx * tmpData0 + tmpData1;\n\ - result = fy * temp2 + (temp1 << 10);\n\ - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 
0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ - tmpDst = (tmpDst - gMean) * var;\n\ - dstPos.z = 1;\n\ - result = convert_int4_rte(tmpDst * outputScale);\n\ - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);\n\ - VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);\n\ - temp1 = fx * tmpData0 + tmpData1;\n\ - VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);\n\ - VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);\n\ - temp2 = fx * tmpData0 + tmpData1;\n\ - result = fy * temp2 + (temp1 << 10);\n\ - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ - tmpDst = (tmpDst - rMean) * var;\n\ - dstPos.z = rOrder;\n\ - result = convert_int4_rte(tmpDst * outputScale);\n\ - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -}"; /* end of pre_process_yuv420_scale_i16_vx*/ - -static const char pre_process_yuv420_scale_i8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform VXC_512Bits uniCalculateR1st_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8;\n\ -\n\ -_viv_uniform VXC_512Bits uniCalculateB1st_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -_viv_uniform VXC_512Bits uniDescaleU8_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateGWise_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4;\n\ -\n\ -_viv_uniform int bOrder;\n\ -_viv_uniform int rOrder;\n\ -_viv_uniform float outputScale;\n\ -\n\ -__kernel void pre_process_yuv420_scale_U8toI8(\n\ - __read_only image2d_array_t y_img, __read_only image2d_array_t u_img,\n\ - __read_only image2d_array_t v_img, __write_only image2d_array_t output,\n\ - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\ - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)\n\ -{\n\ - int4 gidx = get_global_id(0);\n\ - int gidy = get_global_id(1);\n\ - gidx += (int4)(0, 1, 2, 3);\n\ -\n\ - int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14);\n\ - int4 sx = fx & 0xffff8000; // Floor\n\ - int fy, sy;\n\ - fx -= sx;\n\ - sx = sx >> 
15;\n\ - fx = (fx +(1 << 4)) >> 5;\n\ -\n\ - // for y\n\ - fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14);\n\ - sy = fy & 0xffff8000; // Floor\n\ - fy -= sy;\n\ - sy = sy >> 15;\n\ -\n\ - sy = sy < 0 ? 0 : sy;\n\ - fy = fy < 0 ? 0 : fy;\n\ -\n\ - fy = (fy + (1<< 4)) >> 5;\n\ - sx += (*xOffset);\n\ - sy += (*yOffset);\n\ - int4 srcPos = (int4)(sx.x, sy, get_global_id(2), 0);\n\ - int4 srcPos1 = (int4)(sx.x >> 1, sy >> 1, get_global_id(2), 0);\n\ - int4 srcPos2 = (int4)(sx.x >> 1, (sy + 1) >> 1, get_global_id(2), 0);\n\ -\n\ - vxc_uchar16 Y, U, V;\n\ - vxc_int4 C0, C1, C2, C3;\n\ - vxc_uchar16 R, G, B;\n\ -\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - srcPos1.x = (sx.x + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - srcPos2.x = (sx.x + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - srcPos.x = sx.y;\n\ - srcPos1.x = sx.y >> 1;\n\ - srcPos2.x = sx.y >> 1;\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ - srcPos1.x = (sx.y + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ - srcPos2.x = (sx.y + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - srcPos.x = sx.z;\n\ - srcPos1.x = sx.z >> 1;\n\ - srcPos2.x = sx.z >> 1;\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0));\n\ - 
srcPos1.x = (sx.z + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0));\n\ - srcPos2.x = (sx.z + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - srcPos.x = sx.w;\n\ - srcPos1.x = sx.w >> 1;\n\ - srcPos2.x = sx.w >> 1;\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0));\n\ - srcPos1.x = (sx.w + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0));\n\ - srcPos2.x = (sx.w + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - //C = Y - 16; D = U - 128; E = V - 128;\n\ - // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8]\n\ - int tmpV = -56992;\n\ - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4);\n\ - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4);\n\ - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4);\n\ - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4);\n\ - VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ -\n\ - // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8]\n\ - // 298Y - 208V\n\ - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4);\n\ - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4);\n\ - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4);\n\ - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4);\n\ - // 34784 - 100U\n\ - ushort tmpG = 34784;\n\ - vxc_ushort8 tmpDstG, tmpDstG1;\n\ - 
VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8);\n\ - VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8);\n\ - VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4);\n\ - VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4);\n\ - VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4);\n\ - VXC_DP4x4(G, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4);\n\ -\n\ - // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8]\n\ - VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4);\n\ - VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4);\n\ - VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4);\n\ - VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4);\n\ - tmpV = -70688;\n\ - VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ -\n\ - int4 result, temp1, temp2;\n\ - int4 tmpData0, tmpData1;\n\ -\n\ - VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);\n\ - VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);\n\ - temp1 = fx * tmpData0 + tmpData1;\n\ - // temp2 - temp1\n\ - VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);\n\ - VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);\n\ - temp2 = fx * tmpData0 + tmpData1;\n\ - result = fy * temp2 + (temp1 << 10);\n\ -\n\ - tmpV = 1 << 19;\n\ - vxc_char8 dst;\n\ - float4 tmpDst;\n\ - int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0);\n\ - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ - tmpDst = (tmpDst - bMean) * var;\n\ - dstPos.z = bOrder;\n\ - result = convert_int4_rte(tmpDst * outputScale);\n\ - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);\n\ - VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);\n\ - temp1 = fx * tmpData0 + tmpData1;\n\ - VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);\n\ - VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);\n\ - temp2 = fx * tmpData0 + tmpData1;\n\ - result = fy * temp2 + (temp1 << 10);\n\ - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ - tmpDst = (tmpDst - gMean) * var;\n\ - dstPos.z = 1;\n\ - result = convert_int4_rte(tmpDst * outputScale);\n\ - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ - 
VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);\n\ - VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);\n\ - temp1 = fx * tmpData0 + tmpData1;\n\ - VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);\n\ - VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);\n\ - temp2 = fx * tmpData0 + tmpData1;\n\ - result = fy * temp2 + (temp1 << 10);\n\ - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ - tmpDst = (tmpDst - rMean) * var;\n\ - dstPos.z = rOrder;\n\ - result = convert_int4_rte(tmpDst * outputScale);\n\ - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -}"; /* end of pre_process_yuv420_scale_i8_vx*/ - -static const char pre_process_yuv420_scale_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ -\n\ -_viv_uniform VXC_512Bits uniCalculateR1st_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGbyU_2x8;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGbyU2nd_2x8;\n\ -\n\ -_viv_uniform VXC_512Bits uniCalculateB1st_4x4;\n\ -_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ -_viv_uniform VXC_512Bits uniDescaleU8_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits uniCalculateTmpRWise_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpRWise2nd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpRWise3rd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpRWise4th_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGWise_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGWise2nd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGWise3rd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpGWise4th_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateGWise_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateGWise2nd_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits uniCalculateTmpBWise_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpBWise2nd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpBWise3rd_4x4;\n\ -_viv_uniform VXC_512Bits uniCalculateTmpBWise4th_4x4;\n\ -\n\ -_viv_uniform VXC_512Bits uniBilinearTmp1st_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinearTmp2nd_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinearTmp3rd_4x4;\n\ -_viv_uniform VXC_512Bits uniBilinearTmp4th_4x4;\n\ -\n\ -_viv_uniform int bOrder;\n\ -_viv_uniform int rOrder;\n\ -_viv_uniform int zp;\n\ -_viv_uniform float outputScale;\n\ -\n\ -__kernel void pre_process_yuv420_scale_U8toU8(\n\ - __read_only image2d_array_t y_img, __read_only image2d_array_t u_img,\n\ - __read_only image2d_array_t v_img, __write_only image2d_array_t output,\n\ - global int *xRatio, global int * yRatio, global int * xOffset, global int * yOffset,\n\ - float rMean, float gMean, float bMean, float var, int reverse_channel, int trans)\n\ -{\n\ - int4 gidx = get_global_id(0);\n\ - int gidy = get_global_id(1);\n\ - gidx += (int4)(0, 1, 2, 3);\n\ -\n\ - int4 fx = (gidx * (*xRatio) + ((*xRatio) >> 1)) - (1 << 14);\n\ - int4 sx = fx & 0xffff8000; // Floor\n\ - int fy, sy;\n\ - fx -= sx;\n\ - sx = sx >> 15;\n\ - fx = (fx +(1 << 4)) >> 5;\n\ -\n\ - // for y\n\ - fy = (gidy * (*yRatio) + ((*yRatio) >> 1)) - (1<< 14);\n\ - sy = fy & 0xffff8000; // Floor\n\ - fy -= sy;\n\ - sy = sy >> 15;\n\ -\n\ - sy = sy < 0 ? 0 : sy;\n\ - fy = fy < 0 ? 
0 : fy;\n\ -\n\ - fy = (fy + (1<< 4)) >> 5;\n\ - sx += (*xOffset);\n\ - sy += (*yOffset);\n\ - int4 srcPos = (int4)(sx.x, sy, get_global_id(2), 0);\n\ - int4 srcPos1 = (int4)(sx.x >> 1, sy >> 1, get_global_id(2), 0);\n\ - int4 srcPos2 = (int4)(sx.x >> 1, (sy + 1) >> 1, get_global_id(2), 0);\n\ -\n\ - vxc_uchar16 Y, U, V;\n\ - vxc_int4 C0, C1, C2, C3;\n\ - vxc_uchar16 R, G, B;\n\ -\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - srcPos1.x = (sx.x + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - srcPos2.x = (sx.x + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - srcPos.x = sx.y;\n\ - srcPos1.x = sx.y >> 1;\n\ - srcPos2.x = sx.y >> 1;\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ - srcPos1.x = (sx.y + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ - srcPos2.x = (sx.y + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - srcPos.x = sx.z;\n\ - srcPos1.x = sx.z >> 1;\n\ - srcPos2.x = sx.z >> 1;\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 9, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(10, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(8, 8, 0, VXC_RM_TowardZero, 0));\n\ - srcPos1.x = (sx.z + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(9, 9, 0, VXC_RM_TowardZero, 
0));\n\ -\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(10, 10, 0, VXC_RM_TowardZero, 0));\n\ - srcPos2.x = (sx.z + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(11, 11, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - srcPos.x = sx.w;\n\ - srcPos1.x = sx.w >> 1;\n\ - srcPos2.x = sx.w >> 1;\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 13, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(Y, y_img, srcPos, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(14, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(12, 12, 0, VXC_RM_TowardZero, 0));\n\ - srcPos1.x = (sx.w + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos1, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(13, 13, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(14, 14, 0, VXC_RM_TowardZero, 0));\n\ - srcPos2.x = (sx.w + 1) >> 1;\n\ - VXC_ReadImage(U, u_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(V, v_img, srcPos2, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(15, 15, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - //C = Y - 16; D = U - 128; E = V - 128;\n\ - // ((298 * C + 409 * E + 128) >> 8) --> [(298Y + 409V - 56992) >> 8]\n\ - int tmpV = -56992;\n\ - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise_4x4);\n\ - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise2nd_4x4);\n\ - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise3rd_4x4);\n\ - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpRWise4th_4x4);\n\ - VXC_DP4x4(R, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(R, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(R, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ - VXC_DP4x4(R, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateR1st_4x4);\n\ -\n\ - // ((298 * C - 100* D - 208 * E + 128) >> 8) --> [(298Y - 100U - 208V + 34784) >> 8]\n\ - // 298Y - 208V\n\ - VXC_DP4x4(C0, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise_4x4);\n\ - VXC_DP4x4(C1, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise2nd_4x4);\n\ - VXC_DP4x4(C2, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise3rd_4x4);\n\ - VXC_DP4x4(C3, Y, V, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGWise4th_4x4);\n\ - // 34784 - 100U\n\ - ushort tmpG = 34784;\n\ - vxc_ushort8 tmpDstG, tmpDstG1;\n\ - VXC_DP2x8(tmpDstG, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU_2x8);\n\ - VXC_DP2x8(tmpDstG1, U, tmpG, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniCalculateTmpGbyU2nd_2x8);\n\ - VXC_DP4x4(G, C0, tmpDstG, VXC_MODIFIER(0, 3, 
0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4);\n\ - VXC_DP4x4(G, C1, tmpDstG, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4);\n\ - VXC_DP4x4(G, C2, tmpDstG1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise_4x4);\n\ - VXC_DP4x4(G, C3, tmpDstG1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateGWise2nd_4x4);\n\ -\n\ - // ((298 * C + 516 * D + 128) >> 8) ==> [(298Y + 516U - 70688) >> 8]\n\ - VXC_DP4x4(C0, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise_4x4);\n\ - VXC_DP4x4(C1, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise2nd_4x4);\n\ - VXC_DP4x4(C2, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise3rd_4x4);\n\ - VXC_DP4x4(C3, Y, U, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniCalculateTmpBWise4th_4x4);\n\ - tmpV = -70688;\n\ - VXC_DP4x4(B, C0, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(B, C1, tmpV, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(B, C2, tmpV, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ - VXC_DP4x4(B, C3, tmpV, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniCalculateB1st_4x4);\n\ -\n\ - int4 result, temp1, temp2;\n\ - int4 tmpData0, tmpData1;\n\ -\n\ - VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);\n\ - VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);\n\ - temp1 = fx * tmpData0 + tmpData1;\n\ - // temp2 - temp1\n\ - VXC_DP4x4(tmpData0, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);\n\ - VXC_DP4x4(tmpData1, B, B, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);\n\ - temp2 = fx * tmpData0 + tmpData1;\n\ - result = fy * temp2 + (temp1 << 10);\n\ -\n\ - tmpV = 1 << 19;\n\ - vxc_uchar8 dst;\n\ - float4 tmpDst;\n\ - int4 dstPos = (int4)(get_global_id(0), gidy, 0, 0);\n\ - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ - tmpDst = (tmpDst - bMean) * var;\n\ - dstPos.z = bOrder;\n\ - result = convert_int4_rte(tmpDst * outputScale + zp);\n\ - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);\n\ - VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);\n\ - temp1 = fx * tmpData0 + tmpData1;\n\ - VXC_DP4x4(tmpData0, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);\n\ - VXC_DP4x4(tmpData1, G, G, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);\n\ - temp2 = fx * tmpData0 + tmpData1;\n\ - result = fy * temp2 + (temp1 << 10);\n\ - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ - tmpDst = (tmpDst - gMean) * var;\n\ - dstPos.z = 1;\n\ - result = convert_int4_rte(tmpDst * outputScale + zp);\n\ - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp1st_4x4);\n\ - VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0), uniBilinearTmp2nd_4x4);\n\ - temp1 = fx * tmpData0 + tmpData1;\n\ - VXC_DP4x4(tmpData0, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp3rd_4x4);\n\ - VXC_DP4x4(tmpData1, R, R, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniBilinearTmp4th_4x4);\n\ - temp2 = fx * tmpData0 + tmpData1;\n\ - result = fy * temp2 + (temp1 << 10);\n\ - VXC_DP4x4(tmpDst, result, tmpV, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniDescaleU8_4x4);\n\ - tmpDst = (tmpDst - rMean) * var;\n\ - dstPos.z = rOrder;\n\ - result = convert_int4_rte(tmpDst * outputScale + zp);\n\ - VXC_DP2x8(dst, result, result, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, dstPos, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ -}"; /* end of pre_process_yuv420_scale_u8_vx*/ +YUV422_SH_IMPL(U8toU8, vxc_uchar4, int4, vxc_uchar4, 4)\n\ +YUV422_SH_IMPL(U8toI8, vxc_char4, int4, vxc_char4, 4)\n\ +YUV422_SH_IMPL(U8toI16, vxc_short4, int4, vxc_short4, 8)\n\ +YUV422_SH_IMPL(U8toF16, vxc_half4, half4, vxc_short4, 8)\n\ +"; /* end of pre_process_yuv422_scale_vx*/ static const char pre_process_yuv444_copy_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -40004,15 +39784,15 @@ _viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp\n\ vxc_ushort8 mp0, mp1; \\\n\ _viv_asm(COPY, mp0, multAndoutZP0, 16); \\\n\ _viv_asm(COPY, mp1, multAndoutZP1, 16); \\\n\ - read_fun(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + read_fun(src0, input0, coord, 0, \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - read_fun(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + read_fun(src1, input1, coord, 0, \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_DP2x8(src0, src0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ uniU8MulAndPostShift0_Lo_2x8); \\\n\ VXC_DP2x8(src1, src1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ uniU8MulAndPostShift1_Lo_2x8); \\\n\ - read_fun(value_tmp, condition, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + read_fun(value_tmp, condition, coord, 0, \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_DP2x8(value, value_tmp, value_tmp,\\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvConditiontoDst_2x8); \\\n\ @@ -40052,11 +39832,11 @@ SELECT_INT_FUN_2D(I8, I16, I16, vxc_short8)\n\ #define SELECT_HALF(read_fun, write_fun) \\\n\ vxc_short8 src0, src1, dst, value; \\\n\ vxc_char8 value_tmp; \\\n\ - read_fun(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + read_fun(src0, input0, coord, 0, \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - read_fun(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + read_fun(src1, input1, coord, 0, \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - read_fun(value_tmp, condition, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + read_fun(value_tmp, condition, coord, 0, \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_DP2x8(value, value_tmp, value_tmp,\\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvConditiontoDst_2x8); \\\n\ @@ -40083,37 +39863,36 @@ __kernel void select_I8_F16_F16toF16_2D(\n\ SELECT_HALF(VXC_ReadImage, VXC_WriteImage)\n\ }\n\ \n\ -#define SELECT_HYBRIDTOF16(src0_type, copy0_type, src1_type, copy1_type, read_fun, write_fun) \\\n\ - vxc_short8 src0, src1, dst, value; \\\n\ - vxc_half8 value0, value1; \\\n\ - src0_type r0; \\\n\ - src1_type r1; \\\n\ +#define SELECT_HYBRID(src0_type, copy0_type, src1_type, copy1_type, dst_type, save_type, read_fun, write_fun) \\\n\ + save_type dst, 
value; \\\n\ + save_type dst0, dst1; \\\n\ + dst_type value0, value1; \\\n\ + src0_type src0; \\\n\ + src1_type src1; \\\n\ copy0_type v0; \\\n\ copy1_type v1; \\\n\ vxc_char8 value_tmp; \\\n\ vxc_ushort8 mp0, mp1; \\\n\ _viv_asm(COPY, mp0, multAndoutZP0, 16); \\\n\ _viv_asm(COPY, mp1, multAndoutZP1, 16); \\\n\ - read_fun(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + read_fun(src0, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ _viv_asm(COPY, v0, src0, 16); \\\n\ - read_fun(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + read_fun(src1, input1, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ _viv_asm(COPY, v1, src1, 16); \\\n\ VXC_DP2x8(value0, v0, mp0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ uniU8MulAndPostShift0_Lo_2x8); \\\n\ - _viv_asm(COPY, src0, value0, 16); \\\n\ + _viv_asm(COPY, dst0, value0, 16); \\\n\ VXC_DP2x8(value1, v1, mp1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ uniU8MulAndPostShift1_Lo_2x8); \\\n\ - _viv_asm(COPY, src1, value1, 16); \\\n\ - read_fun(value_tmp, condition, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + _viv_asm(COPY, dst1, value1, 16); \\\n\ + read_fun(value_tmp, condition, coord, 0, \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_DP2x8(value, value_tmp, value_tmp,\\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvConditiontoDst_2x8); \\\n\ - dst = (value != 0 ? src0 : src1); \\\n\ + dst = (value != 0 ? dst0 : dst1); \\\n\ write_fun(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ \n\ -#define SELECT_HYBRID_TOF16_FUN(name, src0_type, copy0_type, src1_type, copy1_type) \\\n\ +#define SELECT_HYBRID_FUN(name, src0_type, copy0_type, src1_type, copy1_type, dst_type, save_type) \\\n\ __kernel void select_##name( \\\n\ __read_only image2d_array_t condition, \\\n\ __read_only image2d_array_t input0, \\\n\ @@ -40121,44 +39900,62 @@ __kernel void select_##name( \\\n\ __write_only image2d_array_t output) \\\n\ { \\\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ - SELECT_HYBRIDTOF16(src0_type, copy0_type, src1_type, copy1_type, \\\n\ + SELECT_HYBRID(src0_type, copy0_type, src1_type, copy1_type, dst_type, save_type,\\\n\ VXC_ReadImage2DArray, VXC_WriteImage2DArray) \\\n\ }\n\ -SELECT_HYBRID_TOF16_FUN(I8_F16_U8toF16, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16)\n\ -SELECT_HYBRID_TOF16_FUN(I8_U8_F16toF16, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8)\n\ -SELECT_HYBRID_TOF16_FUN(I8_F16_I8toF16, vxc_short8, vxc_half8, vxc_char16, vxc_char16)\n\ -SELECT_HYBRID_TOF16_FUN(I8_I8_F16toF16, vxc_char16, vxc_char16, vxc_short8, vxc_half8)\n\ -SELECT_HYBRID_TOF16_FUN(I8_F16_I16toF16, vxc_short8, vxc_half8, vxc_short8, vxc_short8)\n\ -SELECT_HYBRID_TOF16_FUN(I8_I16_F16toF16, vxc_short8, vxc_short8, vxc_short8, vxc_half8)\n\ +SELECT_HYBRID_FUN(I8_F16_U8toF16, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16, vxc_half8, vxc_short8)\n\ +SELECT_HYBRID_FUN(I8_U8_F16toF16, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8, vxc_half8, vxc_short8)\n\ +SELECT_HYBRID_FUN(I8_F16_I8toF16, vxc_short8, vxc_half8, vxc_char16, vxc_char16, vxc_half8, vxc_short8)\n\ +SELECT_HYBRID_FUN(I8_I8_F16toF16, vxc_char16, vxc_char16, vxc_short8, vxc_half8, vxc_half8, vxc_short8)\n\ +SELECT_HYBRID_FUN(I8_F16_I16toF16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, vxc_half8, vxc_short8)\n\ +SELECT_HYBRID_FUN(I8_I16_F16toF16, vxc_short8, vxc_short8, vxc_short8, 
vxc_half8, vxc_half8, vxc_short8)\n\ +SELECT_HYBRID_FUN(I8_F16_U8toU8, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16, vxc_uchar8, vxc_uchar8)\n\ +SELECT_HYBRID_FUN(I8_U8_F16toU8, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8, vxc_uchar8, vxc_uchar8)\n\ +SELECT_HYBRID_FUN(I8_F16_I8toI8, vxc_short8, vxc_half8, vxc_char16, vxc_char16, vxc_char8, vxc_char8)\n\ +SELECT_HYBRID_FUN(I8_I8_F16toI8, vxc_char16, vxc_char16, vxc_short8, vxc_half8, vxc_char8, vxc_char8)\n\ +SELECT_HYBRID_FUN(I8_F16_I16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, vxc_short8, vxc_short8)\n\ +SELECT_HYBRID_FUN(I8_I16_F16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, vxc_short8, vxc_short8)\n\ +SELECT_HYBRID_FUN(I8_U8_U8toF16, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_half8, vxc_short8)\n\ +SELECT_HYBRID_FUN(I8_I8_I8toF16, vxc_char8, vxc_char8, vxc_char8, vxc_char8, vxc_half8, vxc_short8)\n\ +SELECT_HYBRID_FUN(I8_I16_I16toF16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, vxc_half8, vxc_short8)\n\ \n\ -#define SELECT_HYBRID_TOF16_FUN_2D(name, src0_type, copy0_type, src1_type, copy1_type) \\\n\ -__kernel void select_##name( \\\n\ +#define SELECT_HYBRID_FUN_2D(name, src0_type, copy0_type, src1_type, copy1_type, dst_type, save_type) \\\n\ +__kernel void select_##name##_2D( \\\n\ __read_only image2d_array_t condition, \\\n\ __read_only image2d_array_t input0, \\\n\ __read_only image2d_array_t input1, \\\n\ __write_only image2d_array_t output) \\\n\ { \\\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ - SELECT_HYBRIDTOF16(src0_type, copy0_type, src1_type, copy1_type, \\\n\ + SELECT_HYBRID(src0_type, copy0_type, src1_type, copy1_type, dst_type, save_type, \\\n\ VXC_ReadImage, VXC_WriteImage) \\\n\ }\n\ -SELECT_HYBRID_TOF16_FUN_2D(I8_F16_U8toF16_2D, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16)\n\ -SELECT_HYBRID_TOF16_FUN_2D(I8_U8_F16toF16_2D, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8)\n\ -SELECT_HYBRID_TOF16_FUN_2D(I8_F16_I8toF16_2D, vxc_short8, vxc_half8, vxc_char16, vxc_char16)\n\ -SELECT_HYBRID_TOF16_FUN_2D(I8_I8_F16toF16_2D, vxc_char16, vxc_char16, vxc_short8, vxc_half8)\n\ -SELECT_HYBRID_TOF16_FUN_2D(I8_F16_I16toF16_2D, vxc_short8, vxc_half8, vxc_short8, vxc_short8)\n\ -SELECT_HYBRID_TOF16_FUN_2D(I8_I16_F16toF16_2D, vxc_short8, vxc_short8, vxc_short8, vxc_half8)\n\ +SELECT_HYBRID_FUN_2D(I8_F16_U8toF16, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16, vxc_half8, vxc_short8)\n\ +SELECT_HYBRID_FUN_2D(I8_U8_F16toF16, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8, vxc_half8, vxc_short8)\n\ +SELECT_HYBRID_FUN_2D(I8_F16_I8toF16, vxc_short8, vxc_half8, vxc_char16, vxc_char16, vxc_half8, vxc_short8)\n\ +SELECT_HYBRID_FUN_2D(I8_I8_F16toF16, vxc_char16, vxc_char16, vxc_short8, vxc_half8, vxc_half8, vxc_short8)\n\ +SELECT_HYBRID_FUN_2D(I8_F16_I16toF16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, vxc_half8, vxc_short8)\n\ +SELECT_HYBRID_FUN_2D(I8_I16_F16toF16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, vxc_half8, vxc_short8)\n\ +SELECT_HYBRID_FUN_2D(I8_F16_U8toU8, vxc_short8, vxc_half8, vxc_uchar16, vxc_uchar16, vxc_uchar8, vxc_uchar8)\n\ +SELECT_HYBRID_FUN_2D(I8_U8_F16toU8, vxc_uchar16, vxc_uchar16, vxc_short8, vxc_half8, vxc_uchar8, vxc_uchar8)\n\ +SELECT_HYBRID_FUN_2D(I8_F16_I8toI8, vxc_short8, vxc_half8, vxc_char16, vxc_char16, vxc_char8, vxc_char8)\n\ +SELECT_HYBRID_FUN_2D(I8_I8_F16toI8, vxc_char16, vxc_char16, vxc_short8, vxc_half8, vxc_char8, vxc_char8)\n\ +SELECT_HYBRID_FUN_2D(I8_F16_I16toI16, vxc_short8, vxc_half8, vxc_short8, vxc_short8, vxc_short8, 
vxc_short8)\n\ +SELECT_HYBRID_FUN_2D(I8_I16_F16toI16, vxc_short8, vxc_short8, vxc_short8, vxc_half8, vxc_short8, vxc_short8)\n\ +SELECT_HYBRID_FUN_2D(I8_U8_U8toF16, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_uchar8, vxc_half8, vxc_short8)\n\ +SELECT_HYBRID_FUN_2D(I8_I8_I8toF16, vxc_char8, vxc_char8, vxc_char8, vxc_char8, vxc_half8, vxc_short8)\n\ +SELECT_HYBRID_FUN_2D(I8_I16_I16toF16, vxc_short8, vxc_short8, vxc_short8, vxc_short8, vxc_half8, vxc_short8)\n\ \n\ #define SELECT_HALF_TO_QINT(read_fun, write_fun, dst_type) \\\n\ vxc_short8 src0, src1, tmp_dst, value; \\\n\ vxc_half8 data; \\\n\ dst_type dst; \\\n\ vxc_char8 value_tmp; \\\n\ - read_fun(src0, input0, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + read_fun(src0, input0, coord, 0, \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - read_fun(src1, input1, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + read_fun(src1, input1, coord, 0, \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - read_fun(value_tmp, condition, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ + read_fun(value_tmp, condition, coord, 0, \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_DP2x8(value, value_tmp, value_tmp,\\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvConditiontoDst_2x8); \\\n\ @@ -44100,6 +43897,289 @@ __kernel void batch_norm_I32to##TYPE##_2D \\\n\ BATCH_NORM_I32_SH_IMPL_2D(I32)\n\ BATCH_NORM_I32_SH_IMPL_2D(F32)"; /* end of batchnorm_single_cl*/ +static const char bucketize_cl[] = "#pragma OPENCL EXTENSION CL_VIV_asm : enable\n\ +\n\ +#define BUCKETIZE_F32_2D_SH_IMPL(name, comp_op) \\\n\ +__kernel void bucketize_##name \\\n\ + ( \\\n\ + __read_only image2d_t input, \\\n\ + __read_only image2d_t boundaries, \\\n\ + __write_only image2d_t output, \\\n\ + int boundaries_size, \\\n\ + float input0_scale, \\\n\ + float input0_tail, \\\n\ + float input1_scale, \\\n\ + float input1_tail \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + float4 src0 = read_imagef(input, coord); \\\n\ + \\\n\ + int2 pos = 0; \\\n\ + do \\\n\ + { \\\n\ + float4 src1 = read_imagef(boundaries, pos); \\\n\ + if ((src0.x) comp_op (src1.x)) \\\n\ + { \\\n\ + break; \\\n\ + } \\\n\ + pos.x ++; \\\n\ + } while(pos.x < boundaries_size); \\\n\ + \\\n\ + write_imagei(output, coord, pos.xxxx); \\\n\ +}\n\ +BUCKETIZE_F32_2D_SH_IMPL(F32_F32toI32_2D, <=)\n\ +BUCKETIZE_F32_2D_SH_IMPL(right_F32_F32toI32_2D, <)\n\ +\n\ +#define BUCKETIZE_F32_SH_IMPL(name, comp_op) \\\n\ +__kernel void bucketize_##name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t boundaries, \\\n\ + __write_only image2d_array_t output, \\\n\ + int boundaries_size, \\\n\ + float input0_scale, \\\n\ + float input0_tail, \\\n\ + float input1_scale, \\\n\ + float input1_tail \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + float4 src0 = read_imagef(input, coord); \\\n\ + \\\n\ + int2 pos = 0; \\\n\ + do \\\n\ + { \\\n\ + float4 src1 = read_imagef(boundaries, pos); \\\n\ + if ((src0.x) comp_op (src1.x)) \\\n\ + { \\\n\ + break; \\\n\ + } \\\n\ + pos.x ++; \\\n\ + } while(pos.x < boundaries_size); \\\n\ + \\\n\ + write_imagei(output, coord, pos.xxxx); \\\n\ +}\n\ +BUCKETIZE_F32_SH_IMPL(F32_F32toI32, <=)\n\ +BUCKETIZE_F32_SH_IMPL(right_F32_F32toI32, <)\n\ +\n\ +#define BUCKETIZE_I32_2D_SH_IMPL(name, comp_op) \\\n\ +__kernel void bucketize_##name \\\n\ + ( \\\n\ + __read_only image2d_t input, \\\n\ + __read_only image2d_t boundaries, \\\n\ + __write_only image2d_t 
output, \\\n\ + int boundaries_size, \\\n\ + float input0_scale, \\\n\ + float input0_tail, \\\n\ + float input1_scale, \\\n\ + float input1_tail \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + float4 src0 = convert_float4(read_imagei(input, coord)); \\\n\ + \\\n\ + int2 pos = 0; \\\n\ + src0 = src0 * input0_scale + input0_tail; \\\n\ + do \\\n\ + { \\\n\ + float4 src1 = convert_float4(read_imagei(boundaries, pos)); \\\n\ + src1 = src1 * input1_scale + input1_tail; \\\n\ + if ((src0.x) comp_op (src1.x)) \\\n\ + { \\\n\ + break; \\\n\ + } \\\n\ + pos.x ++; \\\n\ + } while(pos.x < boundaries_size); \\\n\ + \\\n\ + write_imagei(output, coord, pos.xxxx); \\\n\ +}\n\ +BUCKETIZE_I32_2D_SH_IMPL(I32_I32toI32_2D, <=)\n\ +BUCKETIZE_I32_2D_SH_IMPL(right_I32_I32toI32_2D, <)\n\ +\n\ +#define BUCKETIZE_I32_SH_IMPL(name, comp_op) \\\n\ +__kernel void bucketize_##name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t boundaries, \\\n\ + __write_only image2d_array_t output, \\\n\ + int boundaries_size, \\\n\ + float input0_scale, \\\n\ + float input0_tail, \\\n\ + float input1_scale, \\\n\ + float input1_tail \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + int4 data = read_imagei(input, coord); \\\n\ + float4 src0 = convert_float4(data) * input0_scale + input0_tail; \\\n\ + \\\n\ + int2 pos = 0; \\\n\ + do \\\n\ + { \\\n\ + float4 src1 = convert_float4(read_imagei(boundaries, pos)); \\\n\ + src1 = src1 * input1_scale + input1_tail; \\\n\ + if ((src0.x) comp_op (src1.x)) \\\n\ + { \\\n\ + break; \\\n\ + } \\\n\ + pos.x ++; \\\n\ + } while(pos.x < boundaries_size); \\\n\ + \\\n\ + write_imagei(output, coord, pos.xxxx); \\\n\ +}\n\ +BUCKETIZE_I32_SH_IMPL(I32_I32toI32, <=)\n\ +BUCKETIZE_I32_SH_IMPL(right_I32_I32toI32, <)\n\ +\n\ +#define BUCKETIZE_U32_2D_SH_IMPL(name, comp_op) \\\n\ +__kernel void bucketize_##name \\\n\ + ( \\\n\ + __read_only image2d_t input, \\\n\ + __read_only image2d_t boundaries, \\\n\ + __write_only image2d_t output, \\\n\ + int boundaries_size, \\\n\ + float input0_scale, \\\n\ + float input0_tail, \\\n\ + float input1_scale, \\\n\ + float input1_tail \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + float4 src0 = convert_float4(read_imageui(input, coord)); \\\n\ + \\\n\ + int2 pos = 0; \\\n\ + src0 = src0 * input0_scale + input0_tail; \\\n\ + do \\\n\ + { \\\n\ + float4 src1 = convert_float4(read_imageui(boundaries, pos)); \\\n\ + src1 = src1 * input1_scale + input1_tail; \\\n\ + if ((src0.x) comp_op (src1.x)) \\\n\ + { \\\n\ + break; \\\n\ + } \\\n\ + pos.x ++; \\\n\ + } while(pos.x < boundaries_size); \\\n\ + \\\n\ + write_imagei(output, coord, pos.xxxx); \\\n\ +}\n\ +BUCKETIZE_U32_2D_SH_IMPL(U32_U32toI32_2D, <=)\n\ +BUCKETIZE_U32_2D_SH_IMPL(right_U32_U32toI32_2D, <)\n\ +\n\ +#define BUCKETIZE_U32_SH_IMPL(name, comp_op) \\\n\ +__kernel void bucketize_##name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t boundaries, \\\n\ + __write_only image2d_array_t output, \\\n\ + int boundaries_size, \\\n\ + float input0_scale, \\\n\ + float input0_tail, \\\n\ + float input1_scale, \\\n\ + float input1_tail \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + uint4 data = read_imageui(input, coord); \\\n\ + float4 src0 = convert_float4(data) * input0_scale + input0_tail; \\\n\ + \\\n\ + 
int2 pos = 0; \\\n\ + do \\\n\ + { \\\n\ + float4 src1 = convert_float4(read_imageui(boundaries, pos)); \\\n\ + src1 = src1 * input1_scale + input1_tail; \\\n\ + if ((src0.x) comp_op (src1.x)) \\\n\ + { \\\n\ + break; \\\n\ + } \\\n\ + pos.x ++; \\\n\ + } while(pos.x < boundaries_size); \\\n\ + \\\n\ + write_imagei(output, coord, pos.xxxx); \\\n\ +}\n\ +BUCKETIZE_U32_SH_IMPL(U32_U32toI32, <=)\n\ +BUCKETIZE_U32_SH_IMPL(right_U32_U32toI32, <)\n\ +\n\ +#define BUCKETIZE_BF16_2D_SH_IMPL(name, comp_op) \\\n\ +__kernel void bucketize_##name \\\n\ + ( \\\n\ + __read_only image2d_t input, \\\n\ + __read_only image2d_t boundaries, \\\n\ + __write_only image2d_t output, \\\n\ + int boundaries_size, \\\n\ + float input0_scale, \\\n\ + float input0_tail, \\\n\ + float input1_scale, \\\n\ + float input1_tail \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + uint4 data0 = read_imageui(input, coord) << 16; \\\n\ + float4 src0; \\\n\ + _viv_asm(COPY, src0, data0, 16); \\\n\ + \\\n\ + int2 pos = 0; \\\n\ + do \\\n\ + { \\\n\ + uint4 data1 = read_imageui(boundaries, pos) << 16; \\\n\ + float4 src1; \\\n\ + _viv_asm(COPY, src1, data1, 16); \\\n\ + if ((src0.x) comp_op (src1.x)) \\\n\ + { \\\n\ + break; \\\n\ + } \\\n\ + pos.x ++; \\\n\ + } while(pos.x < boundaries_size); \\\n\ + \\\n\ + write_imagei(output, coord, pos.xxxx); \\\n\ +}\n\ +BUCKETIZE_BF16_2D_SH_IMPL(BF16_BF16toI32_2D, <=)\n\ +BUCKETIZE_BF16_2D_SH_IMPL(right_BF16_BF16toI32_2D, <)\n\ +\n\ +#define BUCKETIZE_BF16_SH_IMPL(name, comp_op) \\\n\ +__kernel void bucketize_##name \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_t boundaries, \\\n\ + __write_only image2d_array_t output, \\\n\ + int boundaries_size, \\\n\ + float input0_scale, \\\n\ + float input0_tail, \\\n\ + float input1_scale, \\\n\ + float input1_tail \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + uint4 data0 = read_imageui(input, coord) << 16; \\\n\ + float4 src0; \\\n\ + _viv_asm(COPY, src0, data0, 16); \\\n\ + \\\n\ + int2 pos = 0; \\\n\ + do \\\n\ + { \\\n\ + uint4 data1 = read_imageui(boundaries, pos) << 16; \\\n\ + float4 src1; \\\n\ + _viv_asm(COPY, src1, data1, 16); \\\n\ + if ((src0.x) comp_op (src1.x)) \\\n\ + { \\\n\ + break; \\\n\ + } \\\n\ + pos.x ++; \\\n\ + } while(pos.x < boundaries_size); \\\n\ + \\\n\ + write_imagei(output, coord, pos.xxxx); \\\n\ +}\n\ +BUCKETIZE_BF16_SH_IMPL(BF16_BF16toI32, <=)\n\ +BUCKETIZE_BF16_SH_IMPL(right_BF16_BF16toI32, <)\n\ +"; /* end of bucketize_cl*/ + static const char cast_cl[] = "\n\ #define CAST_FUN(src_name, dst_name, src_type, dst_type, conv_fun, read_fun, write_fun) \\\n\ __kernel void cast_##src_name##to##dst_name( \\\n\ @@ -50206,6 +50286,123 @@ TENSORLOGICAL_2D(and, &&, )\n\ TENSORLOGICAL_2D(xor, ^, !!)\n\ "; /* end of logical_ops_cl*/ +static const char lppool_cl[] = "\n\ +#define LPPOOL_PROCESS(src_type, dst_type, readimage_type, conv_mode, writeimage_type) \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + int hstart = gidy * stride_y - pad_top; \\\n\ + int wstart = gidx * stride_x - pad_left; \\\n\ + int hend = min(hstart + ksize_y, height); \\\n\ + int wend = min(wstart + ksize_x, width); \\\n\ + int4 coord_out = (int4)(gidx, gidy, get_global_id(2), 0); \\\n\ + int4 coord_in = coord_out; \\\n\ + int h, w; \\\n\ + float sum_of_pow = 0; \\\n\ + dst_type out_data = (dst_type)(0); \\\n\ + src_type in_data; \\\n\ + float in_f32, out_f32; \\\n\ + 
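
The bucketize_* kernels above perform a linear scan over a sorted 1-D boundaries tensor: the default variants stop at the first boundary b with x <= b, the "right_" variants at the first b with x < b, and the stored result is the index where the scan stopped (boundaries_size if no boundary matched). A plain C reference of that behaviour, kept as the same linear scan rather than a binary search, for illustration only:

#include <stdio.h>

/* Reference bucketize: returns the bucket index of x for a sorted
 * boundaries array. right = 0 mirrors the "<=" kernels, right = 1 the
 * "right_" kernels; mirrors the do/while scan in the CL source. */
static int bucketize_ref(float x, const float *boundaries, int n, int right)
{
    int pos;
    for (pos = 0; pos < n; pos++)
    {
        float b = boundaries[pos];
        if (( right && x <  b) ||
            (!right && x <= b))
        {
            break;
        }
    }
    return pos;  /* equals n when x is past every boundary */
}

int main(void)
{
    const float bounds[3] = { 1.0f, 3.0f, 5.0f };
    /* 3.0 lands in bucket 1 for the default variant, bucket 2 for "right". */
    printf("%d %d\n",
           bucketize_ref(3.0f, bounds, 3, 0),
           bucketize_ref(3.0f, bounds, 3, 1));
    return 0;
}
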
hstart = max(hstart, 0); \\\n\ + wstart = max(wstart, 0); \\\n\ + for (h = hstart; h < hend; h++) \\\n\ + { \\\n\ + for (w = wstart; w < wend; w++) \\\n\ + { \\\n\ + coord_in.xy = (int2)(w, h); \\\n\ + in_data = readimage_type(input, coord_in).x; \\\n\ + in_f32 = convert_float(in_data) * inputScale + inputTail; \\\n\ + sum_of_pow += pow(fabs(in_f32),p); \\\n\ + } \\\n\ + } \\\n\ + out_f32 = pow(sum_of_pow, 1.0f / p) * outputScale + outputTail; \\\n\ + out_data.x = conv_mode(out_f32); \\\n\ + writeimage_type(output, coord_out, out_data); \\\n\ +\n\ +#define TENSOR_LPPOOL(src_name, dst_name, src_type, dst_type, readimage_type, conv_mode, writeimage_type) \\\n\ +__kernel void lppool_##src_name##to##dst_name ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int ksize_x, \\\n\ + int ksize_y, \\\n\ + int stride_x, \\\n\ + int stride_y, \\\n\ + int pad_left, \\\n\ + int pad_top, \\\n\ + int p, \\\n\ + int width, \\\n\ + int height, \\\n\ + float inputScale, \\\n\ + float inputTail, \\\n\ + float outputScale, \\\n\ + float outputTail) \\\n\ +{ \\\n\ + LPPOOL_PROCESS(src_type, dst_type, readimage_type, conv_mode, writeimage_type); \\\n\ +}\n\ +\n\ +TENSOR_LPPOOL(F32, F32, float, float4, read_imagef, convert_float, write_imagef)\n\ +TENSOR_LPPOOL(F32, U32, float, uint4, read_imagef, convert_uint, write_imageui)\n\ +TENSOR_LPPOOL(F32, I32, float, int4, read_imagef, convert_int, write_imagei)\n\ +\n\ +TENSOR_LPPOOL(U32, U32, uint, uint4, read_imageui, convert_uint, write_imageui)\n\ +TENSOR_LPPOOL(U32, F32, uint, float4, read_imageui, convert_float, write_imagef)\n\ +TENSOR_LPPOOL(U32, I32, uint, int4, read_imageui, convert_int, write_imagei)\n\ +\n\ +TENSOR_LPPOOL(I32, I32, int, int4, read_imagei, convert_int, write_imagei)\n\ +TENSOR_LPPOOL(I32, F32, int, float4, read_imagei, convert_float, write_imagef)\n\ +TENSOR_LPPOOL(I32, U32, int, uint4, read_imagei, convert_uint, write_imageui)\n\ +\n\ +__kernel void lppool_BF16toBF16(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int ksize_x,\n\ + int ksize_y,\n\ + int stride_x,\n\ + int stride_y,\n\ + int pad_left,\n\ + int pad_top,\n\ + int p,\n\ + int width,\n\ + int height,\n\ + float inputScale,\n\ + float inputTail,\n\ + float outputScale,\n\ + float outputTail)\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int hstart = gidy * stride_y - pad_top;\n\ + int wstart = gidx * stride_x - pad_left;\n\ + int hend = min(hstart + ksize_y, height);\n\ + int wend = min(wstart + ksize_x, width);\n\ + int4 coord_out = (int4)(gidx, gidy, get_global_id(2), 0);\n\ + int4 coord_in = coord_out;\n\ + int h, w;\n\ + float sum_of_pow = 0;\n\ + float out_data_f32 = 0;\n\ + uint4 dst = (uint4)(0);\n\ + float4 data_f32 = (float4)(0);\n\ + uint4 data;\n\ + hstart = max(hstart, 0);\n\ + wstart = max(wstart, 0);\n\ +\n\ + for (h = hstart; h < hend; h++)\n\ + {\n\ + for (w = wstart; w < wend; w++)\n\ + {\n\ + coord_in.xy = (int2)(w, h);\n\ + data = read_imageui(input, coord_in);\n\ + data = data << 16;\n\ + _viv_asm(COPY, data_f32, data, 16);\n\ + sum_of_pow += pow(abs(data_f32.x),p);\n\ + }\n\ + }\n\ + out_data_f32 = pow(sum_of_pow, 1.0f / p);\n\ + _viv_asm(COPY, dst, out_data_f32, 4);\n\ + dst.x = dst.x >> 16;\n\ + write_imageui(output, coord_out, dst);\n\ +}\n\ +\n\ +"; /* end of lppool_cl*/ + static const char lstmunit_activation_BP_F32_cl[] = "float4 sigmoid(float4 x, float logE)\n\ {\n\ x *= -logE;\n\ @@ -53543,7 +53740,7 @@ __kernel void 
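
lppool_cl above computes, for each output element, the Lp norm of the window contents: out = (sum |x|^p)^(1/p), with the window defined by ksize/stride/pad and clipped to the input extent; the BF16 variant additionally reinterprets each 16-bit bfloat value as the upper half of a 32-bit IEEE float (the "data << 16" plus COPY trick). A C sketch of both pieces on a row-major single-channel input, ignoring the quantization scale/tail terms for clarity; this is an illustrative reference, not the kernel itself:

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* bfloat16 -> float: place the 16 stored bits in the high half of a
 * 32-bit word, matching the shift-and-copy done by the BF16 kernels. */
static float bf16_to_f32(uint16_t v)
{
    uint32_t bits = (uint32_t)v << 16;
    float f;
    memcpy(&f, &bits, sizeof(f));
    return f;
}

/* Lp pooling of one output position (ox, oy) over a padded window.
 * in is width*height, row major; returns (sum |x|^p)^(1/p). */
static float lppool_ref(const float *in, int width, int height,
                        int ox, int oy, int ksize_x, int ksize_y,
                        int stride_x, int stride_y,
                        int pad_left, int pad_top, float p)
{
    int hstart = oy * stride_y - pad_top;
    int wstart = ox * stride_x - pad_left;
    int hend = hstart + ksize_y < height ? hstart + ksize_y : height;
    int wend = wstart + ksize_x < width  ? wstart + ksize_x : width;
    float sum_of_pow = 0.0f;

    hstart = hstart > 0 ? hstart : 0;
    wstart = wstart > 0 ? wstart : 0;
    for (int h = hstart; h < hend; h++)
        for (int w = wstart; w < wend; w++)
            sum_of_pow += powf(fabsf(in[h * width + w]), p);

    return powf(sum_of_pow, 1.0f / p);
}

int main(void)
{
    const float in[4] = { 3.0f, 4.0f, 0.0f, 0.0f };  /* 2x2 input */
    /* 2x2 window, p = 2 -> Euclidean norm of {3,4,0,0} = 5. */
    printf("%f  bf16(0x4049)=%f\n",
           lppool_ref(in, 2, 2, 0, 0, 2, 2, 1, 1, 0, 0, 2.0f),
           bf16_to_f32(0x4049));   /* approximately pi */
    return 0;
}
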
maximum_I32I32toI32\n\ float4 data0 = convert_float4(src0) * input0Scale - input0Tail;\n\ float4 data1 = convert_float4(src1) * input1Scale - input1Tail;\n\ float4 data = data0 > data1 ? data0 : data1;\n\ - int4 dst = convert_int4(data * outputScale + outputZP);\n\ + int4 dst = convert_int4_rte(data * outputScale + outputZP);\n\ \n\ write_imagei(output, coord, dst);\n\ }\n\ @@ -53569,7 +53766,7 @@ __kernel void maximum_I32I32toI32_2D\n\ float4 data0 = convert_float4(src0) * input0Scale - input0Tail;\n\ float4 data1 = convert_float4(src1) * input1Scale - input1Tail;\n\ float4 data = data0 > data1 ? data0 : data1;\n\ - int4 dst = convert_int4(data * outputScale + outputZP);\n\ + int4 dst = convert_int4_rte(data * outputScale + outputZP);\n\ \n\ write_imagei(output, coord, dst);\n\ }\n\ @@ -54086,7 +54283,7 @@ __kernel void minimum_I32I32toI32\n\ float4 data0 = convert_float4(src0) * input0Scale - input0Tail;\n\ float4 data1 = convert_float4(src1) * input1Scale - input1Tail;\n\ float4 data = data0 < data1 ? data0 : data1;\n\ - int4 dst = convert_int4(data * outputScale + outputZP);\n\ + int4 dst = convert_int4_rte(data * outputScale + outputZP);\n\ \n\ write_imagei(output, coord, dst);\n\ }\n\ @@ -54112,7 +54309,7 @@ __kernel void minimum_I32I32toI32_2D\n\ float4 data0 = convert_float4(src0) * input0Scale - input0Tail;\n\ float4 data1 = convert_float4(src1) * input1Scale - input1Tail;\n\ float4 data = data0 < data1 ? data0 : data1;\n\ - int4 dst = convert_int4(data * outputScale + outputZP);\n\ + int4 dst = convert_int4_rte(data * outputScale + outputZP);\n\ \n\ write_imagei(output, coord, dst);\n\ }\n\ @@ -58290,15 +58487,17 @@ __kernel void resize_nearest_U8toU8(\n\ }\n\ "; /* end of resize_nearest_cl*/ -static const char roi_align_cl[] = "inline float roi_align_1x1\n\ +static const char roi_align_cl[] = "\n\ +inline float roi_align_1x1\n\ (\n\ __read_only image2d_array_t input,\n\ - float2 region_start,\n\ - float2 region_end,\n\ - float2 bin_size,\n\ - int2 grid_size,\n\ - float2 rcp_of_grid_size,\n\ - int pz\n\ + float2 region_start,\n\ + float2 region_end,\n\ + float2 bin_size,\n\ + int2 grid_size,\n\ + float2 rcp_of_grid_size,\n\ + int pz,\n\ + int4 max_spatial_dims\n\ )\n\ {\n\ float sum = 0;\n\ @@ -58313,15 +58512,24 @@ static const char roi_align_cl[] = "inline float roi_align_1x1\n\ int2 xy_low = convert_int2(pos);\n\ int2 xy_high = xy_low + 1;\n\ \n\ - float ly = pos.y - xy_low.y;\n\ - float lx = pos.x - xy_low.x;\n\ - float hy = 1.0f - ly;\n\ - float hx = 1.0f - lx;\n\ + if (xy_low.x > max_spatial_dims.x || max_spatial_dims.x < -1 ||\n\ + xy_low.y > max_spatial_dims.y || max_spatial_dims.y < -1 )\n\ + {\n\ + continue;\n\ + }\n\ +\n\ + float2 lxy = pos - floor(pos);\n\ + float2 zero = 0;\n\ +\n\ + lxy = xy_low >= max_spatial_dims.zw ? 
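
The maximum/minimum I32 kernels above switch the final store from convert_int4 (which truncates toward zero, OpenCL's default for float-to-int conversion) to convert_int4_rte (round to nearest, ties to even), removing a systematic downward bias when requantizing. A small C illustration of the difference, using rintf() under the default rounding mode as a stand-in for the _rte conversion (an analogy, not the OpenCL builtin itself):

#include <math.h>
#include <stdio.h>

int main(void)
{
    /* Requantized values near .5 boundaries show where the modes diverge. */
    const float vals[4] = { 2.5f, 3.5f, -2.5f, 4.999999f };
    for (int i = 0; i < 4; i++)
    {
        int trunc_toward_zero  = (int)vals[i];         /* convert_int behaviour     */
        int round_nearest_even = (int)rintf(vals[i]);  /* convert_int_rte behaviour */
        printf("%f -> trunc %d, rte %d\n",
               vals[i], trunc_toward_zero, round_nearest_even);
    }
    return 0;
}
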
0.0 : lxy;\n\ +\n\ + float hy = 1.0f - lxy.y;\n\ + float hx = 1.0f - lxy.x;\n\ \n\ float w1 = hy * hx;\n\ - float w2 = hy * lx;\n\ - float w3 = ly * hx;\n\ - float w4 = ly * lx;\n\ + float w2 = lxy.x - lxy.x * lxy.y;\n\ + float w3 = lxy.y - lxy.x * lxy.y;\n\ + float w4 = lxy.y * lxy.x;\n\ \n\ float data1 = read_imagef(input, (int4)(xy_low.x, xy_low.y, pz, 0)).x;\n\ float data2 = read_imagef(input, (int4)(xy_high.x, xy_low.y, pz, 0)).x;\n\ @@ -58335,8 +58543,9 @@ static const char roi_align_cl[] = "inline float roi_align_1x1\n\ return (float)(sum * rcp_of_grid_size.x * rcp_of_grid_size.y);\n\ }\n\ \n\ -\n\ #define EPS_GRID 0.00001f\n\ +#define TYPE_FLOAT16 (1)\n\ +#define TYPE_FLOAT32 (2)\n\ __kernel void roi_align_F32_F32toF32\n\ (\n\ __read_only image2d_array_t input,\n\ @@ -58349,13 +58558,14 @@ __kernel void roi_align_F32_F32toF32\n\ float output_zp,\n\ float spatial_x_scale,\n\ float spatial_y_scale,\n\ - float in_width,\n\ - float in_height,\n\ + int in_width,\n\ + int in_height,\n\ float rcp_of_out_width,\n\ float rcp_of_out_height,\n\ float sampling_x_ratio,\n\ float sampling_y_ratio,\n\ - int depth\n\ + int depth,\n\ + int dtype\n\ )\n\ {\n\ int px = get_global_id(0);\n\ @@ -58374,7 +58584,10 @@ __kernel void roi_align_F32_F32toF32\n\ \n\ float2 spatial_indx = (float2)(px, py);\n\ float2 pooled_dims = (float2)(rcp_of_out_width, rcp_of_out_height);\n\ - float2 max_spatial_dims = (float2)(in_width, in_height);\n\ + int4 max_spatial_dims = (int4)(in_width, in_height, in_width, in_height);\n\ + max_spatial_dims.zw = max_spatial_dims.zw - 1;\n\ +\n\ + float2 max_limiatation = convert_float2(max_spatial_dims.zw);\n\ \n\ float2 bin_size = roi_dims * pooled_dims;\n\ float2 region_start = spatial_indx * bin_size + roi_anchor.xy;\n\ @@ -58397,9 +58610,28 @@ __kernel void roi_align_F32_F32toF32\n\ bin_size,\n\ grid_size_xy,\n\ rcp_of_grid_size,\n\ - kz);\n\ + kz,\n\ + max_spatial_dims);\n\ \n\ - write_imagef(output, (int4)(px, py, kz1, 0), interp);\n\ + if (dtype == TYPE_FLOAT16)\n\ + {\n\ + half tmp;\n\ + short dst;\n\ + _viv_asm(CONV, tmp, interp.x);\n\ + _viv_asm(COPY, dst, tmp, 2);\n\ +\n\ + Tensor out_t = create_tensor_from_image2d_array(output, 2);\n\ + short *output_ptr = (short *)get_tensor_ptr_from_coord(out_t, (int4)(px, py, kz1, 0));\n\ +\n\ + output_ptr[0] = dst;\n\ + }\n\ + else\n\ + {\n\ + Tensor out_t = create_tensor_from_image2d_array(output, 4);\n\ + float *output_ptr = (float *)get_tensor_ptr_from_coord(out_t, (int4)(px, py, kz1, 0));\n\ +\n\ + output_ptr[0] = interp.x;\n\ + }\n\ }\n\ }\n\ \n\ @@ -58413,7 +58645,8 @@ inline float roi_align_1x1_U8toF32\n\ float2 bin_size,\n\ int2 grid_size,\n\ float2 rcp_of_grid_size,\n\ - int pz\n\ + int pz,\n\ + int4 max_spatial_dims\n\ )\n\ {\n\ float sum = 0;\n\ @@ -58424,33 +58657,43 @@ inline float roi_align_1x1_U8toF32\n\ {\n\ float2 ixy = (float2)(ix + 0.5f, iy + 0.5f);\n\ float2 pos = region_start + ixy * bin_size * rcp_of_grid_size;\n\ -\n\ + \n\ int2 xy_low = convert_int2(pos);\n\ int2 xy_high = xy_low + 1;\n\ -\n\ - float ly = pos.y - xy_low.y;\n\ - float lx = pos.x - xy_low.x;\n\ - float hy = 1.0f - ly;\n\ - float hx = 1.0f - lx;\n\ -\n\ + \n\ + float2 lxy = pos - floor(pos);\n\ + float2 zero = 0;\n\ + \n\ + if (xy_low.x > max_spatial_dims.x || max_spatial_dims.x < -1 ||\n\ + xy_low.y > max_spatial_dims.y || max_spatial_dims.y < -1 )\n\ + {\n\ + continue;\n\ + }\n\ + \n\ + lxy = xy_low >= max_spatial_dims.zw ? 
0.0 : lxy;\n\ + \n\ + float hy = 1.0f - lxy.y;\n\ + float hx = 1.0f - lxy.x;\n\ + \n\ float w1 = hy * hx;\n\ - float w2 = hy * lx;\n\ - float w3 = ly * hx;\n\ - float w4 = ly * lx;\n\ -\n\ + float w2 = lxy.x - lxy.x * lxy.y;\n\ + float w3 = lxy.y - lxy.x * lxy.y;\n\ + float w4 = lxy.y * lxy.x;\n\ + \n\ uint4 data;\n\ data.x = read_imageui(input, (int4)(xy_low.x, xy_low.y, pz, 0)).x;\n\ data.y = read_imageui(input, (int4)(xy_high.x, xy_low.y, pz, 0)).x;\n\ data.z = read_imageui(input, (int4)(xy_low.x, xy_high.y, pz, 0)).x;\n\ data.w = read_imageui(input, (int4)(xy_high.x, xy_high.y, pz, 0)).x;\n\ -\n\ + \n\ float4 value = convert_float4(data) * input_scale + input_tail;\n\ -\n\ + \n\ sum = sum + w1 * value.x + w2 * value.y + w3 * value.z + w4 * value.w;\n\ }\n\ }\n\ -\n\ + \n\ return (float)(sum * rcp_of_grid_size.x * rcp_of_grid_size.y);\n\ +\n\ }\n\ \n\ __kernel void roi_align_U8_U16toU8\n\ @@ -58465,13 +58708,14 @@ __kernel void roi_align_U8_U16toU8\n\ float output_zp,\n\ float spatial_x_scale,\n\ float spatial_y_scale,\n\ - float in_width,\n\ - float in_height,\n\ + int in_width,\n\ + int in_height,\n\ float rcp_of_out_width,\n\ float rcp_of_out_height,\n\ float sampling_x_ratio,\n\ float sampling_y_ratio,\n\ - int depth\n\ + int depth,\n\ + int dtype\n\ )\n\ {\n\ int px = get_global_id(0);\n\ @@ -58490,7 +58734,10 @@ __kernel void roi_align_U8_U16toU8\n\ \n\ float2 spatial_indx = (float2)(px, py);\n\ float2 pooled_dims = (float2)(rcp_of_out_width, rcp_of_out_height);\n\ - float2 max_spatial_dims = (float2)(in_width, in_height);\n\ + int4 max_spatial_dims = (int4)(in_width, in_height, in_width, in_height);\n\ + max_spatial_dims.zw = max_spatial_dims.zw - 1;\n\ +\n\ + float2 max_limiatation = convert_float2(max_spatial_dims.zw);\n\ \n\ float2 bin_size = roi_dims * pooled_dims;\n\ float2 region_start = spatial_indx * bin_size + roi_anchor.xy;\n\ @@ -58515,16 +58762,909 @@ __kernel void roi_align_U8_U16toU8\n\ bin_size,\n\ grid_size_xy,\n\ rcp_of_grid_size,\n\ - kz);\n\ + kz,\n\ + max_spatial_dims);\n\ \n\ - uint4 dst;\n\ + uchar dst;\n\ interp.x = interp.x * output_scale + output_zp;\n\ interp.x = interp.x < 255 ? 
interp.x : 255;\n\ - dst.x = convert_uint_rte(interp.x);\n\ - write_imageui(output, (int4)(px, py, kz1, 0), dst.xxxx);\n\ + dst = convert_uchar_rte(interp.x);\n\ +\n\ + Tensor out_t = create_tensor_from_image2d_array(output, 1);\n\ + uchar *output_ptr = (uchar *)get_tensor_ptr_from_coord(out_t, (int4)(px, py, kz1, 0));\n\ + \n\ + output_ptr[0] = dst;\n\ }\n\ }"; /* end of roi_align_cl*/ +static const char scatter_elements_cl[] = "\n\ +#define SCATTER_ELEMENTS_AXIS0_32BITS_IMPL(name, dtype) \\\n\ +__kernel void scatter_elements_axis0_##name \\\n\ + ( \\\n\ + __read_only image2d_t ref, \\\n\ + __read_only image2d_t indices, \\\n\ + __read_only image2d_t update, \\\n\ + __write_only image2d_t output, \\\n\ + int axis, \\\n\ + int reduction, \\\n\ + float ref_scale, \\\n\ + float ref_tail, \\\n\ + float update_scale, \\\n\ + float update_tail, \\\n\ + float output_zp, \\\n\ + int inner_size, \\\n\ + int axis_size, \\\n\ + int outer_size \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + \\\n\ + Image ref_i = create_image_from_image2d(ref, 4); \\\n\ + Image update_i = create_image_from_image2d(update, 4); \\\n\ + Image indices_i = create_image_from_image2d(indices, 4); \\\n\ + Image output_i = create_image_from_image2d(output, 4); \\\n\ + \\\n\ + dtype *ref_ptr = (dtype *)get_image_ptr_from_coord(ref_i, coord.xy); \\\n\ + dtype *output_ptr = (dtype *)get_image_ptr_from_coord(output_i, coord.xy); \\\n\ + dtype data = ref_ptr[0]; \\\n\ + if (coord.y < outer_size) \\\n\ + { \\\n\ + dtype *update_ptr = (dtype *)get_image_ptr_from_coord(update_i, coord.wy); \\\n\ + int *indices_ptr = (int *)get_image_ptr_from_coord(indices_i, coord.wy); \\\n\ + for(int x = 0; x < axis_size; x ++) \\\n\ + { \\\n\ + int offset = indices_ptr[x]; \\\n\ + if (offset == coord.x) \\\n\ + { \\\n\ + data = update_ptr[x]; \\\n\ + break; \\\n\ + } \\\n\ + } \\\n\ + } \\\n\ + \\\n\ + output_ptr[0] = data; \\\n\ +}\n\ +SCATTER_ELEMENTS_AXIS0_32BITS_IMPL(F32_I32_F32toF32, float)\n\ +SCATTER_ELEMENTS_AXIS0_32BITS_IMPL(I32_I32_I32toI32, int)\n\ +\n\ +#define SCATTER_ELEMENTS_AXIS0_16BITS_IMPL(name, dtype, conv_func) \\\n\ +__kernel void scatter_elements_axis0_##name \\\n\ + ( \\\n\ + __read_only image2d_t ref, \\\n\ + __read_only image2d_t indices, \\\n\ + __read_only image2d_t update, \\\n\ + __write_only image2d_t output, \\\n\ + int axis, \\\n\ + int reduction, \\\n\ + float ref_scale, \\\n\ + float ref_tail, \\\n\ + float update_scale, \\\n\ + float update_tail, \\\n\ + float output_zp, \\\n\ + int inner_size, \\\n\ + int axis_size, \\\n\ + int outer_size \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + \\\n\ + Image ref_i = create_image_from_image2d(ref, 2); \\\n\ + Image update_i = create_image_from_image2d(update, 2); \\\n\ + Image indices_i = create_image_from_image2d(indices, 4); \\\n\ + Image output_i = create_image_from_image2d(output, 2); \\\n\ + \\\n\ + dtype *ref_ptr = (dtype *)get_image_ptr_from_coord(ref_i, coord.xy); \\\n\ + dtype *output_ptr = (dtype *)get_image_ptr_from_coord(output_i, coord.xy); \\\n\ + dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \\\n\ + if (coord.y < outer_size) \\\n\ + { \\\n\ + dtype *update_ptr = (dtype *)get_image_ptr_from_coord(update_i, coord.wy); \\\n\ + int *indices_ptr = (int *)get_image_ptr_from_coord(indices_i, coord.wy); \\\n\ + for(int x = 0; x < axis_size; x ++) \\\n\ + { \\\n\ + int offset = indices_ptr[x]; \\\n\ + 
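
The roi_align_1x1 helpers above still perform bilinear interpolation per grid sample; the rewritten weights (w2 = lx - lx*ly, w3 = ly - lx*ly, w4 = lx*ly) are algebraically identical to hy*lx, ly*hx and ly*lx, and the new max_spatial_dims argument lets the kernel skip samples that fall outside the feature map and zero the fractional offset at the right/bottom edge so out-of-range neighbours receive zero weight. A simplified C sketch of one bilinear sample with comparable edge handling (index clamping stands in for the image sampler's border behaviour; this is not a line-for-line port of the kernel):

#include <math.h>
#include <stdio.h>

/* One bilinear sample at (x, y) on a width x height row-major map. */
static float bilinear_sample(const float *map, int width, int height,
                             float x, float y)
{
    int x0 = (int)floorf(x), y0 = (int)floorf(y);

    if (x0 > width - 1 || y0 > height - 1 || x0 < -1 || y0 < -1)
        return 0.0f;                       /* sample completely outside: skip */

    float lx = x - (float)x0, ly = y - (float)y0;
    if (x0 >= width  - 1) lx = 0.0f;       /* zero weight past the last column */
    if (y0 >= height - 1) ly = 0.0f;       /* zero weight past the last row    */

    int x1 = x0 + 1 < width  ? x0 + 1 : width  - 1;
    int y1 = y0 + 1 < height ? y0 + 1 : height - 1;
    x0 = x0 < 0 ? 0 : x0;
    y0 = y0 < 0 ? 0 : y0;

    float hx = 1.0f - lx, hy = 1.0f - ly;
    float w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;

    return w1 * map[y0 * width + x0] + w2 * map[y0 * width + x1] +
           w3 * map[y1 * width + x0] + w4 * map[y1 * width + x1];
}

int main(void)
{
    const float map[4] = { 0.0f, 1.0f, 2.0f, 3.0f };       /* 2x2 map */
    printf("%f\n", bilinear_sample(map, 2, 2, 0.5f, 0.5f)); /* 1.5 */
    return 0;
}
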
if (offset == coord.x) \\\n\ + { \\\n\ + data = conv_func(convert_float(update_ptr[x]) * update_scale + update_tail + output_zp); \\\n\ + break; \\\n\ + } \\\n\ + } \\\n\ + } \\\n\ + \\\n\ + output_ptr[0] = data; \\\n\ +}\n\ +SCATTER_ELEMENTS_AXIS0_16BITS_IMPL(I16_I32_I16toI16, short, convert_short_rte)\n\ +SCATTER_ELEMENTS_AXIS0_16BITS_IMPL(F16_I32_F16toF16, short, convert_short)\n\ +SCATTER_ELEMENTS_AXIS0_16BITS_IMPL(BF16_I32_BF16toBF16, ushort, convert_ushort)\n\ +\n\ +#define SCATTER_ELEMENTS_AXIS0_8BITS_IMPL(name, dtype, conv_func) \\\n\ +__kernel void scatter_elements_axis0_##name \\\n\ + ( \\\n\ + __read_only image2d_t ref, \\\n\ + __read_only image2d_t indices, \\\n\ + __read_only image2d_t update, \\\n\ + __write_only image2d_t output, \\\n\ + int axis, \\\n\ + int reduction, \\\n\ + float ref_scale, \\\n\ + float ref_tail, \\\n\ + float update_scale, \\\n\ + float update_tail, \\\n\ + float output_zp, \\\n\ + int inner_size, \\\n\ + int axis_size, \\\n\ + int outer_size \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + \\\n\ + Image ref_i = create_image_from_image2d(ref, 1); \\\n\ + Image update_i = create_image_from_image2d(update, 1); \\\n\ + Image indices_i = create_image_from_image2d(indices, 4); \\\n\ + Image output_i = create_image_from_image2d(output, 1); \\\n\ + \\\n\ + dtype *ref_ptr = (dtype *)get_image_ptr_from_coord(ref_i, coord.xy); \\\n\ + dtype *output_ptr = (dtype *)get_image_ptr_from_coord(output_i, coord.xy); \\\n\ + dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \\\n\ + if (coord.y < outer_size) \\\n\ + { \\\n\ + dtype *update_ptr = (dtype *)get_image_ptr_from_coord(update_i, coord.wy); \\\n\ + int *indices_ptr = (int *)get_image_ptr_from_coord(indices_i, coord.wy); \\\n\ + for(int x = 0; x < axis_size; x ++) \\\n\ + { \\\n\ + int offset = indices_ptr[x]; \\\n\ + if (offset == coord.x) \\\n\ + { \\\n\ + data = conv_func(convert_float(update_ptr[x]) * update_scale + update_tail + output_zp); \\\n\ + break; \\\n\ + } \\\n\ + } \\\n\ + } \\\n\ + \\\n\ + output_ptr[0] = data; \\\n\ +}\n\ +SCATTER_ELEMENTS_AXIS0_8BITS_IMPL(U8_I32_U8toU8, uchar, convert_uchar_rte)\n\ +SCATTER_ELEMENTS_AXIS0_8BITS_IMPL(I8_I32_I8toI8, char, convert_char)\n\ +\n\ +#define SCATTER_ELEMENTS_AXIS1_32BITS_IMPL(name, dtype) \\\n\ +__kernel void scatter_elements_axis1_##name \\\n\ + ( \\\n\ + __read_only image2d_array_t ref, \\\n\ + __read_only image2d_array_t indices, \\\n\ + __read_only image2d_array_t update, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, \\\n\ + int reduction, \\\n\ + float ref_scale, \\\n\ + float ref_tail, \\\n\ + float update_scale, \\\n\ + float update_tail, \\\n\ + float output_zp, \\\n\ + int inner_size, \\\n\ + int axis_size, \\\n\ + int outer_size \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + Tensor ref_i = create_tensor_from_image2d_array(ref, 4); \\\n\ + Tensor update_i = create_tensor_from_image2d_array(update, 4); \\\n\ + Tensor indices_i = create_tensor_from_image2d_array(indices, 4); \\\n\ + Tensor output_i = create_tensor_from_image2d_array(output, 4); \\\n\ + \\\n\ + dtype *ref_ptr = (dtype *)get_tensor_ptr_from_coord(ref_i, coord); \\\n\ + dtype *output_ptr = (dtype *)get_tensor_ptr_from_coord(output_i, coord); \\\n\ + dtype data = ref_ptr[0]; \\\n\ + if (coord.x < inner_size && coord.z < outer_size) \\\n\ + { \\\n\ + dtype *update_ptr = (dtype 
*)get_tensor_ptr_from_coord(update_i, coord.xwzw); \\\n\ + int *indices_ptr = (int *)get_tensor_ptr_from_coord(indices_i, coord.xwzw); \\\n\ + for(int y = 0; y < axis_size; y ++) \\\n\ + { \\\n\ + int offset = indices_ptr[y * inner_size]; \\\n\ + if (offset == coord.y) \\\n\ + { \\\n\ + data = update_ptr[y * inner_size]; \\\n\ + break; \\\n\ + } \\\n\ + } \\\n\ + } \\\n\ + \\\n\ + output_ptr[0] = data; \\\n\ +}\n\ +SCATTER_ELEMENTS_AXIS1_32BITS_IMPL(F32_I32_F32toF32, float)\n\ +SCATTER_ELEMENTS_AXIS1_32BITS_IMPL(I32_I32_I32toI32, int)\n\ +\n\ +#define SCATTER_ELEMENTS_AXIS1_16BITS_IMPL(name, dtype, conv_func) \\\n\ +__kernel void scatter_elements_axis1_##name \\\n\ + ( \\\n\ + __read_only image2d_array_t ref, \\\n\ + __read_only image2d_array_t indices, \\\n\ + __read_only image2d_array_t update, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, \\\n\ + int reduction, \\\n\ + float ref_scale, \\\n\ + float ref_tail, \\\n\ + float update_scale, \\\n\ + float update_tail, \\\n\ + float output_zp, \\\n\ + int inner_size, \\\n\ + int axis_size, \\\n\ + int outer_size \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + Tensor ref_i = create_tensor_from_image2d_array(ref, 2); \\\n\ + Tensor update_i = create_tensor_from_image2d_array(update, 2); \\\n\ + Tensor indices_i = create_tensor_from_image2d_array(indices, 4); \\\n\ + Tensor output_i = create_tensor_from_image2d_array(output, 2); \\\n\ + \\\n\ + dtype *ref_ptr = (dtype *)get_tensor_ptr_from_coord(ref_i, coord); \\\n\ + dtype *output_ptr = (dtype *)get_tensor_ptr_from_coord(output_i, coord); \\\n\ + dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \\\n\ + if (coord.x < inner_size && coord.z < outer_size) \\\n\ + { \\\n\ + dtype *update_ptr = (dtype *)get_tensor_ptr_from_coord(update_i, coord.xwzw); \\\n\ + int *indices_ptr = (int *)get_tensor_ptr_from_coord(indices_i, coord.xwzw); \\\n\ + for(int y = 0; y < axis_size; y ++) \\\n\ + { \\\n\ + int offset = indices_ptr[y * inner_size]; \\\n\ + if (offset == coord.y) \\\n\ + { \\\n\ + data = conv_func(convert_float(update_ptr[y * inner_size]) \\\n\ + * update_scale + update_tail + output_zp); \\\n\ + break; \\\n\ + } \\\n\ + } \\\n\ + } \\\n\ + \\\n\ + output_ptr[0] = data; \\\n\ +}\n\ +SCATTER_ELEMENTS_AXIS1_16BITS_IMPL(I16_I32_I16toI16, short, convert_short_rte)\n\ +SCATTER_ELEMENTS_AXIS1_16BITS_IMPL(F16_I32_F16toF16, short, convert_short)\n\ +SCATTER_ELEMENTS_AXIS1_16BITS_IMPL(BF16_I32_BF16toBF16, ushort, convert_ushort)\n\ +\n\ +#define SCATTER_ELEMENTS_AXIS1_8BITS_IMPL(name, dtype, conv_func) \\\n\ +__kernel void scatter_elements_axis1_##name \\\n\ + ( \\\n\ + __read_only image2d_array_t ref, \\\n\ + __read_only image2d_array_t indices, \\\n\ + __read_only image2d_array_t update, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, \\\n\ + int reduction, \\\n\ + float ref_scale, \\\n\ + float ref_tail, \\\n\ + float update_scale, \\\n\ + float update_tail, \\\n\ + float output_zp, \\\n\ + int inner_size, \\\n\ + int axis_size, \\\n\ + int outer_size \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + Tensor ref_i = create_tensor_from_image2d_array(ref, 1); \\\n\ + Tensor update_i = create_tensor_from_image2d_array(update, 1); \\\n\ + Tensor indices_i = create_tensor_from_image2d_array(indices, 4); \\\n\ + Tensor output_i = create_tensor_from_image2d_array(output, 1); \\\n\ + \\\n\ + dtype 
*ref_ptr = (dtype *)get_tensor_ptr_from_coord(ref_i, coord); \\\n\ + dtype *output_ptr = (dtype *)get_tensor_ptr_from_coord(output_i, coord); \\\n\ + dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \\\n\ + if (coord.x < inner_size && coord.z < outer_size) \\\n\ + { \\\n\ + dtype *update_ptr = (dtype *)get_tensor_ptr_from_coord(update_i, coord.xwzw); \\\n\ + int *indices_ptr = (int *)get_tensor_ptr_from_coord(indices_i, coord.xwzw); \\\n\ + for(int y = 0; y < axis_size; y ++) \\\n\ + { \\\n\ + int offset = indices_ptr[y * inner_size]; \\\n\ + if (offset == coord.y) \\\n\ + { \\\n\ + data = conv_func(convert_float(update_ptr[y * inner_size]) \\\n\ + * update_scale + update_tail + output_zp); \\\n\ + break; \\\n\ + } \\\n\ + } \\\n\ + } \\\n\ + \\\n\ + output_ptr[0] = data; \\\n\ +}\n\ +SCATTER_ELEMENTS_AXIS1_8BITS_IMPL(U8_I32_U8toU8, uchar, convert_uchar_rte)\n\ +SCATTER_ELEMENTS_AXIS1_8BITS_IMPL(I8_I32_I8toI8, char, convert_char)\n\ +"; /* end of scatter_elements_cl*/ + +static const char scatter_elements_add_cl[] = "\n\ +#define SE_ADD_AXIS0_32BITS_IMPL(name, dtype) \\\n\ +__kernel void scatter_elements_add_axis0_##name \\\n\ + ( \\\n\ + __read_only image2d_t ref, \\\n\ + __read_only image2d_t indices, \\\n\ + __read_only image2d_t update, \\\n\ + __write_only image2d_t output, \\\n\ + int axis, \\\n\ + int reduction, \\\n\ + float ref_scale, \\\n\ + float ref_tail, \\\n\ + float update_scale, \\\n\ + float update_tail, \\\n\ + float output_zp, \\\n\ + int inner_size, \\\n\ + int axis_size, \\\n\ + int outer_size \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + \\\n\ + Image ref_i = create_image_from_image2d(ref, 4); \\\n\ + Image update_i = create_image_from_image2d(update, 4); \\\n\ + Image indices_i = create_image_from_image2d(indices, 4); \\\n\ + Image output_i = create_image_from_image2d(output, 4); \\\n\ + \\\n\ + dtype *ref_ptr = (dtype *)get_image_ptr_from_coord(ref_i, coord.xy); \\\n\ + dtype *output_ptr = (dtype *)get_image_ptr_from_coord(output_i, coord.xy); \\\n\ + dtype data = ref_ptr[0]; \\\n\ + if (coord.y < outer_size) \\\n\ + { \\\n\ + dtype *update_ptr = (dtype *)get_image_ptr_from_coord(update_i, coord.wy); \\\n\ + int *indices_ptr = (int *)get_image_ptr_from_coord(indices_i, coord.wy); \\\n\ + for(int x = 0; x < axis_size; x ++) \\\n\ + { \\\n\ + int offset = indices_ptr[x]; \\\n\ + if (offset == coord.x) \\\n\ + { \\\n\ + data += update_ptr[x]; \\\n\ + } \\\n\ + } \\\n\ + } \\\n\ + \\\n\ + output_ptr[0] = data; \\\n\ +}\n\ +SE_ADD_AXIS0_32BITS_IMPL(F32_I32_F32toF32, float)\n\ +SE_ADD_AXIS0_32BITS_IMPL(I32_I32_I32toI32, int)\n\ +\n\ +#define SE_ADD_AXIS0_16BITS_IMPL(name, dtype, conv_func) \\\n\ +__kernel void scatter_elements_add_axis0_##name \\\n\ + ( \\\n\ + __read_only image2d_t ref, \\\n\ + __read_only image2d_t indices, \\\n\ + __read_only image2d_t update, \\\n\ + __write_only image2d_t output, \\\n\ + int axis, \\\n\ + int reduction, \\\n\ + float ref_scale, \\\n\ + float ref_tail, \\\n\ + float update_scale, \\\n\ + float update_tail, \\\n\ + float output_zp, \\\n\ + int inner_size, \\\n\ + int axis_size, \\\n\ + int outer_size \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + \\\n\ + Image ref_i = create_image_from_image2d(ref, 2); \\\n\ + Image update_i = create_image_from_image2d(update, 2); \\\n\ + Image indices_i = create_image_from_image2d(indices, 4); \\\n\ + Image output_i = 
create_image_from_image2d(output, 2); \\\n\ + \\\n\ + dtype *ref_ptr = (dtype *)get_image_ptr_from_coord(ref_i, coord.xy); \\\n\ + dtype *output_ptr = (dtype *)get_image_ptr_from_coord(output_i, coord.xy); \\\n\ + dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \\\n\ + if (coord.y < outer_size) \\\n\ + { \\\n\ + dtype *update_ptr = (dtype *)get_image_ptr_from_coord(update_i, coord.wy); \\\n\ + int *indices_ptr = (int *)get_image_ptr_from_coord(indices_i, coord.wy); \\\n\ + for(int x = 0; x < axis_size; x ++) \\\n\ + { \\\n\ + int offset = indices_ptr[x]; \\\n\ + if (offset == coord.x) \\\n\ + { \\\n\ + data += conv_func(convert_float(update_ptr[x]) * update_scale + update_tail + output_zp); \\\n\ + } \\\n\ + } \\\n\ + } \\\n\ + \\\n\ + output_ptr[0] = data; \\\n\ +}\n\ +SE_ADD_AXIS0_16BITS_IMPL(I16_I32_I16toI16, short, convert_short_rte)\n\ +SE_ADD_AXIS0_16BITS_IMPL(F16_I32_F16toF16, short, convert_short)\n\ +SE_ADD_AXIS0_16BITS_IMPL(BF16_I32_BF16toBF16, ushort, convert_ushort)\n\ +\n\ +#define SE_ADD_AXIS0_8BITS_IMPL(name, dtype, conv_func) \\\n\ +__kernel void scatter_elements_add_axis0_##name \\\n\ + ( \\\n\ + __read_only image2d_t ref, \\\n\ + __read_only image2d_t indices, \\\n\ + __read_only image2d_t update, \\\n\ + __write_only image2d_t output, \\\n\ + int axis, \\\n\ + int reduction, \\\n\ + float ref_scale, \\\n\ + float ref_tail, \\\n\ + float update_scale, \\\n\ + float update_tail, \\\n\ + float output_zp, \\\n\ + int inner_size, \\\n\ + int axis_size, \\\n\ + int outer_size \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + \\\n\ + Image ref_i = create_image_from_image2d(ref, 1); \\\n\ + Image update_i = create_image_from_image2d(update, 1); \\\n\ + Image indices_i = create_image_from_image2d(indices, 4); \\\n\ + Image output_i = create_image_from_image2d(output, 1); \\\n\ + \\\n\ + dtype *ref_ptr = (dtype *)get_image_ptr_from_coord(ref_i, coord.xy); \\\n\ + dtype *output_ptr = (dtype *)get_image_ptr_from_coord(output_i, coord.xy); \\\n\ + dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \\\n\ + if (coord.y < outer_size) \\\n\ + { \\\n\ + dtype *update_ptr = (dtype *)get_image_ptr_from_coord(update_i, coord.wy); \\\n\ + int *indices_ptr = (int *)get_image_ptr_from_coord(indices_i, coord.wy); \\\n\ + for(int x = 0; x < axis_size; x ++) \\\n\ + { \\\n\ + int offset = indices_ptr[x]; \\\n\ + if (offset == coord.x) \\\n\ + { \\\n\ + data += conv_func(convert_float(update_ptr[x]) * update_scale + update_tail + output_zp); \\\n\ + } \\\n\ + } \\\n\ + } \\\n\ + \\\n\ + output_ptr[0] = data; \\\n\ +}\n\ +SE_ADD_AXIS0_8BITS_IMPL(U8_I32_U8toU8, uchar, convert_uchar_rte)\n\ +SE_ADD_AXIS0_8BITS_IMPL(I8_I32_I8toI8, char, convert_char)\n\ +\n\ +#define SE_ADD_AXIS1_32BITS_IMPL(name, dtype) \\\n\ +__kernel void scatter_elements_add_axis1_##name \\\n\ + ( \\\n\ + __read_only image2d_array_t ref, \\\n\ + __read_only image2d_array_t indices, \\\n\ + __read_only image2d_array_t update, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, \\\n\ + int reduction, \\\n\ + float ref_scale, \\\n\ + float ref_tail, \\\n\ + float update_scale, \\\n\ + float update_tail, \\\n\ + float output_zp, \\\n\ + int inner_size, \\\n\ + int axis_size, \\\n\ + int outer_size \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + Tensor ref_i = create_tensor_from_image2d_array(ref, 4); \\\n\ + Tensor 
update_i = create_tensor_from_image2d_array(update, 4); \\\n\ + Tensor indices_i = create_tensor_from_image2d_array(indices, 4); \\\n\ + Tensor output_i = create_tensor_from_image2d_array(output, 4); \\\n\ + \\\n\ + dtype *ref_ptr = (dtype *)get_tensor_ptr_from_coord(ref_i, coord); \\\n\ + dtype *output_ptr = (dtype *)get_tensor_ptr_from_coord(output_i, coord); \\\n\ + dtype data = ref_ptr[0]; \\\n\ + if (coord.x < inner_size && coord.z < outer_size) \\\n\ + { \\\n\ + dtype *update_ptr = (dtype *)get_tensor_ptr_from_coord(update_i, coord.xwzw); \\\n\ + int *indices_ptr = (int *)get_tensor_ptr_from_coord(indices_i, coord.xwzw); \\\n\ + for(int y = 0; y < axis_size; y ++) \\\n\ + { \\\n\ + int offset = indices_ptr[y * inner_size]; \\\n\ + if (offset == coord.y) \\\n\ + { \\\n\ + data += update_ptr[y * inner_size]; \\\n\ + } \\\n\ + } \\\n\ + } \\\n\ + \\\n\ + output_ptr[0] = data; \\\n\ +}\n\ +SE_ADD_AXIS1_32BITS_IMPL(F32_I32_F32toF32, float)\n\ +SE_ADD_AXIS1_32BITS_IMPL(I32_I32_I32toI32, int)\n\ +\n\ +#define SE_ADD_AXIS1_16BITS_IMPL(name, dtype, conv_func) \\\n\ +__kernel void scatter_elements_add_axis1_##name \\\n\ + ( \\\n\ + __read_only image2d_array_t ref, \\\n\ + __read_only image2d_array_t indices, \\\n\ + __read_only image2d_array_t update, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, \\\n\ + int reduction, \\\n\ + float ref_scale, \\\n\ + float ref_tail, \\\n\ + float update_scale, \\\n\ + float update_tail, \\\n\ + float output_zp, \\\n\ + int inner_size, \\\n\ + int axis_size, \\\n\ + int outer_size \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + Tensor ref_i = create_tensor_from_image2d_array(ref, 2); \\\n\ + Tensor update_i = create_tensor_from_image2d_array(update, 2); \\\n\ + Tensor indices_i = create_tensor_from_image2d_array(indices, 4); \\\n\ + Tensor output_i = create_tensor_from_image2d_array(output, 2); \\\n\ + \\\n\ + dtype *ref_ptr = (dtype *)get_tensor_ptr_from_coord(ref_i, coord); \\\n\ + dtype *output_ptr = (dtype *)get_tensor_ptr_from_coord(output_i, coord); \\\n\ + dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \\\n\ + if (coord.x < inner_size && coord.z < outer_size) \\\n\ + { \\\n\ + dtype *update_ptr = (dtype *)get_tensor_ptr_from_coord(update_i, coord.xwzw); \\\n\ + int *indices_ptr = (int *)get_tensor_ptr_from_coord(indices_i, coord.xwzw); \\\n\ + for(int y = 0; y < axis_size; y ++) \\\n\ + { \\\n\ + int offset = indices_ptr[y * inner_size]; \\\n\ + if (offset == coord.y) \\\n\ + { \\\n\ + data += conv_func(convert_float(update_ptr[y * inner_size]) \\\n\ + * update_scale + update_tail + output_zp); \\\n\ + } \\\n\ + } \\\n\ + } \\\n\ + \\\n\ + output_ptr[0] = data; \\\n\ +}\n\ +SE_ADD_AXIS1_16BITS_IMPL(I16_I32_I16toI16, short, convert_short_rte)\n\ +SE_ADD_AXIS1_16BITS_IMPL(F16_I32_F16toF16, short, convert_short)\n\ +SE_ADD_AXIS1_16BITS_IMPL(BF16_I32_BF16toBF16, ushort, convert_ushort)\n\ +\n\ +#define SE_ADD_AXIS1_8BITS_IMPL(name, dtype, conv_func) \\\n\ +__kernel void scatter_elements_add_axis1_##name \\\n\ + ( \\\n\ + __read_only image2d_array_t ref, \\\n\ + __read_only image2d_array_t indices, \\\n\ + __read_only image2d_array_t update, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, \\\n\ + int reduction, \\\n\ + float ref_scale, \\\n\ + float ref_tail, \\\n\ + float update_scale, \\\n\ + float update_tail, \\\n\ + float output_zp, \\\n\ + int inner_size, \\\n\ + int axis_size, \\\n\ + int outer_size \\\n\ + ) 
\\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + Tensor ref_i = create_tensor_from_image2d_array(ref, 1); \\\n\ + Tensor update_i = create_tensor_from_image2d_array(update, 1); \\\n\ + Tensor indices_i = create_tensor_from_image2d_array(indices, 4); \\\n\ + Tensor output_i = create_tensor_from_image2d_array(output, 1); \\\n\ + \\\n\ + dtype *ref_ptr = (dtype *)get_tensor_ptr_from_coord(ref_i, coord); \\\n\ + dtype *output_ptr = (dtype *)get_tensor_ptr_from_coord(output_i, coord); \\\n\ + dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \\\n\ + if (coord.x < inner_size && coord.z < outer_size) \\\n\ + { \\\n\ + dtype *update_ptr = (dtype *)get_tensor_ptr_from_coord(update_i, coord.xwzw); \\\n\ + int *indices_ptr = (int *)get_tensor_ptr_from_coord(indices_i, coord.xwzw); \\\n\ + for(int y = 0; y < axis_size; y ++) \\\n\ + { \\\n\ + int offset = indices_ptr[y * inner_size]; \\\n\ + if (offset == coord.y) \\\n\ + { \\\n\ + data += conv_func(convert_float(update_ptr[y * inner_size]) \\\n\ + * update_scale + update_tail + output_zp); \\\n\ + } \\\n\ + } \\\n\ + } \\\n\ + \\\n\ + output_ptr[0] = data; \\\n\ +}\n\ +SE_ADD_AXIS1_8BITS_IMPL(U8_I32_U8toU8, uchar, convert_uchar_rte)\n\ +SE_ADD_AXIS1_8BITS_IMPL(I8_I32_I8toI8, char, convert_char)\n\ +"; /* end of scatter_elements_add_cl*/ + +static const char scatter_elements_mul_cl[] = "\n\ +#define SE_MUL_AXIS0_32BITS_IMPL(name, dtype) \\\n\ +__kernel void scatter_elements_mul_axis0_##name \\\n\ + ( \\\n\ + __read_only image2d_t ref, \\\n\ + __read_only image2d_t indices, \\\n\ + __read_only image2d_t update, \\\n\ + __write_only image2d_t output, \\\n\ + int axis, \\\n\ + int reduction, \\\n\ + float ref_scale, \\\n\ + float ref_tail, \\\n\ + float update_scale, \\\n\ + float update_tail, \\\n\ + float output_zp, \\\n\ + int inner_size, \\\n\ + int axis_size, \\\n\ + int outer_size \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + \\\n\ + Image ref_i = create_image_from_image2d(ref, 4); \\\n\ + Image update_i = create_image_from_image2d(update, 4); \\\n\ + Image indices_i = create_image_from_image2d(indices, 4); \\\n\ + Image output_i = create_image_from_image2d(output, 4); \\\n\ + \\\n\ + dtype *ref_ptr = (dtype *)get_image_ptr_from_coord(ref_i, coord.xy); \\\n\ + dtype *output_ptr = (dtype *)get_image_ptr_from_coord(output_i, coord.xy); \\\n\ + dtype data = ref_ptr[0]; \\\n\ + if (coord.y < outer_size) \\\n\ + { \\\n\ + dtype *update_ptr = (dtype *)get_image_ptr_from_coord(update_i, coord.wy); \\\n\ + int *indices_ptr = (int *)get_image_ptr_from_coord(indices_i, coord.wy); \\\n\ + for(int x = 0; x < axis_size; x ++) \\\n\ + { \\\n\ + int offset = indices_ptr[x]; \\\n\ + if (offset == coord.x) \\\n\ + { \\\n\ + data *= update_ptr[x]; \\\n\ + } \\\n\ + } \\\n\ + } \\\n\ + \\\n\ + output_ptr[0] = data; \\\n\ +}\n\ +SE_MUL_AXIS0_32BITS_IMPL(F32_I32_F32toF32, float)\n\ +SE_MUL_AXIS0_32BITS_IMPL(I32_I32_I32toI32, int)\n\ +\n\ +#define SE_MUL_AXIS0_16BITS_IMPL(name, dtype, conv_func) \\\n\ +__kernel void scatter_elements_mul_axis0_##name \\\n\ + ( \\\n\ + __read_only image2d_t ref, \\\n\ + __read_only image2d_t indices, \\\n\ + __read_only image2d_t update, \\\n\ + __write_only image2d_t output, \\\n\ + int axis, \\\n\ + int reduction, \\\n\ + float ref_scale, \\\n\ + float ref_tail, \\\n\ + float update_scale, \\\n\ + float update_tail, \\\n\ + float output_zp, \\\n\ + int inner_size, \\\n\ + 
int axis_size, \\\n\ + int outer_size \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + \\\n\ + Image ref_i = create_image_from_image2d(ref, 2); \\\n\ + Image update_i = create_image_from_image2d(update, 2); \\\n\ + Image indices_i = create_image_from_image2d(indices, 4); \\\n\ + Image output_i = create_image_from_image2d(output, 2); \\\n\ + \\\n\ + dtype *ref_ptr = (dtype *)get_image_ptr_from_coord(ref_i, coord.xy); \\\n\ + dtype *output_ptr = (dtype *)get_image_ptr_from_coord(output_i, coord.xy); \\\n\ + dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \\\n\ + if (coord.y < outer_size) \\\n\ + { \\\n\ + dtype *update_ptr = (dtype *)get_image_ptr_from_coord(update_i, coord.wy); \\\n\ + int *indices_ptr = (int *)get_image_ptr_from_coord(indices_i, coord.wy); \\\n\ + for(int x = 0; x < axis_size; x ++) \\\n\ + { \\\n\ + int offset = indices_ptr[x]; \\\n\ + if (offset == coord.x) \\\n\ + { \\\n\ + data *= conv_func(convert_float(update_ptr[x]) * update_scale + update_tail + output_zp); \\\n\ + } \\\n\ + } \\\n\ + } \\\n\ + \\\n\ + output_ptr[0] = data; \\\n\ +}\n\ +SE_MUL_AXIS0_16BITS_IMPL(I16_I32_I16toI16, short, convert_short_rte)\n\ +SE_MUL_AXIS0_16BITS_IMPL(F16_I32_F16toF16, short, convert_short)\n\ +SE_MUL_AXIS0_16BITS_IMPL(BF16_I32_BF16toBF16, ushort, convert_ushort)\n\ +\n\ +#define SE_MUL_AXIS0_8BITS_IMPL(name, dtype, conv_func) \\\n\ +__kernel void scatter_elements_mul_axis0_##name \\\n\ + ( \\\n\ + __read_only image2d_t ref, \\\n\ + __read_only image2d_t indices, \\\n\ + __read_only image2d_t update, \\\n\ + __write_only image2d_t output, \\\n\ + int axis, \\\n\ + int reduction, \\\n\ + float ref_scale, \\\n\ + float ref_tail, \\\n\ + float update_scale, \\\n\ + float update_tail, \\\n\ + float output_zp, \\\n\ + int inner_size, \\\n\ + int axis_size, \\\n\ + int outer_size \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), 0); \\\n\ + \\\n\ + Image ref_i = create_image_from_image2d(ref, 1); \\\n\ + Image update_i = create_image_from_image2d(update, 1); \\\n\ + Image indices_i = create_image_from_image2d(indices, 4); \\\n\ + Image output_i = create_image_from_image2d(output, 1); \\\n\ + \\\n\ + dtype *ref_ptr = (dtype *)get_image_ptr_from_coord(ref_i, coord.xy); \\\n\ + dtype *output_ptr = (dtype *)get_image_ptr_from_coord(output_i, coord.xy); \\\n\ + dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \\\n\ + if (coord.y < outer_size) \\\n\ + { \\\n\ + dtype *update_ptr = (dtype *)get_image_ptr_from_coord(update_i, coord.wy); \\\n\ + int *indices_ptr = (int *)get_image_ptr_from_coord(indices_i, coord.wy); \\\n\ + for(int x = 0; x < axis_size; x ++) \\\n\ + { \\\n\ + int offset = indices_ptr[x]; \\\n\ + if (offset == coord.x) \\\n\ + { \\\n\ + data *= conv_func(convert_float(update_ptr[x]) * update_scale + update_tail + output_zp); \\\n\ + } \\\n\ + } \\\n\ + } \\\n\ + \\\n\ + output_ptr[0] = data; \\\n\ +}\n\ +SE_MUL_AXIS0_8BITS_IMPL(U8_I32_U8toU8, uchar, convert_uchar_rte)\n\ +SE_MUL_AXIS0_8BITS_IMPL(I8_I32_I8toI8, char, convert_char)\n\ +\n\ +#define SE_MUL_AXIS1_32BITS_IMPL(name, dtype) \\\n\ +__kernel void scatter_elements_mul_axis1_##name \\\n\ + ( \\\n\ + __read_only image2d_array_t ref, \\\n\ + __read_only image2d_array_t indices, \\\n\ + __read_only image2d_array_t update, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, \\\n\ + int reduction, \\\n\ + float ref_scale, \\\n\ + 
float ref_tail, \\\n\ + float update_scale, \\\n\ + float update_tail, \\\n\ + float output_zp, \\\n\ + int inner_size, \\\n\ + int axis_size, \\\n\ + int outer_size \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + Tensor ref_i = create_tensor_from_image2d_array(ref, 4); \\\n\ + Tensor update_i = create_tensor_from_image2d_array(update, 4); \\\n\ + Tensor indices_i = create_tensor_from_image2d_array(indices, 4); \\\n\ + Tensor output_i = create_tensor_from_image2d_array(output, 4); \\\n\ + \\\n\ + dtype *ref_ptr = (dtype *)get_tensor_ptr_from_coord(ref_i, coord); \\\n\ + dtype *output_ptr = (dtype *)get_tensor_ptr_from_coord(output_i, coord); \\\n\ + dtype data = ref_ptr[0]; \\\n\ + if (coord.x < inner_size && coord.z < outer_size) \\\n\ + { \\\n\ + dtype *update_ptr = (dtype *)get_tensor_ptr_from_coord(update_i, coord.xwzw); \\\n\ + int *indices_ptr = (int *)get_tensor_ptr_from_coord(indices_i, coord.xwzw); \\\n\ + for(int y = 0; y < axis_size; y ++) \\\n\ + { \\\n\ + int offset = indices_ptr[y * inner_size]; \\\n\ + if (offset == coord.y) \\\n\ + { \\\n\ + data *= update_ptr[y * inner_size]; \\\n\ + } \\\n\ + } \\\n\ + } \\\n\ + \\\n\ + output_ptr[0] = data; \\\n\ +}\n\ +SE_MUL_AXIS1_32BITS_IMPL(F32_I32_F32toF32, float)\n\ +SE_MUL_AXIS1_32BITS_IMPL(I32_I32_I32toI32, int)\n\ +\n\ +#define SE_MUL_AXIS1_16BITS_IMPL(name, dtype, conv_func) \\\n\ +__kernel void scatter_elements_mul_axis1_##name \\\n\ + ( \\\n\ + __read_only image2d_array_t ref, \\\n\ + __read_only image2d_array_t indices, \\\n\ + __read_only image2d_array_t update, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, \\\n\ + int reduction, \\\n\ + float ref_scale, \\\n\ + float ref_tail, \\\n\ + float update_scale, \\\n\ + float update_tail, \\\n\ + float output_zp, \\\n\ + int inner_size, \\\n\ + int axis_size, \\\n\ + int outer_size \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + Tensor ref_i = create_tensor_from_image2d_array(ref, 2); \\\n\ + Tensor update_i = create_tensor_from_image2d_array(update, 2); \\\n\ + Tensor indices_i = create_tensor_from_image2d_array(indices, 4); \\\n\ + Tensor output_i = create_tensor_from_image2d_array(output, 2); \\\n\ + \\\n\ + dtype *ref_ptr = (dtype *)get_tensor_ptr_from_coord(ref_i, coord); \\\n\ + dtype *output_ptr = (dtype *)get_tensor_ptr_from_coord(output_i, coord); \\\n\ + dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \\\n\ + if (coord.x < inner_size && coord.z < outer_size) \\\n\ + { \\\n\ + dtype *update_ptr = (dtype *)get_tensor_ptr_from_coord(update_i, coord.xwzw); \\\n\ + int *indices_ptr = (int *)get_tensor_ptr_from_coord(indices_i, coord.xwzw); \\\n\ + for(int y = 0; y < axis_size; y ++) \\\n\ + { \\\n\ + int offset = indices_ptr[y * inner_size]; \\\n\ + if (offset == coord.y) \\\n\ + { \\\n\ + data *= conv_func(convert_float(update_ptr[y * inner_size]) \\\n\ + * update_scale + update_tail + output_zp); \\\n\ + } \\\n\ + } \\\n\ + } \\\n\ + \\\n\ + output_ptr[0] = data; \\\n\ +}\n\ +SE_MUL_AXIS1_16BITS_IMPL(I16_I32_I16toI16, short, convert_short_rte)\n\ +SE_MUL_AXIS1_16BITS_IMPL(F16_I32_F16toF16, short, convert_short)\n\ +SE_MUL_AXIS1_16BITS_IMPL(BF16_I32_BF16toBF16, ushort, convert_ushort)\n\ +\n\ +#define SE_MUL_AXIS1_8BITS_IMPL(name, dtype, conv_func) \\\n\ +__kernel void scatter_elements_mul_axis1_##name \\\n\ + ( \\\n\ + __read_only image2d_array_t ref, \\\n\ + __read_only 
image2d_array_t indices, \\\n\ + __read_only image2d_array_t update, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, \\\n\ + int reduction, \\\n\ + float ref_scale, \\\n\ + float ref_tail, \\\n\ + float update_scale, \\\n\ + float update_tail, \\\n\ + float output_zp, \\\n\ + int inner_size, \\\n\ + int axis_size, \\\n\ + int outer_size \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + Tensor ref_i = create_tensor_from_image2d_array(ref, 1); \\\n\ + Tensor update_i = create_tensor_from_image2d_array(update, 1); \\\n\ + Tensor indices_i = create_tensor_from_image2d_array(indices, 4); \\\n\ + Tensor output_i = create_tensor_from_image2d_array(output, 1); \\\n\ + \\\n\ + dtype *ref_ptr = (dtype *)get_tensor_ptr_from_coord(ref_i, coord); \\\n\ + dtype *output_ptr = (dtype *)get_tensor_ptr_from_coord(output_i, coord); \\\n\ + dtype data = conv_func(convert_float(ref_ptr[0]) * ref_scale + ref_tail + output_zp); \\\n\ + if (coord.x < inner_size && coord.z < outer_size) \\\n\ + { \\\n\ + dtype *update_ptr = (dtype *)get_tensor_ptr_from_coord(update_i, coord.xwzw); \\\n\ + int *indices_ptr = (int *)get_tensor_ptr_from_coord(indices_i, coord.xwzw); \\\n\ + for(int y = 0; y < axis_size; y ++) \\\n\ + { \\\n\ + int offset = indices_ptr[y * inner_size]; \\\n\ + if (offset == coord.y) \\\n\ + { \\\n\ + data *= conv_func(convert_float(update_ptr[y * inner_size]) \\\n\ + * update_scale + update_tail + output_zp); \\\n\ + } \\\n\ + } \\\n\ + } \\\n\ + \\\n\ + output_ptr[0] = data; \\\n\ +}\n\ +SE_MUL_AXIS1_8BITS_IMPL(U8_I32_U8toU8, uchar, convert_uchar_rte)\n\ +SE_MUL_AXIS1_8BITS_IMPL(I8_I32_I8toI8, char, convert_char)\n\ +"; /* end of scatter_elements_mul_cl*/ + static const char scatter_nd_cl[] = "__kernel void scatter_nd_U32toU32_1D(\n\ __read_only image2d_t input0,\n\ __read_only image2d_t input1,\n\ @@ -60287,6 +61427,7 @@ static const source_map_t evis_resource[] = {"argmin_axis2_vx", argmin_axis2_vx}, {"batchnorm_single_vx", batchnorm_single_vx}, {"batchnorm_single_f32_vx", batchnorm_single_f32_vx}, + {"bucketize_vx", bucketize_vx}, {"cast_vx", cast_vx}, {"clip_F16_vx", clip_F16_vx}, {"clip_I16_vx", clip_I16_vx}, @@ -60413,9 +61554,8 @@ static const source_map_t evis_resource[] = {"pre_process_gray_vx", pre_process_gray_vx}, {"pre_process_gray_2_vx", pre_process_gray_2_vx}, {"pre_process_gray_copy_vx", pre_process_gray_copy_vx}, + {"pre_process_nv12_copy_vx", pre_process_nv12_copy_vx}, {"pre_process_nv12_scale_vx", pre_process_nv12_scale_vx}, - {"pre_process_nv12_scale_8bits_vx", pre_process_nv12_scale_8bits_vx}, - {"pre_process_nv12_scale_mix_vx", pre_process_nv12_scale_mix_vx}, {"pre_process_rgb_vx", pre_process_rgb_vx}, {"pre_process_rgb888_planar_0_vx", pre_process_rgb888_planar_0_vx}, {"pre_process_rgb888_planar_1_vx", pre_process_rgb888_planar_1_vx}, @@ -60424,11 +61564,11 @@ static const source_map_t evis_resource[] = {"pre_process_rgb888_planar_sep_1_vx", pre_process_rgb888_planar_sep_1_vx}, {"pre_process_rgb888_planar_sep_2_vx", pre_process_rgb888_planar_sep_2_vx}, {"pre_process_rgb_copy_vx", pre_process_rgb_copy_vx}, - {"pre_process_yuv420_copy_u8_vx", pre_process_yuv420_copy_u8_vx}, - {"pre_process_yuv420_scale_fp16_vx", pre_process_yuv420_scale_fp16_vx}, - {"pre_process_yuv420_scale_i16_vx", pre_process_yuv420_scale_i16_vx}, - {"pre_process_yuv420_scale_i8_vx", pre_process_yuv420_scale_i8_vx}, - {"pre_process_yuv420_scale_u8_vx", pre_process_yuv420_scale_u8_vx}, + {"pre_process_yuv420_copy_vx", 
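
The scatter_elements*_cl kernels above implement ScatterElements-style updates along axis 0 or 1: the output starts as a copy of ref, and every (index, update) pair overwrites, adds to, or multiplies into the addressed element depending on the variant (the plain variant takes the first matching index and breaks; the add/mul variants keep accumulating over all matches). A compact, output-centric C reference for the 1-D (axis-0, inner_size == 1) case, for illustration only:

#include <stdio.h>

/* reduction: 0 = replace (first match wins), 1 = add, 2 = mul */
static void scatter_elements_1d(float *out, const float *ref, int ref_len,
                                const int *indices, const float *updates,
                                int n, int reduction)
{
    for (int i = 0; i < ref_len; i++)
        out[i] = ref[i];                         /* start from ref */

    for (int i = 0; i < ref_len; i++)            /* one pass per output element */
    {
        for (int k = 0; k < n; k++)
        {
            if (indices[k] != i)
                continue;
            if (reduction == 1)      out[i] += updates[k];
            else if (reduction == 2) out[i] *= updates[k];
            else { out[i] = updates[k]; break; } /* plain scatter: first hit */
        }
    }
}

int main(void)
{
    const float ref[4] = { 1.0f, 1.0f, 1.0f, 1.0f };
    const int   idx[3] = { 0, 2, 2 };
    const float upd[3] = { 5.0f, 3.0f, 4.0f };
    float out[4];

    scatter_elements_1d(out, ref, 4, idx, upd, 3, 1);        /* add reduction */
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* 6 1 8 1 */
    return 0;
}
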
pre_process_yuv420_copy_vx}, + {"pre_process_yuv420_scale_0_vx", pre_process_yuv420_scale_0_vx}, + {"pre_process_yuv420_scale_1_vx", pre_process_yuv420_scale_1_vx}, + {"pre_process_yuv422_copy_vx", pre_process_yuv422_copy_vx}, + {"pre_process_yuv422_scale_vx", pre_process_yuv422_scale_vx}, {"pre_process_yuv444_copy_u8_vx", pre_process_yuv444_copy_u8_vx}, {"pre_process_yuv444_scale_vx", pre_process_yuv444_scale_vx}, {"pre_process_yuv444_scale_fp16_vx", pre_process_yuv444_scale_fp16_vx}, @@ -60510,6 +61650,7 @@ static const source_map_t cl_resource[] = {"argmin_axis1_cl", argmin_axis1_cl}, {"argmin_axis2_cl", argmin_axis2_cl}, {"batchnorm_single_cl", batchnorm_single_cl}, + {"bucketize_cl", bucketize_cl}, {"cast_cl", cast_cl}, {"clip_BF16_cl", clip_BF16_cl}, {"clip_F32_cl", clip_F32_cl}, @@ -60549,6 +61690,7 @@ static const source_map_t cl_resource[] = {"log_softmax_axis2_cl", log_softmax_axis2_cl}, {"logical_not_cl", logical_not_cl}, {"logical_ops_cl", logical_ops_cl}, + {"lppool_cl", lppool_cl}, {"lstmunit_activation_BP_F32_cl", lstmunit_activation_BP_F32_cl}, {"lstmunit_activation_BP_U8_cl", lstmunit_activation_BP_U8_cl}, {"lstmunit_activation_B_F32_cl", lstmunit_activation_B_F32_cl}, @@ -60611,6 +61753,9 @@ static const source_map_t cl_resource[] = {"resize_bilinear_cl", resize_bilinear_cl}, {"resize_nearest_cl", resize_nearest_cl}, {"roi_align_cl", roi_align_cl}, + {"scatter_elements_cl", scatter_elements_cl}, + {"scatter_elements_add_cl", scatter_elements_add_cl}, + {"scatter_elements_mul_cl", scatter_elements_mul_cl}, {"scatter_nd_cl", scatter_nd_cl}, {"scatter_nd_update_cl", scatter_nd_update_cl}, {"select_cl", select_cl}, diff --git a/src/tim/vx/internal/src/libnnext/vsi_nn_vxkernel.c b/src/tim/vx/internal/src/libnnext/vsi_nn_vxkernel.c index 69f987a..8462aad 100644 --- a/src/tim/vx/internal/src/libnnext/vsi_nn_vxkernel.c +++ b/src/tim/vx/internal/src/libnnext/vsi_nn_vxkernel.c @@ -478,7 +478,7 @@ vsi_status vsi_nn_ClientNodePassParameters ) { vsi_status status; - uint8_t i; + uint32_t i; status = VSI_FAILURE; for( i = 0; i < num; i++ ) diff --git a/src/tim/vx/internal/src/makefile.linux b/src/tim/vx/internal/src/makefile.linux index 45e11b8..e06adcf 100644 --- a/src/tim/vx/internal/src/makefile.linux +++ b/src/tim/vx/internal/src/makefile.linux @@ -1,8 +1,207 @@ +# to make ovxlib can compile both IDE and SKD +# if you want to use IDE to compile : export USE_IDE_LIB=1 +# and VIVANTE_SDK_DIR=..../VeriSilicon/VivanteIDE5.4.0/cmdtools/vsimulator + +################################################################################### +#common parts +# OBJECTS. 
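
The resource-table hunks above register the new embedded sources (bucketize, lppool, scatter_elements, the reworked NV12/YUV420 and new YUV422 pre-process programs) by name in the evis_resource / cl_resource tables so they can be resolved when a kernel program is built; the accompanying vsi_nn_vxkernel.c change widens the parameter loop counter from uint8_t to uint32_t so it cannot wrap when more than 255 parameters are passed. A sketch of the name-to-source lookup pattern these tables support; the helper name and entries below are hypothetical, the real resolver in vsi_nn_libnnext_resource.c is not shown here:

#include <stdio.h>
#include <string.h>

/* Mirrors the source_map_t tables: a resource name paired with the
 * embedded program source string. */
typedef struct
{
    const char *name;
    const char *source;
} source_map_t;

static const source_map_t cl_resource_example[] =
{
    { "bucketize_cl",        "/* ...embedded OpenCL source... */" },
    { "lppool_cl",           "/* ...embedded OpenCL source... */" },
    { "scatter_elements_cl", "/* ...embedded OpenCL source... */" },
};

/* Hypothetical lookup helper: linear search by resource name. */
static const char *find_source_by_name(const source_map_t *map, size_t n,
                                       const char *name)
{
    for (size_t i = 0; i < n; i++)
        if (strcmp(map[i].name, name) == 0)
            return map[i].source;
    return NULL;  /* unknown kernel resource */
}

int main(void)
{
    const char *src = find_source_by_name(cl_resource_example, 3, "lppool_cl");
    printf("%s\n", src ? "found" : "missing");
    return 0;
}
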
+ +OBJECTS = $(OBJ_DIR)/vsi_nn_context.o \ + $(OBJ_DIR)/vsi_nn_client_op.o \ + $(OBJ_DIR)/vsi_nn_graph.o \ + $(OBJ_DIR)/vsi_nn_node_attr_template.o \ + $(OBJ_DIR)/vsi_nn_node.o \ + $(OBJ_DIR)/vsi_nn_ops.o \ + $(OBJ_DIR)/vsi_nn_daemon.o \ + $(OBJ_DIR)/vsi_nn_tensor.o \ + $(OBJ_DIR)/vsi_nn_version.o \ + $(OBJ_DIR)/vsi_nn_rnn.o \ + $(OBJ_DIR)/vsi_nn_rnn_helper.o \ + $(OBJ_DIR)/vsi_nn_internal_node.o \ + $(OBJ_DIR)/vsi_nn_log.o \ + $(OBJ_DIR)/vsi_nn_graph_optimization.o \ + $(OBJ_DIR)/vsi_nn_pre_post_process.o + +vpath %.c utils +OBJECTS += $(OBJ_DIR)/vsi_nn_code_generator.o \ + $(OBJ_DIR)/vsi_nn_binary_tree.o \ + $(OBJ_DIR)/vsi_nn_map.o \ + $(OBJ_DIR)/vsi_nn_link_list.o \ + $(OBJ_DIR)/vsi_nn_math.o \ + $(OBJ_DIR)/vsi_nn_dtype_util.o \ + $(OBJ_DIR)/vsi_nn_shape_util.o \ + $(OBJ_DIR)/vsi_nn_dtype.o \ + $(OBJ_DIR)/vsi_nn_limits.o \ + $(OBJ_DIR)/vsi_nn_vdata.o \ + $(OBJ_DIR)/vsi_nn_util.o \ + $(OBJ_DIR)/vsi_nn_dlfcn.o \ + $(OBJ_DIR)/vsi_nn_constraint_check.o \ + $(OBJ_DIR)/vsi_nn_hashmap.o \ + $(OBJ_DIR)/vsi_nn_tensor_op.o + +vpath %.c quantization +OBJECTS += $(OBJ_DIR)/vsi_nn_dynamic_fixed_point.o \ + $(OBJ_DIR)/vsi_nn_asymmetric_affine.o \ + $(OBJ_DIR)/vsi_nn_perchannel_symmetric_affine.o + +vpath %.c pycc +OBJECTS += $(OBJ_DIR)/vsi_pycc_interface.o + +vpath %.c post +OBJECTS += $(OBJ_DIR)/vsi_nn_post_fasterrcnn.o \ + $(OBJ_DIR)/vsi_nn_post_cmupose.o + +vpath %.c libnnext +OBJECTS += $(OBJ_DIR)/vsi_nn_libnnext_resource.o \ + $(OBJ_DIR)/vsi_nn_vxkernel.o + +vpath %.c cpu_backend +SRCS += ${notdir ${wildcard cpu_backend/*.c}} + +vpath %.c libnnext/ops/kernel +SRCS += ${notdir ${wildcard libnnext/ops/kernel/*.c}} + +vpath %.c ops +SRCS += ${notdir ${wildcard ops/*.c}} + +vpath %.c kernel +SRCS += ${notdir ${wildcard kernel/*.c}} + +vpath %.c kernel/cl +SRCS += ${notdir ${wildcard kernel/cl/*.c}} + +vpath %.c kernel/cpu +SRCS += ${notdir ${wildcard kernel/cpu/*.c}} + +vpath %.c kernel/evis +SRCS += ${notdir ${wildcard kernel/evis/*.c}} + +vpath %.c kernel/vx +SRCS += ${notdir ${wildcard kernel/vx/*.c}} + +vpath %.c kernel/sp +SRCS += ${notdir ${wildcard kernel/sp/*.c}} + +vpath %.c custom/ops +SRCS += ${notdir ${wildcard custom/ops/*.c}} + +vpath %.c custom/ops/kernel/evis +SRCS += ${notdir ${wildcard custom/ops/kernel/evis/*.c}} + +vpath %.c custom/ops/kernel/cl +SRCS += ${notdir ${wildcard custom/ops/kernel/cl/*.c}} + +vpath %.c custom/ops/kernel/cpu +SRCS += ${notdir ${wildcard custom/ops/kernel/cpu/*.c}} + +vpath %.c custom/ops/kernel/sp +SRCS += ${notdir ${wildcard custom/ops/kernel/sp/*.c}} + +OBJECTS += ${patsubst %.c, $(OBJ_DIR)/%.o, $(SRCS)} + +ifeq ($(USE_VIP_DEVICE),1) +vpath %.cpp vip +OBJECTS += $(OBJ_DIR)/virtual_device.o +endif + +################################################################################ +ifeq ($(USE_IDE_LIB),1) +# IDE. + +CC=$(CROSS_COMPILE)gcc + +INCLUDES=-I. 
-I$(VIVANTE_SDK_DIR)/include/ \ + -I$(VIVANTE_SDK_DIR)/include/CL \ + -I$(VIVANTE_SDK_DIR)/include/VX \ + -I../include/ops -I../include/utils -I../include/inference \ + -I../include/client -I../include -I../include/libnnext \ + -I../include/cpu_backend + +ifeq (1,$(DEBUG)) +CFLAGS+=-g +LFLAGS+=-g +else +CFLAGS+=-O3 +LFLAGS+=-O3 +endif +CFLAGS += $(INCLUDES) +CFLAGS += -Wall -Wextra -Wno-unused-parameter -Wno-sign-compare -Werror -Wno-strict-aliasing -Wno-maybe-uninitialized +CFLAGS += -fvisibility=hidden -D'OVXLIB_API=__attribute__((visibility("default")))' + +LIBS+= -L$(VIVANTE_SDK_DIR)/lib \ + -lOpenVX -lOpenVXU -lCLC -lVSC -lGAL -lEmulator -lvdtproxy -lArchModelSw -lNNArchPerf +LIBS+= -L$(VIVANTE_SDK_DIR)/lib/vsim \ + -lOpenVX -lOpenVXU -lCLC -lVSC -lGAL -lEmulator -lvdtproxy +LIBS+= -L$(VIVANTE_SDK_DIR)/lib/x64_linux \ + -lOpenVX -lOpenVXU -lCLC -lVSC -lGAL -lEmulator -lvdtproxy +LIBS+= -L$(VIVANTE_SDK_DIR)/lib/x64_linux/vsim \ + -lOpenVX -lOpenVXU -lCLC -lVSC -lGAL -lEmulator -lvdtproxy +LIBS+= -L$(VIVANTE_SDK_DIR)/lib/x64_linux/vsim \ + -lOpenVX -lOpenVXU -lCLC -lVSC -lGAL -lEmulator -lvdtproxy +LIBS+= -L$(VIVANTE_SDK_DIR)/../common/lib/ \ + -lvdtproxy +LIBS += -lm -ldl + +File = $(VIVANTE_SDK_DIR)/lib/libjpeg.a +File2 = $(VIVANTE_SDK_DIR)/lib/x64_linux/libjpeg.a +File3 = $(VIVANTE_SDK_DIR)/../common/lib/libjpeg.a +ifeq ($(File),$(wildcard $(File))) +LIBS+= $(File) +else ifeq ($(File2),$(wildcard $(File2))) +LIBS+= $(File2) +else +LIBS+= $(File3) +endif + +################################################################################### +# Macros. +CFLAGS += -fPIC +DYNAMIC := 1 +TARGET_NAME = libovxlib.so +OBJ_DIR=bin_r +TARGET_OUTPUT = $(OBJ_DIR)/$(TARGET_NAME) + +all: $(TARGET_OUTPUT) +clean: + @rm -rf $(OBJ_DIR)/* $(OBJ_DIR) + +install: $(TARGET_OUTPUT) + +################################################################################ + +LDFLAGS += -Wall -shared -Wl,-soname,$(TARGET_NAME) -Wl,-z,defs -fPIC + +ifeq ($(USE_VIP_DEVICE),1) +LDFLAGS += -pthread +LIBS += -lstdc++ +INCLUDE += -I../include/vip +$(OBJ_DIR)/virtual_device.o: virtual_device.cpp + @echo " COMPILE $(abspath $<)" + @mkdir -p $(OBJ_DIR) + @$(CXX) -c -std=c++14 -pthread $(CFLAGS) -o $@ $< +endif + +$(TARGET_OUTPUT): $(OBJECTS) + @echo " LINK \033[1m$(notdir $@)\033[0m" + @$(CC) $(LDFLAGS) $(OBJECTS) -o $(TARGET_OUTPUT) $(LIBS) + +$(OBJ_DIR)/%.o: %.c + @echo " COMPILE $(abspath $<)" + @mkdir -p $(OBJ_DIR) + @$(CC) -c $(CFLAGS) -o $@ $< + +else +################################################################################## +#SDK. + +# include common definition. include $(AQROOT)/makefile.linux.def +################################################################################# INCLUDE += -I$(VIVANTE_SDK_INC) -I$(VIVANTE_SDK_INC)/HAL -I$(AQROOT)/sdk/inc INCLUDE += -I../include/ops -I../include/utils -I../include/inference INCLUDE += -I../include/client -I../include -I../include/libnnext +INCLUDE += -I../include/cpu_backend CFLAGS += $(INCLUDE) CFLAGS += -Wall -Wextra -Wno-unused-parameter -Wno-sign-compare -Werror @@ -43,89 +242,6 @@ ifneq ($(gcdSTATIC_LINK), 1) endif endif ############################################################################# -# Objects. 
-OBJECTS = $(OBJ_DIR)/vsi_nn_context.o \ - $(OBJ_DIR)/vsi_nn_client_op.o \ - $(OBJ_DIR)/vsi_nn_graph.o \ - $(OBJ_DIR)/vsi_nn_node_attr_template.o \ - $(OBJ_DIR)/vsi_nn_node.o \ - $(OBJ_DIR)/vsi_nn_ops.o \ - $(OBJ_DIR)/vsi_nn_daemon.o \ - $(OBJ_DIR)/vsi_nn_tensor.o \ - $(OBJ_DIR)/vsi_nn_version.o \ - $(OBJ_DIR)/vsi_nn_rnn.o \ - $(OBJ_DIR)/vsi_nn_rnn_helper.o \ - $(OBJ_DIR)/vsi_nn_internal_node.o \ - $(OBJ_DIR)/vsi_nn_log.o \ - $(OBJ_DIR)/vsi_nn_graph_optimization.o \ - $(OBJ_DIR)/vsi_nn_pre_post_process.o - -vpath %.c utils -OBJECTS += $(OBJ_DIR)/vsi_nn_code_generator.o \ - $(OBJ_DIR)/vsi_nn_binary_tree.o \ - $(OBJ_DIR)/vsi_nn_map.o \ - $(OBJ_DIR)/vsi_nn_link_list.o \ - $(OBJ_DIR)/vsi_nn_math.o \ - $(OBJ_DIR)/vsi_nn_dtype_util.o \ - $(OBJ_DIR)/vsi_nn_shape_util.o \ - $(OBJ_DIR)/vsi_nn_dtype.o \ - $(OBJ_DIR)/vsi_nn_limits.o \ - $(OBJ_DIR)/vsi_nn_vdata.o \ - $(OBJ_DIR)/vsi_nn_util.o \ - $(OBJ_DIR)/vsi_nn_constraint_check.o \ - $(OBJ_DIR)/vsi_nn_hashmap.o \ - $(OBJ_DIR)/vsi_nn_tensor_op.o - -vpath %.c quantization -OBJECTS += $(OBJ_DIR)/vsi_nn_dynamic_fixed_point.o \ - $(OBJ_DIR)/vsi_nn_asymmetric_affine.o \ - $(OBJ_DIR)/vsi_nn_perchannel_symmetric_affine.o - -vpath %.c pycc -OBJECTS += $(OBJ_DIR)/vsi_pycc_interface.o - -vpath %.c post -OBJECTS += $(OBJ_DIR)/vsi_nn_post_fasterrcnn.o \ - $(OBJ_DIR)/vsi_nn_post_cmupose.o - -vpath %.c libnnext -OBJECTS += $(OBJ_DIR)/vsi_nn_libnnext_resource.o \ - $(OBJ_DIR)/vsi_nn_vxkernel.o - -vpath %.c libnnext/ops/kernel -SRCS += ${notdir ${wildcard libnnext/ops/kernel/*.c}} - -vpath %.c ops -SRCS += ${notdir ${wildcard ops/*.c}} - -vpath %.c kernel -SRCS += ${notdir ${wildcard kernel/*.c}} - -vpath %.c kernel/cl -SRCS += ${notdir ${wildcard kernel/cl/*.c}} - -vpath %.c kernel/cpu -SRCS += ${notdir ${wildcard kernel/cpu/*.c}} - -vpath %.c kernel/evis -SRCS += ${notdir ${wildcard kernel/evis/*.c}} - -vpath %.c kernel/vx -SRCS += ${notdir ${wildcard kernel/vx/*.c}} - -vpath %.c custom/ops -SRCS += ${notdir ${wildcard custom/ops/*.c}} - -vpath %.c custom/ops/kernel/evis -SRCS += ${notdir ${wildcard custom/ops/kernel/evis/*.c}} - -vpath %.c custom/ops/kernel/cl -SRCS += ${notdir ${wildcard custom/ops/kernel/cl/*.c}} - -vpath %.c custom/ops/kernel/cpu -SRCS += ${notdir ${wildcard custom/ops/kernel/cpu/*.c}} - -OBJECTS += ${patsubst %.c, $(OBJ_DIR)/%.o, $(SRCS)} # installation directory INSTALL_DIR := $(VIVANTE_SDK_LIB) @@ -133,4 +249,15 @@ INSTALL_DIR := $(VIVANTE_SDK_LIB) ################################################################################ # Include the common makefile. 
+ifeq ($(USE_VIP_DEVICE),1)
+LDFLAGS += -pthread
+LIBS += -lstdc++
+INCLUDE += -I../include/vip
+$(OBJ_DIR)/virtual_device.o: virtual_device.cpp
+	@echo " COMPILE $(abspath $<)"
+	@mkdir -p $(OBJ_DIR)
+	@$(CXX) -c -std=c++14 -pthread $(CFLAGS) -o $@ $<
+endif
+
 include $(AQROOT)/common.target
+endif
diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_batch2space.c b/src/tim/vx/internal/src/ops/vsi_nn_op_batch2space.c
index 24a3d14..fba4d05 100644
--- a/src/tim/vx/internal/src/ops/vsi_nn_op_batch2space.c
+++ b/src/tim/vx/internal/src/ops/vsi_nn_op_batch2space.c
@@ -34,7 +34,7 @@
 #include "utils/vsi_nn_util.h"
 #include "utils/vsi_nn_dtype_util.h"
 #include "utils/vsi_nn_math.h"
-#include "vsi_nn_test.h"
+#include "vsi_nn_error.h"
 #include "utils/vsi_nn_constraint_check.h"
 
 static vsi_status op_compute
@@ -48,9 +48,39 @@ static vsi_status op_compute
     vx_nn_reorg_params_ext_t param;
     vsi_nn_tensor_t *block_size_tensor = NULL;
     vsi_nn_tensor_t *pad_tensor = NULL;
+    vsi_nn_tensor_t *input_tensor = NULL;
+    vsi_nn_tensor_t *output_tensor = NULL;
     vsi_nn_tensor_attr_t attr;
 
-    memset(&param, 0, sizeof(vx_nn_reorg_params_ext_t));
+    int32_t block_size[2] = {1, 1};
+    vsi_bool need_release_tensor = TRUE;
+    block_size[0] = self->nn_param.batch2space.block_size[0];
+    if (vsi_nn_is_3d_tensor(inputs[0]))
+    {
+        vsi_size_t shape[2][VSI_NN_MAX_DIM_NUM] = {{1}};
+        memcpy(shape[0], inputs[0]->attr.size, sizeof(shape[0]));
+        memcpy(shape[1], outputs[0]->attr.size, sizeof(shape[1]));
+        shape[0][3] = shape[0][2];
+        shape[0][2] = shape[0][1];
+        shape[0][1] = 1;
+        shape[1][3] = shape[1][2];
+        shape[1][2] = shape[1][1];
+        shape[1][1] = 1;
+
+        input_tensor = vsi_nn_reshape_tensor(self->graph, inputs[0], shape[0], 4);
+        CHECK_PTR_FAIL_GOTO( input_tensor, "create tensor fail.", final );
+        output_tensor = vsi_nn_reshape_tensor(self->graph, outputs[0], shape[1], 4);
+        CHECK_PTR_FAIL_GOTO( output_tensor, "create tensor fail.", final );
+    }
+    else
+    {
+        block_size[1] = self->nn_param.batch2space.block_size[1];
+        need_release_tensor = FALSE;
+        input_tensor = inputs[0];
+        output_tensor = outputs[0];
+    }
+
+    memset(&param, 0, sizeof(vx_nn_reorg_params_ext_t));
     memset(&attr, 0, sizeof(attr));
     attr.size[0] = 2;
     attr.dim_num = 1;
@@ -59,9 +89,9 @@ static vsi_status op_compute
     attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
     block_size_tensor = vsi_nn_CreateTensorFromData(
         self->graph,
-        (uint8_t *)self->nn_param.batch2space.block_size,
+        (uint8_t *)block_size,
         &attr);
-    TEST_CHECK_PTR(block_size_tensor, final);
+    CHECK_PTR_FAIL_GOTO( block_size_tensor, "create tensor fail.", final );
 
     memset(&attr, 0, sizeof(attr));
     attr.size[0] = 4;
@@ -73,16 +103,16 @@ static vsi_status op_compute
         self->graph,
         (uint8_t *)self->nn_param.batch2space.crop,
         &attr);
-    TEST_CHECK_PTR(pad_tensor, final);
+    CHECK_PTR_FAIL_GOTO( pad_tensor, "create tensor fail.", final );
 
     param.base.block_size = REQUIRED_IO(block_size_tensor);
     param.pad = OPTIONAL_IO(pad_tensor);
     param.base.type = VX_REORG_BATCH_TO_SPACE_ND;
     self->n = vxReorgLayer2( self->graph->g,
-        inputs[0]->t,
+        input_tensor->t,
         (vx_nn_reorg_params_t *)&param,
         sizeof(vx_nn_reorg_params_ext_t),
-        outputs[0]->t);
+        output_tensor->t);
 
     if( NULL != self->n )
     {
@@ -90,8 +120,13 @@ static vsi_status op_compute
     }
 
 final:
-    if (block_size_tensor) vsi_nn_ReleaseTensor(&block_size_tensor);
-    if (pad_tensor) vsi_nn_ReleaseTensor(&pad_tensor);
+    if (need_release_tensor)
+    {
+        vsi_safe_release_tensor(input_tensor);
+        vsi_safe_release_tensor(output_tensor);
+    }
+    vsi_safe_release_tensor(block_size_tensor);
+    
vsi_safe_release_tensor(pad_tensor); return status; } /* op_compute() */ @@ -105,14 +140,13 @@ static vsi_bool op_check { vsi_bool ret = FALSE; - if (inputs[0]->attr.dim_num != 4) + if (inputs[0]->attr.dim_num < 3) { - VSILOGE("batch2space only support 4D"); + VSILOGE("The input tensor shape must be 3D or 4D!"); return FALSE; } - if (self->nn_param.batch2space.block_size[0] < 0 - || self->nn_param.batch2space.block_size[1] < 0) + if (self->nn_param.batch2space.block_size[0] < 0) { VSILOGE("Block size can't be less than zero in batch to space"); return FALSE; @@ -131,18 +165,33 @@ static vsi_bool op_setup ) { vsi_nn_batch2space_param * p; + p = (vsi_nn_batch2space_param *)&(self->nn_param.batch2space); - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { - outputs[0]->attr.size[3] = - inputs[0]->attr.size[3] / p->block_size[0] / p->block_size[1]; - outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; - outputs[0]->attr.size[1] = - inputs[0]->attr.size[1] * p->block_size[1] - p->crop[2] - p->crop[3]; - outputs[0]->attr.size[0] = - inputs[0]->attr.size[0] * p->block_size[0] - p->crop[0] - p->crop[1]; - outputs[0]->attr.dim_num = 4; + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + + if (vsi_nn_is_3d_tensor(inputs[0])) + { + outputs[0]->attr.size[2] = + inputs[0]->attr.size[2] / p->block_size[0]; + outputs[0]->attr.size[1] = inputs[0]->attr.size[1]; + outputs[0]->attr.size[0] = + inputs[0]->attr.size[0] * p->block_size[0] - p->crop[0] - p->crop[1]; + } + else + { + outputs[0]->attr.size[3] = + inputs[0]->attr.size[3] / p->block_size[0] / p->block_size[1]; + outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; + outputs[0]->attr.size[1] = + inputs[0]->attr.size[1] * p->block_size[1] - p->crop[2] - p->crop[3]; + outputs[0]->attr.size[0] = + inputs[0]->attr.size[0] * p->block_size[0] - p->crop[0] - p->crop[1]; + } + + } return TRUE; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_rnn.c b/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_rnn.c index 8f81613..da6af26 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_rnn.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_rnn.c @@ -53,6 +53,7 @@ static vsi_bool setup_op_shapes vsi_size_t num_units = 0; vsi_size_t output_size = 0; vsi_size_t batch_size = 0; + vsi_bool use_virtual_tensor = TRUE; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); if( curr_param->time_major ) @@ -94,6 +95,28 @@ static vsi_bool setup_op_shapes inputs[BI_RNN_BW_INPUT_H_STATE] = output_tensor->t; } + if( !outputs[BI_RNN_FW_OUTPUT_H_STATE] ) + { + memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); + attr.dim_num = VSI_NN_DIM_AUTO; + memcpy( &attr.dtype, &outputs[BI_RNN_FW_OUTPUT_OUTPUT]->attr.dtype, sizeof( attr.dtype ) ); + attr.vtl = use_virtual_tensor; + attr.is_const = FALSE; + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + outputs[BI_RNN_FW_OUTPUT_H_STATE] = output_tensor->t; + } + + if( !outputs[BI_RNN_BW_OUTPUT_H_STATE] ) + { + memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); + attr.dim_num = VSI_NN_DIM_AUTO; + memcpy( &attr.dtype, &outputs[BI_RNN_BW_OUTPUT_OUTPUT]->attr.dtype, sizeof( attr.dtype ) ); + attr.vtl = use_virtual_tensor; + attr.is_const = FALSE; + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + outputs[BI_RNN_BW_OUTPUT_H_STATE] = output_tensor->t; + } + /* output */ if( VSI_NN_DIM_AUTO == outputs[BI_RNN_FW_OUTPUT_OUTPUT]->attr.dim_num ) { @@ -118,6 +141,26 
@@ static vsi_bool setup_op_shapes } } + /* output_state_out */ + if(VSI_NN_DIM_AUTO == outputs[BI_RNN_FW_OUTPUT_H_STATE]->attr.dim_num) + { + if( curr_param->merge_outputs ) + { + outputs[BI_RNN_FW_OUTPUT_H_STATE]->attr.size[0] = output_size*2; + outputs[BI_RNN_FW_OUTPUT_H_STATE]->attr.size[1] = batch_size; + outputs[BI_RNN_FW_OUTPUT_H_STATE]->attr.dim_num = 2; + } + else + { + outputs[BI_RNN_FW_OUTPUT_H_STATE]->attr.size[0] = output_size; + outputs[BI_RNN_FW_OUTPUT_H_STATE]->attr.size[1] = batch_size; + outputs[BI_RNN_FW_OUTPUT_H_STATE]->attr.dim_num = 2; + + outputs[BI_RNN_BW_OUTPUT_H_STATE]->attr.size[0] = output_size; + outputs[BI_RNN_BW_OUTPUT_H_STATE]->attr.size[1] = batch_size; + outputs[BI_RNN_BW_OUTPUT_H_STATE]->attr.dim_num = 2; + } + } return TRUE; } @@ -292,10 +335,36 @@ static vsi_bool op_setup /* rnncell output h_state */ vsi_nn_internal_init_tensor_attr(&attr, - &outputs[BI_RNN_FW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); + &outputs[BI_RNN_FW_OUTPUT_H_STATE]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); rnncell_out1 = output_tensor->t; + if (reshape_output_tensors[time_step - 1 - i]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 && + inputs[BI_RNN_BW_INPUT_WEIGHT_I]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 && + curr_param->internal_dtype[RNNCELL_QUANTIZE_PARAM_I].qnt_type == VSI_NN_QNT_TYPE_NONE && + curr_param->internal_dtype[RNNCELL_QUANTIZE_PARAM_I].vx_type == VSI_NN_TYPE_NONE) + { + curr_param->internal_dtype[RNNCELL_QUANTIZE_PARAM_I].vx_type = VSI_NN_TYPE_FLOAT32; + } + + if (last_step_h_state_fw && + last_step_h_state_fw->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 && + inputs[BI_RNN_BW_INPUT_WEIGHT_H]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 && + curr_param->internal_dtype[RNNCELL_QUANTIZE_PARAM_H].qnt_type == VSI_NN_QNT_TYPE_NONE && + curr_param->internal_dtype[RNNCELL_QUANTIZE_PARAM_H].vx_type == VSI_NN_TYPE_NONE) + { + curr_param->internal_dtype[RNNCELL_QUANTIZE_PARAM_H].vx_type = VSI_NN_TYPE_FLOAT32; + } + + if (has_aux_input&& + aux_reshape_output_tensors[time_step - 1 - i]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 && + inputs[BI_RNN_BW_AUX_INPUT_WEIGHT]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 && + curr_param->internal_dtype[RNNCELL_QUANTIZE_PARAM_AUX].qnt_type == VSI_NN_QNT_TYPE_NONE && + curr_param->internal_dtype[RNNCELL_QUANTIZE_PARAM_AUX].vx_type == VSI_NN_TYPE_NONE) + { + curr_param->internal_dtype[RNNCELL_QUANTIZE_PARAM_AUX].vx_type = VSI_NN_TYPE_FLOAT32; + } + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RNNCELL_OVXLIB, 0, 0 ); curr->node->nn_param.rnncell_ovxlib.activation = curr_param->activation; memcpy( curr->node->nn_param.rnncell_ovxlib.internal_dtype, @@ -307,8 +376,8 @@ static vsi_bool op_setup curr->inputs[RNNCELL_INPUT_WEIGHT_I] = inputs[BI_RNN_FW_INPUT_WEIGHT_I]; curr->inputs[RNNCELL_INPUT_WEIGHT_H] = inputs[BI_RNN_FW_INPUT_WEIGHT_H]; - curr->inputs[RNNCELL_INPUT_BIAS] = inputs[BI_RNN_FW_INPUT_BIAS]; - + curr->inputs[RNNCELL_INPUT_BIAS_I] = inputs[BI_RNN_FW_INPUT_BIAS_I]; + curr->inputs[RNNCELL_INPUT_BIAS_H] = inputs[BI_RNN_FW_INPUT_BIAS_H]; if (has_aux_input) { curr->inputs[RNNCELL_INPUT_AUX_INPUT] = aux_reshape_output_tensors[i]; @@ -348,23 +417,49 @@ static vsi_bool op_setup /* rnncell output h_state */ vsi_nn_internal_init_tensor_attr(&attr, - &outputs[BI_RNN_BW_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); + &outputs[BI_RNN_BW_OUTPUT_H_STATE]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); rnncell_out1 = 
output_tensor->t; + if (reshape_output_tensors[time_step - 1 - i]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 && + inputs[BI_RNN_BW_INPUT_WEIGHT_I]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 && + curr_param->internal_dtype[RNNCELL_QUANTIZE_PARAM_I].qnt_type == VSI_NN_QNT_TYPE_NONE && + curr_param->internal_dtype[RNNCELL_QUANTIZE_PARAM_I].vx_type == VSI_NN_TYPE_NONE) + { + curr_param->internal_dtype[RNNCELL_QUANTIZE_PARAM_I].vx_type = VSI_NN_TYPE_FLOAT32; + } + + if (last_step_h_state_bw && + last_step_h_state_bw->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 && + inputs[BI_RNN_BW_INPUT_WEIGHT_H]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 && + curr_param->internal_dtype[RNNCELL_QUANTIZE_PARAM_H].qnt_type == VSI_NN_QNT_TYPE_NONE && + curr_param->internal_dtype[RNNCELL_QUANTIZE_PARAM_H].vx_type == VSI_NN_TYPE_NONE) + { + curr_param->internal_dtype[RNNCELL_QUANTIZE_PARAM_H].vx_type = VSI_NN_TYPE_FLOAT32; + } + + if (has_aux_input&& + aux_reshape_output_tensors[time_step - 1 - i]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 && + inputs[BI_RNN_BW_AUX_INPUT_WEIGHT]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 && + curr_param->internal_dtype[RNNCELL_QUANTIZE_PARAM_AUX].qnt_type == VSI_NN_QNT_TYPE_NONE && + curr_param->internal_dtype[RNNCELL_QUANTIZE_PARAM_AUX].vx_type == VSI_NN_TYPE_NONE) + { + curr_param->internal_dtype[RNNCELL_QUANTIZE_PARAM_AUX].vx_type = VSI_NN_TYPE_FLOAT32; + } + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RNNCELL_OVXLIB, 0, 0 ); curr->node->nn_param.rnncell_ovxlib.activation = curr_param->activation; memcpy( curr->node->nn_param.rnncell_ovxlib.internal_dtype, - &(curr_param->internal_dtype[RNNCELL_QUANTIZE_PARAM_COUNT]), - sizeof(vsi_nn_dtype_t) * RNNCELL_QUANTIZE_PARAM_COUNT); + curr_param->internal_dtype, + sizeof(vsi_nn_dtype_t) * RNNCELL_QUANTIZE_PARAM_COUNT); curr->inputs[RNNCELL_INPUT_INPUT] = reshape_output_tensors[time_step - 1 - i]; curr->inputs[RNNCELL_INPUT_H_STATE] = last_step_h_state_bw; curr->inputs[RNNCELL_INPUT_WEIGHT_I] = inputs[BI_RNN_BW_INPUT_WEIGHT_I]; curr->inputs[RNNCELL_INPUT_WEIGHT_H] = inputs[BI_RNN_BW_INPUT_WEIGHT_H]; - curr->inputs[RNNCELL_INPUT_BIAS] = inputs[BI_RNN_BW_INPUT_BIAS]; - + curr->inputs[RNNCELL_INPUT_BIAS_I] = inputs[BI_RNN_BW_INPUT_BIAS_I]; + curr->inputs[RNNCELL_INPUT_BIAS_H] = inputs[BI_RNN_BW_INPUT_BIAS_H]; if(has_aux_input) { curr->inputs[RNNCELL_INPUT_AUX_INPUT] = aux_reshape_output_tensors[time_step - 1 - i]; @@ -454,6 +549,15 @@ static vsi_bool op_setup tensor = output_tensor->t; } + /* forward output state*/ + if (outputs[BI_RNN_FW_OUTPUT_H_STATE] != NULL) + { + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + curr->inputs[0] = last_step_h_state_fw; + curr->outputs[0] = outputs[BI_RNN_FW_OUTPUT_H_STATE]; + vsi_nn_internal_setup_node(self, curr); + } + /* concat rnncell output, the rnn's output is 3-dims */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, (uint32_t)time_step, 1 ); curr->node->nn_param.concat.axis = 2; @@ -482,6 +586,15 @@ static vsi_bool op_setup tensor = output_tensor->t; } + /* backward output state*/ + if (outputs[BI_RNN_BW_OUTPUT_H_STATE] != NULL) + { + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + curr->inputs[0] = last_step_h_state_bw; + curr->outputs[0] = outputs[BI_RNN_BW_OUTPUT_H_STATE]; + vsi_nn_internal_setup_node(self, curr); + } + /* concat rnncell output, the rnn's output is 3-dims */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, (uint32_t)time_step, 1 ); curr->node->nn_param.concat.axis = 2; diff --git 
a/src/tim/vx/internal/src/ops/vsi_nn_op_bucketize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_bucketize.c new file mode 100644 index 0000000..cac99d0 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_bucketize.c @@ -0,0 +1,208 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "utils/vsi_nn_math.h" +#include "utils/vsi_nn_util.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" +#include "utils/vsi_nn_constraint_check.h" + +typedef struct _bucketize_local_data_t { + int32_t placeholder; +} bucketize_local_data_t; + +/* + Declare number of input and output. 
+ */
+#define _INPUT_NUM (1)
+#define _OUTPUT_NUM (1)
+
+static vsi_status op_compute
+    (
+    vsi_nn_node_t * self,
+    vsi_nn_tensor_t ** inputs,
+    vsi_nn_tensor_t ** outputs
+    )
+{
+    vsi_status status = VSI_FAILURE;
+    int32_t right = self->nn_param.bucketize.right;
+    vsi_nn_tensor_t* reshape_tensors[3] = { NULL };
+    vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 };
+    vsi_size_t new_rank = 0;
+    vsi_bool ret = TRUE;
+    vsi_nn_kernel_param_t * param = NULL;
+
+    param = vsi_nn_kernel_param_create();
+
+    vsi_nn_kernel_param_add_int32( param, "right", right );
+
+    ret = vsi_nn_kernel_optimize_element_shape(
+            inputs[0]->attr.size, inputs[0]->attr.dim_num,
+            shape, &new_rank );
+
+    if ( ret )
+    {
+        reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph,
+                inputs[0], shape, new_rank );
+        reshape_tensors[2] = vsi_nn_reshape_tensor( self->graph,
+                outputs[0], shape, new_rank );
+        shape[0] = inputs[1]->attr.size[0];
+        shape[1] = 1;
+        reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph,
+                inputs[1], shape, 2 );
+
+        self->n = (vx_node)vsi_nn_kernel_selector( self->graph,
+                "bucketize",
+                &reshape_tensors[0], 2,
+                &reshape_tensors[2], 1, param );
+
+        vsi_safe_release_tensor( reshape_tensors[0] );
+        vsi_safe_release_tensor( reshape_tensors[1] );
+        vsi_safe_release_tensor( reshape_tensors[2] );
+    }
+
+    if ( self->n )
+    {
+        status = VSI_SUCCESS;
+    }
+
+    vsi_nn_kernel_param_release( &param );
+
+    return status;
+} /* op_compute() */
+
+static vsi_bool op_check
+    (
+    vsi_nn_node_t * self,
+    vsi_nn_tensor_t ** inputs,
+    vsi_nn_tensor_t ** outputs
+    )
+{
+    BEGIN_IO_TYPE_DECL(BUCKETIZE, 2, 1)
+        IO_TYPE(D_U32, D_U32, D_I32)
+        IO_TYPE(D_I32, D_I32, D_I32)
+        IO_TYPE(D_F32, D_F32, D_I32)
+        IO_TYPE(D_F16, D_F16, D_I32)
+        IO_TYPE(D_BF16, D_BF16, D_I32)
+        IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM, D_I32)
+        IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM, D_I32)
+        IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32)
+        IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32)
+        IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I32)
+        IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I32)
+        IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32)
+    END_IO_TYPE_DECL(BUCKETIZE)
+    if (!VALIDATE_OP_IO_TYPES(BUCKETIZE, self, inputs, self->input.num, outputs, self->output.num))
+    {
+        char* desc = generate_op_io_types_desc(inputs,
+                self->input.num, outputs, self->output.num);
+        VSILOGE("Inputs/Outputs data type not support: %s", desc);
+        destroy_op_io_types_desc(desc);
+        return FALSE;
+    }
+
+    return TRUE;
+} /* op_check() */
+
+static vsi_bool op_setup
+    (
+    vsi_nn_node_t * self,
+    vsi_nn_tensor_t ** inputs,
+    vsi_nn_tensor_t ** outputs
+    )
+{
+    uint32_t i, out_rank;
+    vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 };
+    vsi_bool ret = TRUE;
+
+    out_rank = inputs[0]->attr.dim_num;
+
+    for (i = 0; i < out_rank; i++)
+    {
+        shape[i] = inputs[0]->attr.size[i];
+    }
+    if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num )
+    {
+        outputs[0]->attr.dim_num = out_rank;
+        memcpy( outputs[0]->attr.size, shape, out_rank * sizeof(vsi_size_t) );
+    }
+    else
+    {
+        vsi_size_t total_size_got;
+        vsi_size_t total_size_expected;
+        total_size_expected = vsi_nn_ShapeProduct( shape, out_rank );
+        total_size_got = vsi_nn_ShapeProduct( outputs[0]->attr.size,
+                outputs[0]->attr.dim_num );
+        if ( total_size_expected != total_size_got )
+        {
+            VSILOGW("Output size mismatch, expect %"VSI_SIZE_T_SPECIFIER", but got %"VSI_SIZE_T_SPECIFIER"",
+                    total_size_expected, total_size_got);
+            ret = FALSE;
+        }
+    }
+
+    return ret;
+} /* op_setup() */
+
+static vsi_status op_init
+    (
+    vsi_nn_node_t* self
+    )
+{
+    self->nn_param.bucketize.right = FALSE;
+
+    return VSI_SUCCESS;
+} /* 
op_init() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ BUCKETIZE, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c b/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c index b2b01f5..4f55660 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c @@ -35,11 +35,11 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "libnnext/vsi_nn_vxkernel.h" #include "kernel/vsi_nn_kernel.h" #include "vsi_nn_internal_node.h" #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" #include "utils/vsi_nn_constraint_check.h" +#include "utils/vsi_nn_dtype_util.h" #define _INPUT_NUM (1) #define _OUTPUT_NUM (1) @@ -65,14 +65,24 @@ static vsi_status op_compute vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; vsi_size_t new_rank = 0; - vsi_bool ret; + vsi_bool ret = TRUE; vsi_nn_kernel_param_t * param = NULL; param = vsi_nn_kernel_param_create(); - ret = vsi_nn_kernel_optimize_element_shape( - inputs[0]->attr.size, inputs[0]->attr.dim_num, - shape, &new_rank ); + if ( vsi_nn_TypeGetBits(inputs[0]->attr.dtype.vx_type) == 4 || + vsi_nn_TypeGetBits(outputs[0]->attr.dtype.vx_type) == 4 ) + { + new_rank = inputs[0]->attr.dim_num; + memcpy(shape, inputs[0]->attr.size, sizeof(inputs[0]->attr.size)); + } + else + { + ret = vsi_nn_kernel_optimize_element_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, + shape, &new_rank ); + } + vsi_nn_kernel_param_add_float32( param, "min_value", min_value ); vsi_nn_kernel_param_add_float32( param, "max_value", max_value ); @@ -154,8 +164,11 @@ static vsi_bool op_check /* HW 9.1.1 */ IO_TYPE(D_U4|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_I4|Q_SYM) IO_TYPE(D_U4|Q_SYM, D_U4|Q_SYM) IO_TYPE(D_I4|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_I4|Q_ASYM, D_U4|Q_ASYM) IO_TYPE(D_I4|Q_SYM, D_I4|Q_SYM) END_IO_TYPE_DECL(CLIP) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv1d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv1d.c index 5ebe3cf..3b2cf21 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv1d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv1d.c @@ -119,6 +119,7 @@ static vsi_status op_compute vsi_nn_kernel_param_add_int32( param, "rounding_policy", self->vx_param.rounding_policy ); vsi_nn_kernel_param_add_int32( param, "down_scale_size_rounding", self->vx_param.down_scale_size_rounding ); + vsi_nn_kernel_param_add_int32( param, "pad_mode", vsi_nn_get_vx_pad_mode( self->nn_param.conv1d.pad_mode ) ); self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "conv1d_ovxlib", new_inputs, 3, outputs, 1, param ); @@ -136,6 +137,7 @@ static vsi_status op_compute vsi_nn_kernel_param_add_int32( param, "rounding_policy", self->vx_param.rounding_policy ); vsi_nn_kernel_param_add_int32( param, "down_scale_size_rounding", self->vx_param.down_scale_size_rounding ); + vsi_nn_kernel_param_add_int32( param, "pad_mode", vsi_nn_get_vx_pad_mode( self->nn_param.conv1d.pad_mode ) ); if( self->nn_param.conv1d.multiplier > 0 ) { vsi_nn_kernel_param_add_int32( param, "multiplier", diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d.c index ba50ffd..228b586 100644 --- 
a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d.c @@ -57,6 +57,7 @@ static vsi_status op_compute vsi_nn_kernel_param_add_int32( param, "rounding_policy", self->vx_param.rounding_policy ); vsi_nn_kernel_param_add_int32( param, "down_scale_size_rounding", self->vx_param.down_scale_size_rounding ); + vsi_nn_kernel_param_add_int32( param, "pad_mode", vsi_nn_get_vx_pad_mode( self->nn_param.conv2d.pad_mode ) ); if (self->nn_param.conv2d.multiplier != 0) { vsi_nn_kernel_param_add_int32( param, "multiplier", self->nn_param.conv2d.multiplier ); @@ -87,318 +88,95 @@ static vsi_bool op_check /* Check fl and scale*/ ret = vsi_nn_QuantCheck(inputs[0], inputs[1], inputs[2]); - if(ret) { + if (ret) { + vsi_size_t kx = 1; + vsi_size_t ky = 1; /* check inputs outputs data type */ - BEGIN_IO_TYPE_DECL(CONV2D, 3, 1) - /* IO_TYPE(INPUT, WEIGHT, BIAS, OUTPUT) */ - IO_TYPE(D_F32, D_F32, D_F32, D_F32) - IO_TYPE(D_F32, D_F32, D_F32, D_F16) + BEGIN_IO_TYPE_DECL(CONV2D, 2, 0) + /* IO_TYPE(INPUT, WEIGHT) */ + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F16, D_F16) - IO_TYPE(D_F16, D_F16, D_F16, D_F16) - IO_TYPE(D_F16, D_F16, D_F32, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_F16) - - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_F16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_F16) - - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F16) - - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_ASYM, D_U8|Q_ASYM) - - IO_TYPE(D_BF16, D_BF16, D_F32, D_F32) - IO_TYPE(D_BF16, D_BF16, D_F32, D_BF16) - - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_ASYM) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_I8|Q_SYM) - - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) - - /* IO_TYPE(INPUT, WEIGHT, NULL, OUTPUT) */ - IO_TYPE(D_F32, D_F32, D_NONE, D_F32) - - IO_TYPE(D_F16, D_F16, D_NONE, D_F16) - - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_F16) - - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_I16|Q_DFP) - - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_F16) - - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_U8|Q_ASYM) - - IO_TYPE(D_BF16, D_BF16, D_NONE, D_F32) - IO_TYPE(D_BF16, D_BF16, D_NONE, D_BF16) - - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_I8|Q_ASYM) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_NONE, D_I8|Q_SYM) - - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_SYM_PC) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC) /* HW 9.0 */ - IO_TYPE(D_F32, D_BF16, D_F32, D_BF16) - IO_TYPE(D_F32, D_BF16, D_NONE, D_BF16) + IO_TYPE(D_F32, D_BF16) /* HW 9.0.1 */ - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_F32) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, 
D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F32) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_F32) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) - - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_F32) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_F32) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_F32) - - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_F32) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) - - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_F16) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_F32) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_F32) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F16) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F32) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, 
D_I8|Q_DFP, D_I32|Q_DFP, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_F32) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_F32) - - IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_I8|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_I16|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_F16) - IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_BF16) - IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_F32) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_NONE, D_I16|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_NONE, D_BF16) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_NONE, D_F32) - IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F16) - IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_BF16) - IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F32) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_BF16) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_F32) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I64|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I64|Q_ASYM, D_I8|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I64|Q_ASYM, D_I16|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I64|Q_ASYM, D_BF16) - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM, D_I64|Q_ASYM, D_F32) - - IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_NONE, D_I8|Q_ASYM) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_NONE, D_I16|Q_ASYM) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_NONE, D_F16) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_NONE, D_BF16) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_NONE, D_F32) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_ASYM, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_ASYM, D_NONE, D_I16|Q_ASYM) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_ASYM, D_NONE, D_BF16) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_ASYM, D_NONE, D_F32) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_I32|Q_SYM, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_I32|Q_SYM, D_I8|Q_SYM) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_I32|Q_SYM, D_I16|Q_SYM) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_I32|Q_SYM, D_F16) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_I32|Q_SYM, D_BF16) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM, D_I32|Q_SYM, D_F32) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_I8|Q_SYM) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_I16|Q_SYM) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_BF16) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_F32) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I64|Q_SYM, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I64|Q_SYM, D_I8|Q_SYM) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I64|Q_SYM, D_I16|Q_SYM) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I64|Q_SYM, D_BF16) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I64|Q_SYM, D_F32) - - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_F16) - IO_TYPE(D_I8|Q_DFP, 
D_U8|Q_SYM_PC, D_NONE, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_F32) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_NONE, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_NONE, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_NONE, D_F32) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F16) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) - - IO_TYPE(D_I8|Q_SYM, D_U8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_SYM_PC, D_NONE, D_I8|Q_SYM) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_SYM_PC, D_NONE, D_I16|Q_SYM) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_SYM_PC, D_NONE, D_F16) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_SYM_PC, D_NONE, D_BF16) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_SYM_PC, D_NONE, D_F32) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM_PC, D_NONE, D_I8|Q_SYM) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM_PC, D_NONE, D_I16|Q_SYM) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM_PC, D_NONE, D_BF16) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM_PC, D_NONE, D_F32) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_SYM) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_SYM) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F16) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) - IO_TYPE(D_I8|Q_SYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_SYM) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_SYM) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) - IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) - - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_BF16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_F32) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_BF16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_F32) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_BF16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_F32) - - IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_NONE, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_NONE, D_BF16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_NONE, D_F32) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) - - IO_TYPE(D_F16, D_F16, D_NONE, D_BF16) - IO_TYPE(D_F16, D_F16, D_NONE, D_F32) - IO_TYPE(D_F16, D_F16, D_F32, D_BF16) - 
IO_TYPE(D_F16, D_F16, D_F32, D_F32) - - IO_TYPE(D_BF16, D_BF16, D_NONE, D_F16) - IO_TYPE(D_BF16, D_BF16, D_F32, D_F16) - - IO_TYPE(D_F32, D_BF16, D_NONE, D_F16) - IO_TYPE(D_F32, D_BF16, D_NONE, D_BF16) - IO_TYPE(D_F32, D_BF16, D_NONE, D_F32) - IO_TYPE(D_F32, D_BF16, D_F32, D_F16) - IO_TYPE(D_F32, D_BF16, D_F32, D_BF16) - IO_TYPE(D_F32, D_BF16, D_F32, D_F32) - - IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM, D_I32|Q_SYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_SYM_PC) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM_PC) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC) + IO_TYPE(D_F32, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM) /* HW 9.1.1 */ - IO_TYPE(D_U4|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U4|Q_ASYM) - IO_TYPE(D_U4|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I4|Q_ASYM) - IO_TYPE(D_U4|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_U4|Q_ASYM) - IO_TYPE(D_U4|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_I4|Q_ASYM) - IO_TYPE(D_U4|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_ASYM, D_U4|Q_ASYM) - IO_TYPE(D_U4|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_ASYM, D_I4|Q_ASYM) - IO_TYPE(D_I4|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_I4|Q_ASYM) - IO_TYPE(D_I4|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_U4|Q_ASYM) - IO_TYPE(D_I4|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I4|Q_ASYM) - IO_TYPE(D_I4|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U4|Q_ASYM) - IO_TYPE(D_I4|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I4|Q_ASYM) - IO_TYPE(D_I4|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U4|Q_ASYM) - IO_TYPE(D_I4|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I4|Q_DFP) + IO_TYPE(D_U4|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_I8|Q_SYM_PC) + IO_TYPE(D_I4|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I4|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I4|Q_ASYM, D_I8|Q_SYM_PC) + IO_TYPE(D_I4|Q_ASYM, D_I8|Q_SYM_PC) + IO_TYPE(D_I4|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_U4|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_SYM, D_U4|Q_ASYM) - IO_TYPE(D_U4|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_SYM, D_I4|Q_ASYM) - IO_TYPE(D_U4|Q_ASYM, D_I8|Q_SYM, D_I32|Q_SYM, D_U4|Q_ASYM) - IO_TYPE(D_U4|Q_ASYM, D_I8|Q_SYM, D_I32|Q_SYM, D_I4|Q_SYM) - IO_TYPE(D_U4|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM, D_I4|Q_SYM) - IO_TYPE(D_I4|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_I4|Q_SYM) - IO_TYPE(D_I4|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_U4|Q_ASYM) - IO_TYPE(D_I4|Q_SYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I4|Q_SYM) - IO_TYPE(D_I4|Q_SYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U4|Q_ASYM) - IO_TYPE(D_I4|Q_SYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I4|Q_SYM) - IO_TYPE(D_I4|Q_SYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U4|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_I8|Q_SYM) + IO_TYPE(D_U4|Q_ASYM, D_I8|Q_SYM_PC) + IO_TYPE(D_I4|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I4|Q_SYM, D_U8|Q_ASYM) + IO_TYPE(D_I4|Q_SYM, D_I8|Q_SYM_PC) + + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_U8|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I16|Q_ASYM, D_I8|Q_ASYM) + + IO_TYPE(D_U8|Q_ASYM, D_I16|Q_SYM) + IO_TYPE(D_I8|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_I16|Q_SYM, D_U8|Q_SYM) + IO_TYPE(D_I16|Q_SYM, D_U8|Q_SYM) + IO_TYPE(D_I16|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I16|Q_SYM, D_I8|Q_SYM) + + IO_TYPE(D_I16|Q_ASYM, D_I8|Q_SYM_PC) + IO_TYPE(D_I16|Q_ASYM, D_U8|Q_SYM_PC) + IO_TYPE(D_I16|Q_SYM, D_I8|Q_SYM_PC) + 
IO_TYPE(D_I16|Q_SYM, D_U8|Q_SYM_PC) END_IO_TYPE_DECL(CONV2D) - ret = VALIDATE_OP_IO_TYPES(CONV2D, self, inputs, self->input.num, outputs, self->output.num); - if(!ret) { + ret = VALIDATE_OP_IO_TYPES(CONV2D, self, inputs, 2, outputs, 0); + if (!ret) { char* desc = generate_op_io_types_desc(inputs, - self->input.num, outputs, self->output.num); + 2, outputs, 0); VSILOGE("Inputs/Outputs data type not support: %s", desc); destroy_op_io_types_desc(desc); return FALSE; } /* check parameters */ - if(inputs[1]->attr.size[0] * inputs[1]->attr.size[1] > 6400) { + kx = inputs[1]->attr.size[0]; + ky = inputs[1]->attr.dim_num == 3 ? 1 : inputs[1]->attr.size[1]; + if (kx * ky > 6400) { VSILOGE("Kernel size should <= 6400."); return FALSE; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c index 9ab2266..5af07e2 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c @@ -106,6 +106,7 @@ static vsi_nn_internal_tensor_t * create_input_conv input_conv->node->vx_param.overflow_policy = self->vx_param.overflow_policy; input_conv->node->vx_param.rounding_policy = self->vx_param.rounding_policy; input_conv->node->vx_param.down_scale_size_rounding = self->vx_param.down_scale_size_rounding; + input_conv->node->nn_param.conv2d.pad_mode = p->conv2d.pad_mode; input_conv->inputs[0] = input; input_conv->inputs[1] = weight; @@ -167,6 +168,7 @@ static vsi_nn_internal_tensor_t * create_recurrent_conv recurrent_conv->node->vx_param.overflow_policy = self->vx_param.overflow_policy; recurrent_conv->node->vx_param.rounding_policy = self->vx_param.rounding_policy; recurrent_conv->node->vx_param.down_scale_size_rounding = self->vx_param.down_scale_size_rounding; + recurrent_conv->node->nn_param.conv2d.pad_mode = p->conv2d.pad_mode; recurrent_conv->inputs[0] = input; recurrent_conv->inputs[1] = weight; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv3d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv3d.c index 063dbd0..2b470b1 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv3d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv3d.c @@ -77,6 +77,7 @@ static vsi_status op_compute MAP_PARAM("overflow_policy",self->vx_param.overflow_policy); MAP_PARAM("rounding_policy",self->vx_param.rounding_policy); MAP_PARAM("down_scale_size_rounding",self->vx_param.down_scale_size_rounding); + MAP_PARAM("pad_mode", vsi_nn_get_vx_pad_mode( self->nn_param.conv3d.pad_mode ) ); if ( self->nn_param.conv3d.dilation[0] * self->nn_param.conv3d.dilation[1] * diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c index e18c4bd..8d8ff5f 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c @@ -92,6 +92,11 @@ static vsi_status op_optimize status = VSI_SUCCESS; + if( !self->graph->ctx->options.enable_dataconvert_optimize ) + { + return status; + } + if ( _is_same_quant(self, inputs, outputs) == FALSE || (inputs[0]->t != NULL && outputs[0]->t != NULL)) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution1d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution1d.c index 0692666..4128480 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution1d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution1d.c @@ -35,6 +35,7 @@ #include "vsi_nn_log.h" #include "utils/vsi_nn_dtype_util.h" #include "kernel/vsi_nn_kernel.h" +#include 
"vsi_nn_error.h" #define COMPUTE_DECONV_SZ( in, ksize, pad_1, pad_2, stride, output_padding )\ (( in - 1 ) * stride + ksize - pad_1 - pad_2 + output_padding) @@ -66,20 +67,18 @@ static vsi_status op_compute if (inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC) { weight_tensor = vsi_nn_reshape_tensor( self->graph, inputs[1], weight_attr.size, 4 ); + CHECK_PTR_FAIL_GOTO( weight_tensor, "create tensor fail.", final ); } else { uint8_t * data = NULL; data = vsi_nn_ConvertTensorToData( self->graph, inputs[1] ); - if (NULL == data) - { - VSILOGE("Convert data fail.\n"); - status = VSI_FAILURE; - return status; - } + CHECK_PTR_FAIL_GOTO( data, "Convert data fail.", final ); + weight_attr.dtype.channel_dim = inputs[1]->attr.dtype.channel_dim + 1; weight_tensor = vsi_nn_CreateTensorFromData(self->graph, data, &weight_attr); vsi_nn_safe_free( data ); + CHECK_PTR_FAIL_GOTO( weight_tensor, "create tensor fail.", final ); } #ifdef VX_DECONVOLUTION_WEIGHT_LAYOUT_COMPATIBLE_KHRONOS diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_depthwise_conv1d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_depthwise_conv1d.c index 2a9f688..498c869 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_depthwise_conv1d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_depthwise_conv1d.c @@ -68,6 +68,8 @@ static vsi_status op_compute self->vx_param.rounding_policy ); vsi_nn_kernel_param_add_int32( param, "down_scale_size_rounding", self->vx_param.down_scale_size_rounding ); + vsi_nn_kernel_param_add_int32( param, "pad_mode", + vsi_nn_get_vx_pad_mode( self->nn_param.depthwise_conv1d.pad_mode ) ); self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "depthwise_conv1d", inputs, 3, outputs, 1, param ); if( self->n ) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c index 73ba406..aea2f63 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c @@ -38,6 +38,14 @@ #include "kernel/vsi_nn_kernel_eltwise.h" #include "utils/vsi_nn_constraint_check.h" +vsi_bool vsi_nn_kernel_is_supported_types + ( + vsi_nn_tensor_t** inputs, + size_t input_num, + vsi_nn_tensor_t** outputs, + size_t output_num + ); + static vsi_status _eltwise_op_compute ( const char * kernel_name, @@ -54,8 +62,9 @@ static vsi_status _eltwise_op_compute vx_bool doShapeOptimized = TRUE; vsi_nn_kernel_param_t * param = NULL; vsi_nn_context_t ctx = NULL; + vsi_bool is_executed_on_sh = FALSE; - if( NULL == self ) + if ( NULL == self ) { return VSI_FAILURE; } @@ -63,11 +72,15 @@ static vsi_status _eltwise_op_compute ctx = self->graph->ctx; + is_executed_on_sh = vsi_nn_kernel_is_supported_types(inputs, 2, outputs, 1) && + !ctx->config.support_stream_processor; + if ( strcmp(kernel_name, "sub") == 0 || strcmp(kernel_name, "add") == 0 || strcmp(kernel_name, "mul") == 0 - || (strcmp(kernel_name, "maximum") == 0 && ctx->config.support_stream_processor) - || (strcmp(kernel_name, "minimum") == 0 && ctx->config.support_stream_processor)) + || (strcmp(kernel_name, "maximum") == 0 && !is_executed_on_sh) + || (strcmp(kernel_name, "minimum") == 0 && !is_executed_on_sh) + || (strcmp(kernel_name, "div") == 0 && !is_executed_on_sh)) { doShapeOptimized = FALSE; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_expand_broadcast.c b/src/tim/vx/internal/src/ops/vsi_nn_op_expand_broadcast.c index 6bb4dad..1a2a3aa 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_expand_broadcast.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_expand_broadcast.c @@ 
-118,7 +118,11 @@ static vsi_bool op_setup memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); attr.dim_num = p->dim_num; - attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32) { + attr.dtype.vx_type = VSI_NN_TYPE_INT32; + } else { + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + } attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; attr.is_const = TRUE; for(i = 0; i < p->dim_num; i++) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv1d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv1d.c index 26d3380..3522896 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv1d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv1d.c @@ -149,6 +149,7 @@ static vsi_bool op_setup curr->node->nn_param.grouped_conv2d.multiplier = p->multiplier; curr->node->nn_param.grouped_conv2d.weights = p->weights; curr->node->nn_param.grouped_conv2d.pad_type = p->pad_type; + curr->node->nn_param.grouped_conv2d.pad_mode = p->pad_mode; vsi_nn_internal_setup_node(self, curr); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv2d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv2d.c index 4f2ae60..f3818c7 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv2d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv2d.c @@ -172,6 +172,7 @@ static vsi_status op_compute p_ext->padding_x_right = self->nn_param.conv2d.pad[1]; p_ext->padding_y_bottom = self->nn_param.conv2d.pad[3]; + p_ext->pad_mode = vsi_nn_get_vx_pad_mode(nn_param->pad_mode); //set ext2 relative parameters p_ext2->depth_multiplier = self->nn_param.conv2d.multiplier; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c b/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c index 0f7baf9..e872a3d 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c @@ -181,10 +181,7 @@ static vsi_status op_compute p = &(self->nn_param.l2normalizescale); axis = p->axis; - if ( (inputs[1]->attr.is_const == TRUE && _check_value_is_equal_to_one(self->graph, inputs[1])) || - ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 && - outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 ) - ) + if ( self->nn_param.l2normalizescale.local.use_internal_node ) { return vsi_nn_internal_compute_node( self ); } @@ -350,14 +347,16 @@ static vsi_bool op_setup if ( inputs[1]->attr.is_const == TRUE && _check_value_is_equal_to_one( self->graph, inputs[1] ) ) { + self->nn_param.l2normalizescale.local.use_internal_node = TRUE; curr = vsi_nn_internal_new_node(self, VSI_NN_OP_L2_NORMALIZE, 0, 0); curr->node->nn_param.l2_normalize.axis = self->nn_param.l2normalizescale.axis; curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; vsi_nn_internal_setup_node( self, curr ); } - else if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 && - outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 ) + else if ( ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 && + outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 ) || + self->graph->ctx->config.support_stream_processor ) { vsi_nn_internal_tensor_t* output_tensor = NULL; vsi_nn_internal_tensor_t* reshape_tensor = NULL; @@ -365,6 +364,8 @@ static vsi_bool op_setup int32_t dim_num = inputs[0]->attr.dim_num; int32_t i = 0; + self->nn_param.l2normalizescale.local.use_internal_node = TRUE; + memcpy( &attr, &outputs[0]->attr, sizeof( attr ) ); attr.vtl = TRUE; attr.is_const = FALSE; @@ -382,7 +383,7 @@ static vsi_bool op_setup attr.size[i] = i 
== self->nn_param.l2normalizescale.axis ? inputs[0]->attr.size[i] : 1; } attr.dim_num = dim_num; - if (attr.dtype.vx_type != VSI_NN_TYPE_BFLOAT16) + if (attr.dtype.vx_type != VSI_NN_TYPE_BFLOAT16 && inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16) { attr.dtype.vx_type = VSI_NN_TYPE_BFLOAT16; attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; @@ -415,6 +416,8 @@ static vsi_status op_init vsi_status status = VSI_SUCCESS; uint32_t i = 0; + self->nn_param.l2normalizescale.local.use_internal_node = FALSE; + if (vsi_nn_compareVersion(self->graph, 1, 1, 13) == -1) { self->nn_param.l2normalizescale.axis = VSI_NN_L2NORMALIZESCALE_DEFAULT_AXIS; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c index 74623e2..f8330b7 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c @@ -39,7 +39,6 @@ #define _INPUT_NUM (3) #define _OUTPUT_NUM (1) -#define VSI_NN_SUPPORT_AXIS (0) static vsi_status op_compute ( @@ -52,16 +51,17 @@ static vsi_status op_compute vsi_nn_kernel_param_t * param = NULL; vsi_nn_kernel_node_t n = NULL; float eps = self->nn_param.layernorm.eps; -#if VSI_NN_SUPPORT_AXIS - if ( 0 ) + int32_t axis = self->nn_param.layernorm.axis; + + if ( self->nn_param.layernorm.local->use_internal_node ) { return vsi_nn_internal_compute_node( self ); } -#endif param = vsi_nn_kernel_param_create(); vsi_nn_kernel_param_add_float32( param, "eps", eps ); + vsi_nn_kernel_param_add_int32( param, "axis", axis ); n = vsi_nn_kernel_selector( self->graph, "layer_norm", inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param ); if ( n != NULL ) @@ -86,39 +86,43 @@ static vsi_bool op_setup ) { vsi_bool ret = TRUE; -#if VSI_NN_SUPPORT_AXIS + int32_t axis = 0; vsi_nn_internal_node_t* curr = NULL; -#endif if ( NULL == self ) { return FALSE; } -#if VSI_NN_SUPPORT_AXIS + + axis = self->nn_param.layernorm.axis; + vsi_nn_internal_init_node_wksp( self ); - if ( 0 ) + if ( axis != 0 && !self->graph->ctx->config.support_stream_processor) { vsi_nn_internal_tensor_t* mean_tensor = NULL; vsi_nn_internal_tensor_t* vari_tensor = NULL; vsi_nn_tensor_attr_t attr; - int32_t *axis = NULL; + int32_t *axis_array = NULL; + + self->nn_param.layernorm.local->use_internal_node = TRUE; memcpy( &attr, &inputs[0]->attr, sizeof( attr ) ); - attr.size[0] = 1; + attr.size[axis] = 1; attr.vtl = TRUE; attr.is_const = FALSE; attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; - attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; + mean_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); vari_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); curr = vsi_nn_internal_new_node(self, VSI_NN_OP_MOMENTS, 0, 0); - axis = (int32_t*)\ - vsi_nn_internal_new_node_param(curr, sizeof(int32_t) * 4); - axis[0] = 0; + axis_array = (int32_t*)\ + vsi_nn_internal_new_node_param(curr, sizeof(int32_t) * VSI_NN_MAX_DIM_NUM); + axis_array[0] = axis; - curr->node->nn_param.moments.axis = axis; + curr->node->nn_param.moments.axis = axis_array; curr->node->nn_param.moments.axis_num = 1; curr->inputs[0] = inputs[0]; curr->outputs[0] = mean_tensor->t; @@ -136,7 +140,6 @@ static vsi_bool op_setup vsi_nn_internal_setup_node( self, curr ); } else -#endif { ret = vsi_nn_op_common_setup(self, inputs, outputs); } @@ -211,14 +214,31 @@ static vsi_bool op_check return TRUE; } /* op_check() */ +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + 
self->nn_param.layernorm.axis = 0; + + self->nn_param.layernorm.local = (vsi_nn_layernorm_lcl_data *)malloc(sizeof(vsi_nn_layernorm_lcl_data)); + memset(self->nn_param.layernorm.local, 0x00, sizeof(vsi_nn_layernorm_lcl_data)); + self->nn_param.layernorm.local->use_internal_node = FALSE; + + return status; +} + static vsi_status op_deinit ( vsi_nn_node_t * self ) { -#if VSI_NN_SUPPORT_AXIS + vsi_nn_safe_free(self->nn_param.layernorm.local); + vsi_nn_internal_deinit_node_wksp( self ); -#endif + vsi_nn_op_common_deinit(self); return VSI_SUCCESS; @@ -231,7 +251,7 @@ extern "C" { DEF_OP_REG ( /* op_name */ LAYER_NORM, - /* init */ NULL, + /* init */ op_init, /* compute */ op_compute, /* deinit */ op_deinit, /* check */ op_check, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lppool.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lppool.c new file mode 100644 index 0000000..1758ac1 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lppool.c @@ -0,0 +1,259 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_constraint_check.h" + +typedef struct _lppool_local_data_t { + int32_t placeholder; +} lppool_local_data_t; + +/* + Declare number of input and output. 
+ */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t* param = NULL; + vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; + uint32_t i = 0; + uint32_t new_rank = 0; + vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; + int32_t ksize_x = (int32_t)self->nn_param.lppool.ksize[0]; + int32_t ksize_y = (int32_t)self->nn_param.lppool.ksize[1]; + int32_t p = (int32_t)self->nn_param.lppool.p; + int32_t pad_left = (int32_t)self->nn_param.lppool.pad[0]; + int32_t pad_right = (int32_t)self->nn_param.lppool.pad[1]; + int32_t pad_top = (int32_t)self->nn_param.lppool.pad[2]; + int32_t pad_bottom = (int32_t)self->nn_param.lppool.pad[3]; + int32_t stride_x = (int32_t)self->nn_param.lppool.stride[0]; + int32_t stride_y = (int32_t)self->nn_param.lppool.stride[1]; + new_rank = 3; + + shapes[0][0] = inputs[0]->attr.size[0]; + shapes[0][1] = inputs[0]->attr.size[1]; + shapes[0][2] = inputs[0]->attr.size[2]; + shapes[1][0] = outputs[0]->attr.size[0]; + shapes[1][1] = outputs[0]->attr.size[1]; + shapes[1][2] = outputs[0]->attr.size[2]; + + for (i = 3; i < inputs[0]->attr.dim_num; i++) + { + shapes[0][2] = shapes[0][2] * inputs[0]->attr.size[i]; + shapes[1][2] = shapes[1][2] * outputs[0]->attr.size[i]; + } + + reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, + inputs[0], shapes[0], new_rank ); + reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, + outputs[0], shapes[1], new_rank ); + + param = vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_int32(param, "ksize_x", ksize_x); + vsi_nn_kernel_param_add_int32(param, "ksize_y", ksize_y); + vsi_nn_kernel_param_add_int32(param, "pad_left", pad_left); + vsi_nn_kernel_param_add_int32(param, "pad_right", pad_right); + vsi_nn_kernel_param_add_int32(param, "pad_top", pad_top); + vsi_nn_kernel_param_add_int32(param, "pad_bottom", pad_bottom); + vsi_nn_kernel_param_add_int32(param, "stride_x", stride_x); + vsi_nn_kernel_param_add_int32(param, "stride_y", stride_y); + vsi_nn_kernel_param_add_int32(param, "p", p); + + self->n = (vx_node)vsi_nn_kernel_selector(self->graph,"lppool", + &reshape_tensors[0],_INPUT_NUM,&reshape_tensors[1],_OUTPUT_NUM,param); + if( self->n ) + { + status = VSI_SUCCESS; + } + + vsi_nn_kernel_param_release(¶m); + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(LPPOOL, 1, 1) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_F32, D_I16|Q_SYM) + IO_TYPE(D_F16, D_I16|Q_SYM) + IO_TYPE(D_F32, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_SYM, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_F32) + IO_TYPE(D_I8|Q_SYM, D_F32) + IO_TYPE(D_I16|Q_DFP, D_F32) + IO_TYPE(D_I8|Q_DFP, D_F32) + IO_TYPE(D_F32, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_SYM, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_F32, D_I8|Q_SYM) + IO_TYPE(D_F16, D_I8|Q_SYM) + IO_TYPE(D_F32, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP) + END_IO_TYPE_DECL(LPPOOL) + + if (!VALIDATE_OP_IO_TYPES( + LPPOOL, self, inputs, self->input.num, outputs, self->output.num)) 
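/* reject input/output dtype pairs that are not listed in the LPPOOL IO table above and log the offending combination */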
+ { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_size_t ksize[_cnt_of_array(self->nn_param.lppool.ksize)] = {0}; + vsi_size_t i = 0; + vsi_size_t pad[_cnt_of_array(self->nn_param.lppool.pad)] = {0}; + + for (i = 0; i < _cnt_of_array(self->nn_param.lppool.ksize); i++) + { + ksize[i] = self->nn_param.lppool.ksize[i]; + } + for (i = 0; i < _cnt_of_array(self->nn_param.lppool.pad); i++) + { + pad[i] = self->nn_param.lppool.pad[i]; + } + + vsi_nn_compute_padding( + inputs[0]->attr.size, + ksize, + self->nn_param.lppool.stride, + NULL, + self->nn_param.lppool.pad_type, + pad + ); + for (i = 0; i < _cnt_of_array(self->nn_param.lppool.ksize); i++) + { + self->nn_param.lppool.ksize[i] = (uint32_t)ksize[i]; + } + for (i = 0; i < _cnt_of_array(self->nn_param.lppool.pad); i++) + { + self->nn_param.lppool.pad[i] = (uint32_t)pad[i]; + } + + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + outputs[0]->attr.size[0] = vsi_nn_ComputeFilterSize + ( + inputs[0]->attr.size[0], + self->nn_param.lppool.ksize[0], + &self->nn_param.lppool.pad[0], + self->nn_param.lppool.stride[0], + 0, + VSI_NN_ROUND_CEIL + ); + outputs[0]->attr.size[1] = vsi_nn_ComputeFilterSize + ( + inputs[0]->attr.size[1], + self->nn_param.lppool.ksize[1], + &self->nn_param.lppool.pad[1], + self->nn_param.lppool.stride[1], + 0, + VSI_NN_ROUND_CEIL + ); + for (i = 2; i < outputs[0]->attr.dim_num; i++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; + } + } + return TRUE; +} /* op_setup() */ + +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + self->nn_param.lppool.p = 2; + + return status; +} /* op_init() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ LPPOOL, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_activation.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_activation.c index a6c5c63..bcdc2d9 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_activation.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_activation.c @@ -35,7 +35,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "libnnext/vsi_nn_vxkernel.h" +#include "vsi_nn_error.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_util.h" @@ -123,10 +123,11 @@ static vsi_bool op_setup int32_t ifco_start_index = 0; vsi_nn_tensor_attr_t attr; int32_t i = 0; + vsi_bool ret = FALSE; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); - if( NULL == self ) + if ( NULL == self ) { return FALSE; } @@ -160,13 +161,15 @@ static vsi_bool op_setup attr.size[1] = 1; attr.dim_num = 2; t0 = vsi_nn_reshape_tensor(self->graph, inputs[LSTMUNIT_ACT_DATA_BI + i], attr.size, attr.dim_num); + CHECK_PTR_FAIL_GOTO( t0, "create tensor fail.", final ); - if( dst_dtype.vx_type != t0->attr.dtype.vx_type + if ( dst_dtype.vx_type != t0->attr.dtype.vx_type && dst_dtype.qnt_type != t0->attr.dtype.qnt_type ) { 
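/* both the element type and the quantization of the reshaped bias differ from the activation's working dtype, so keep a converted copy and release the temporary reshape handle */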
p->local.tensors[LSTMUNIT_ACT_TENSOR_BI + i] = vsi_nn_ConvertTensorDtype( self->graph, t0, &dst_dtype ); - vsi_nn_ReleaseTensor( &t0 ); + + vsi_safe_release_tensor(t0); } else { @@ -182,13 +185,14 @@ static vsi_bool op_setup attr.size[1] = 1; attr.dim_num = 2; t1 = vsi_nn_reshape_tensor(self->graph, inputs[LSTMUNIT_ACT_LN_WI + i], attr.size, attr.dim_num); + CHECK_PTR_FAIL_GOTO( t1, "create tensor fail.", final ); - if( dst_dtype.vx_type != t1->attr.dtype.vx_type + if ( dst_dtype.vx_type != t1->attr.dtype.vx_type && dst_dtype.qnt_type != t1->attr.dtype.qnt_type ) { p->local.tensors[LSTMUNIT_ACT_TENSOR_LN_WI + i] = vsi_nn_ConvertTensorDtype( self->graph, t1, &dst_dtype ); - vsi_nn_ReleaseTensor( &t1 ); + vsi_safe_release_tensor(t1); } else { @@ -226,7 +230,9 @@ static vsi_bool op_setup outputs[LSTMUNIT_ACT_HSTATE_OUT]->attr.size[3] = outputs[LSTMUNIT_ACT_OUTPUT]->attr.size[3]; } - return TRUE; + ret = TRUE; +final: + return ret; } /* op_setup() */ static vsi_status op_deinit diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_maxpoolwithargmax.c b/src/tim/vx/internal/src/ops/vsi_nn_op_maxpoolwithargmax.c index 23b987d..9df9c1b 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_maxpoolwithargmax.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_maxpoolwithargmax.c @@ -184,15 +184,13 @@ static vsi_bool op_setup if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { ret = vsi_nn_OpSetup( VSI_NN_OP_POOL, self, inputs, outputs ); - } if ( VSI_NN_DIM_AUTO == outputs[1]->attr.dim_num ) { - outputs[1]->attr.dim_num = outputs[0]->attr.dim_num; - memcpy( outputs[1]->attr.size, outputs[0]->attr.size, - VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t) ); + outputs[1]->attr.dim_num = outputs[0]->attr.dim_num; + memcpy( outputs[1]->attr.size, outputs[0]->attr.size, + VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t) ); } - return ret; } /* op_setup() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pad.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pad.c index c1d35eb..2c7dba9 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pad.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pad.c @@ -138,6 +138,7 @@ static vsi_status op_compute vsi_status status; vx_nn_pad_params_t p; vsi_nn_tensor_t *convert_tensor = NULL; + vsi_bool release_intermediate_tensor = TRUE; status = VSI_FAILURE; if (VSI_SUCCESS != vsi_nn_InitPadParameter(self, &p)) @@ -164,8 +165,8 @@ static vsi_status op_compute } else { - convert_tensor = vsi_nn_reshape_tensor( self->graph, - inputs[0], inputs[0]->attr.size, inputs[0]->attr.dim_num ); + convert_tensor = inputs[0]; + release_intermediate_tensor = FALSE; } self->n = vxTensorPadNode( self->graph->g, @@ -182,7 +183,10 @@ static vsi_status op_compute final: vsi_nn_DeinitPadParameter(&p); - vsi_safe_release_tensor(convert_tensor); + if (release_intermediate_tensor) + { + vsi_safe_release_tensor(convert_tensor); + } return status; } /* op_compute() */ @@ -266,7 +270,7 @@ static vsi_bool op_setup if (front + back + inputs[0]->attr.size[i] != outputs[0]->attr.size[i]) { VSILOGE("Error:output shape[%u] not equal front padding[%u] + input shape[%u] + back padding[%u]", - outputs[0]->attr.size[i], front, back); + outputs[0]->attr.size[i], front, inputs[0]->attr.size[i], back); return FALSE; } } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pad2.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pad2.c index bd01a72..8f6227e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pad2.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pad2.c @@ -45,31 +45,6 @@ typedef struct _pad2_local_data_t { #define _INPUT_NUM (1) #define _OUTPUT_NUM 
(1) -static int32_t _get_vx_pad_mode(vx_enum mode) -{ - int32_t pad_mode = 0; - switch (mode) - { - case VSI_NN_PAD_MODE_CONSTANT: - pad_mode = VX_PAD_CONSTANT; - break; - case VSI_NN_PAD_MODE_REPLICATE: - pad_mode = VX_PAD_REPLICATE; - break; - case VSI_NN_PAD_MODE_SYMMETRIC: - pad_mode = VX_PAD_MIRROR_SYMMETRIC; - break; - case VSI_NN_PAD_MODE_REFLECT: - pad_mode = VX_PAD_MIRROR_REFLECT; - break; - default: - VSILOGE("Wrong pad_mode value"); - break; - } - - return pad_mode; -} - static int32_t _check_mirror_pad_size ( vx_enum mode, @@ -122,7 +97,7 @@ static vsi_status op_compute vsi_status status = VSI_FAILURE; vsi_nn_pad2_param *p = &self->nn_param.pad2; vsi_nn_kernel_param_t * param; - int32_t pad_mode = _get_vx_pad_mode(p->mode); + int32_t pad_mode = vsi_nn_get_vx_pad_mode(p->mode); param = vsi_nn_kernel_param_create(); @@ -230,7 +205,7 @@ static vsi_bool op_setup if (front + back + inputs[0]->attr.size[i] != outputs[0]->attr.size[i]) { VSILOGE("Error:output shape[%u] not equal front padding[%u] + input shape[%u] + back padding[%u]", - outputs[0]->attr.size[i], front, back); + outputs[0]->attr.size[i], front, inputs[0]->attr.size[i], back); return FALSE; } } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pool.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pool.c index eadb94a..38409d6 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pool.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pool.c @@ -141,7 +141,7 @@ static vsi_status op_optimize char tensor_name[128]; dim = inputs[0]->attr.dim_num; - if(FALSE == _is_pool1d(self, inputs)) + if (FALSE == _is_pool1d(self, inputs)) { return VSI_SUCCESS; } @@ -155,9 +155,9 @@ static vsi_status op_optimize { /* reshape 3d input (xcn) --> 4d input (whcn) */ shape[0] = inputs[0]->attr.size[0];//width - shape[1] = 1;//height - shape[2] = inputs[0]->attr.size[1]; - shape[3] = inputs[0]->attr.size[2]; + shape[1] = inputs[0]->attr.size[1]; + shape[2] = inputs[0]->attr.size[2]; + shape[3] = 1;//batch dim = 4; local->reshaped_input = vsi_nn_reshape_tensor(self->graph, inputs[0], shape, dim); } @@ -165,9 +165,9 @@ static vsi_status op_optimize { /* reshape 3d output(xcn) --> 4d output(whcn) */ shape[0] = outputs[0]->attr.size[0];//width - shape[1] = 1;//height - shape[2] = outputs[0]->attr.size[1]; - shape[3] = outputs[0]->attr.size[2]; + shape[1] = outputs[0]->attr.size[1]; + shape[2] = outputs[0]->attr.size[2]; + shape[3] = 1;//batch dim = 4; local->reshaped_output = vsi_nn_reshape_tensor(self->graph, outputs[0], shape, dim); if (local->reshaped_output && local->reshaped_output->t) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c index f913afd..c7f47af 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c @@ -94,8 +94,11 @@ static vsi_bool op_setup p->type == VSI_NN_SOURCE_FORMAT_IMAGE_NV12 || p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB || p->type == VSI_NN_SOURCE_FORMAT_IMAGE_BGRA || + p->type == VSI_NN_SOURCE_FORMAT_IMAGE_GRAY || p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR || - p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR_SEP + p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR_SEP || + p->type == VSI_NN_SOURCE_FORMAT_IMAGE_YUYV422 || + p->type == VSI_NN_SOURCE_FORMAT_IMAGE_UYVY422 ) { uint32_t i = 0; @@ -160,7 +163,14 @@ static vsi_bool op_setup curr->node->nn_param.pre_process_gray.output_attr.dim_num = p->output_attr.dim_num; curr->inputs[0] = inputs[PRE_PROCESS_INPUT0]; - curr->outputs[0] = 
outputs[PRE_PROCESS_OUTPUT]; + if (layout == VSI_NN_DEST_LAYOUT_NHWC) + { + curr->outputs[0] = preprocess_tensor->t; + } + else + { + curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; + } vsi_nn_internal_setup_node(self, curr); } @@ -470,6 +480,57 @@ static vsi_bool op_setup vsi_nn_internal_setup_node(self, curr); } break; + case VSI_NN_SOURCE_FORMAT_IMAGE_YUYV422: + case VSI_NN_SOURCE_FORMAT_IMAGE_UYVY422: + { + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_PRE_PROCESS_YUV422, 0, 0 ); + + if (p->reverse_channel) + { + curr->node->nn_param.pre_process_yuv422.r_mean = p->norm.mean[2]; + curr->node->nn_param.pre_process_yuv422.g_mean = p->norm.mean[1]; + curr->node->nn_param.pre_process_yuv422.b_mean = p->norm.mean[0]; + } + else + { + curr->node->nn_param.pre_process_yuv422.r_mean = p->norm.mean[0]; + curr->node->nn_param.pre_process_yuv422.g_mean = p->norm.mean[1]; + curr->node->nn_param.pre_process_yuv422.b_mean = p->norm.mean[2]; + } + + if (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_YUYV422) + { + curr->node->nn_param.pre_process_yuv422.yuv422_type = 0; + } + else + { + curr->node->nn_param.pre_process_yuv422.yuv422_type = 1; + } + + curr->node->nn_param.pre_process_yuv422.rgb_scale = p->norm.scale; + curr->node->nn_param.pre_process_yuv422.reverse_channel = p->reverse_channel; + curr->node->nn_param.pre_process_yuv422.rect.left = p->rect.left; + curr->node->nn_param.pre_process_yuv422.rect.top = p->rect.top; + curr->node->nn_param.pre_process_yuv422.rect.width = p->rect.width; + curr->node->nn_param.pre_process_yuv422.rect.height = p->rect.height; + curr->node->nn_param.pre_process_yuv422.output_attr.size = p->output_attr.size; + curr->node->nn_param.pre_process_yuv422.output_attr.dim_num = p->output_attr.dim_num; + curr->node->nn_param.pre_process_yuv422.perm = p->perm; + curr->node->nn_param.pre_process_yuv422.dim_num = p->dim_num; + + curr->inputs[0] = inputs[PRE_PROCESS_INPUT0]; + if (layout == VSI_NN_DEST_LAYOUT_NHWC) + { + curr->outputs[0] = preprocess_tensor->t; + } + else + { + curr->outputs[0] = outputs[PRE_PROCESS_OUTPUT]; + } + + vsi_nn_internal_setup_node(self, curr); + } + break; default: { VSILOGE( "Not support this type!(PRE_PROCESS)\n"); @@ -479,10 +540,13 @@ static vsi_bool op_setup } if ( p->type == VSI_NN_SOURCE_FORMAT_IMAGE_YUV420 || + p->type == VSI_NN_SOURCE_FORMAT_IMAGE_YUYV422 || + p->type == VSI_NN_SOURCE_FORMAT_IMAGE_UYVY422 || p->type == VSI_NN_SOURCE_FORMAT_IMAGE_YUV444 || p->type == VSI_NN_SOURCE_FORMAT_IMAGE_NV12 || p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB || p->type == VSI_NN_SOURCE_FORMAT_IMAGE_BGRA || + p->type == VSI_NN_SOURCE_FORMAT_IMAGE_GRAY || p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR || p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR_SEP ) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c index 176aabf..a60f446 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c @@ -87,12 +87,12 @@ static vsi_bool op_check IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I8|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I16|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) - IO_TYPE(D_U8, D_U8, D_U8|Q_ASYM) - IO_TYPE(D_U8, D_U8, D_I8|Q_DFP) - IO_TYPE(D_U8, D_U8, D_I16|Q_DFP) - IO_TYPE(D_U8, D_U8, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I8|Q_SYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I16|Q_SYM) 
END_IO_TYPE_DECL(PRE_PROCESS_NV12) - if(!VALIDATE_OP_IO_TYPES(PRE_PROCESS_NV12, self, inputs, self->input.num, outputs, self->output.num)) { + if (!VALIDATE_OP_IO_TYPES(PRE_PROCESS_NV12, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, self->input.num, outputs, self->output.num); VSILOGE("Inputs/Outputs data type not support: %s", desc); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv420.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv420.c index d98910b..bcac93c 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv420.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv420.c @@ -87,10 +87,10 @@ static vsi_bool op_check IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_I8|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_I16|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) - IO_TYPE(D_U8, D_U8, D_U8, D_U8|Q_ASYM) - IO_TYPE(D_U8, D_U8, D_U8, D_I8|Q_DFP) - IO_TYPE(D_U8, D_U8, D_U8, D_I16|Q_DFP) - IO_TYPE(D_U8, D_U8, D_U8, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_I8|Q_SYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_I16|Q_SYM) END_IO_TYPE_DECL(PRE_PROCESS_YUV420) if(!VALIDATE_OP_IO_TYPES(PRE_PROCESS_YUV420, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv422.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv422.c new file mode 100644 index 0000000..b9c4daf --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv422.c @@ -0,0 +1,238 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_constraint_check.h" + +typedef struct _pre_process_yuv422_local_data_t { + int32_t placeholder; +} pre_process_yuv422_local_data_t; + +/* + Declare number of input and output. 
+ */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; + vsi_nn_kernel_node_t n = NULL; + param =vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_int32( param, "scale_x", self->nn_param.pre_process_yuv422.local->scale_x ); + vsi_nn_kernel_param_add_int32( param, "scale_y", self->nn_param.pre_process_yuv422.local->scale_y ); + vsi_nn_kernel_param_add_int32( param, "left", self->nn_param.pre_process_yuv422.rect.left ); + vsi_nn_kernel_param_add_int32( param, "top", self->nn_param.pre_process_yuv422.rect.top ); + vsi_nn_kernel_param_add_float32( param, "r_mean", self->nn_param.pre_process_yuv422.r_mean ); + vsi_nn_kernel_param_add_float32( param, "g_mean", self->nn_param.pre_process_yuv422.g_mean ); + vsi_nn_kernel_param_add_float32( param, "b_mean", self->nn_param.pre_process_yuv422.b_mean ); + vsi_nn_kernel_param_add_float32( param, "rgb_scale", self->nn_param.pre_process_yuv422.rgb_scale ); + vsi_nn_kernel_param_add_int32( param, "reverse", self->nn_param.pre_process_yuv422.reverse_channel ); + vsi_nn_kernel_param_add_int32( param, "enable_perm", self->nn_param.pre_process_yuv422.local->enable_perm ); + vsi_nn_kernel_param_add_int32( param, "enable_copy", self->nn_param.pre_process_yuv422.local->enable_copy ); + vsi_nn_kernel_param_add_int32( param, "yuv422_type", self->nn_param.pre_process_yuv422.yuv422_type ); + n = vsi_nn_kernel_selector( self->graph, "pre_process_yuv422", inputs, 1, outputs, 1, param ); + if( n != NULL ) + { + self->n = (vx_node)n; + status = VSI_SUCCESS; + } + + if(param != NULL) + { + vsi_nn_kernel_param_release( ¶m ); + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(PRE_PROCESS_YUV422, 1, 1) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM) + IO_TYPE(D_U8|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I16|Q_SYM) + END_IO_TYPE_DECL(PRE_PROCESS_YUV422) + if (!VALIDATE_OP_IO_TYPES(PRE_PROCESS_YUV422, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* TODO: Add code to comput outputs' shape. 
*/ + vsi_nn_pre_process_yuv422_param * p = NULL; + uint32_t i = 0; + p = (vsi_nn_pre_process_yuv422_param *)&(self->nn_param.pre_process_yuv422); + + if (p->rect.width == 0 || p->rect.height == 0) + { + VSILOGE("Image size cannot be zero !(PRE_PROCESS_YUV422)\n"); + return FALSE; + } + else + { + for (i = 0; i < p->output_attr.dim_num; i++) + { + if (p->output_attr.size[i] == 0) + { + VSILOGE("output size cannot be zero!(PRE_PROCESS_YUV422)\n"); + return FALSE; + } + } + } + + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + if (p->output_attr.dim_num > 0) + { + for (i = 0; i < p->output_attr.dim_num; i++) + { + if (p->output_attr.size[i] == 0) + { + VSILOGE("output size cannot be zero!(PRE_PROCESS_YUV422)\n"); + return FALSE; + } + else + { + outputs[0]->attr.dim_num = p->output_attr.dim_num; + outputs[0]->attr.size[i] = p->output_attr.size[i]; + } + } + } + else + { + VSILOGE("output dim num cannot be zero!(PRE_PROCESS_YUV422)\n"); + return FALSE; + } + } + + p->local->scale_x = (int32_t)((p->rect.width << 15) / outputs[0]->attr.size[0]); + p->local->scale_y = (int32_t)((p->rect.height << 15) / outputs[0]->attr.size[1]); + + p->local->enable_copy = ((p->local->scale_x == p->local->scale_y) && (p->local->scale_x == (1 << 15))); + + return TRUE; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t* self + ) +{ + if (self->nn_param.pre_process_yuv422.local != NULL) + { + uint32_t i = 0; + for (i = 0; i < _VSI_NN_PRE_PROCESS_YUV422_LOCAL_TENSOR_NUM; i++) + { + if (self->nn_param.pre_process_yuv422.local->local_tensor[i] != NULL) + { + vxReleaseTensor(&(self->nn_param.pre_process_yuv422.local->local_tensor[i])); + self->nn_param.pre_process_yuv422.local->local_tensor[i] = NULL; + } + } + free(self->nn_param.pre_process_yuv422.local); + self->nn_param.pre_process_yuv422.local = NULL; + } + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + self->nn_param.pre_process_yuv422.local = + (vsi_nn_pre_process_yuv422_lcl_data *)malloc(sizeof(vsi_nn_pre_process_yuv422_lcl_data)); + + if (NULL == self->nn_param.pre_process_yuv422.local) + { + return VX_ERROR_NO_MEMORY; + } + memset(self->nn_param.pre_process_yuv422.local, 0, sizeof(vsi_nn_pre_process_yuv422_lcl_data)); + + return status; +} /* op_init() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ PRE_PROCESS_YUV422, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c index 5a37151..b5489bf 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c @@ -158,7 +158,7 @@ static vsi_bool _check_is_sp_supported_type { int32_t * axes = self->nn_param.reduce.local2->axes; int32_t axes_num = self->nn_param.reduce.local2->axes_num; - vsi_size_t shapes[4][VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t shapes[4][VSI_NN_MAX_DIM_NUM] = { {0} }; int32_t axis_in[VSI_NN_MAX_DIM_NUM] = {0}; int32_t new_axis[VSI_NN_MAX_DIM_NUM] = {0}; int32_t i = 0; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce_mean_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce_mean_internal.c index ced3cd7..04c4271 100644 --- 
a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce_mean_internal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce_mean_internal.c @@ -57,7 +57,7 @@ static vsi_status op_compute int32_t * axis = self->nn_param.reduce_mean_internal.axis; int32_t axis_num = self->nn_param.reduce_mean_internal.axis_num; float scale = self->nn_param.reduce_mean_internal.scale; - vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { {0} }; int32_t new_axis[VSI_NN_MAX_DIM_NUM] = {0}; vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; uint32_t axis_size = 0; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_relu1.c b/src/tim/vx/internal/src/ops/vsi_nn_op_relu1.c index a1ba17c..5d1c2d4 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_relu1.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_relu1.c @@ -31,6 +31,7 @@ #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" static vsi_status op_compute ( @@ -39,22 +40,16 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status; - status = VSI_FAILURE; + vsi_status status = VSI_SUCCESS; + vsi_nn_kernel_node_t n = NULL; - self->n = vxActivationLayer( - self->graph->g, - inputs[0]->t, - VX_CONVOLUTIONAL_NETWORK_ACTIVATION_RELU1, - 0, - 0, - outputs[0]->t - ); - - if( NULL != self->n ) + n = vsi_nn_kernel_selector( self->graph, "relu1", inputs, 1, outputs, 1, NULL ); + if ( n == NULL ) { - status = VSI_SUCCESS; + status = VSI_FAILURE; } + self->n = (vx_node)n; + return status; } /* op_compute() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_relu6.c b/src/tim/vx/internal/src/ops/vsi_nn_op_relu6.c index 9020e7d..c9fd754 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_relu6.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_relu6.c @@ -31,7 +31,7 @@ #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" - +#include "kernel/vsi_nn_kernel.h" static vsi_status op_compute ( @@ -40,22 +40,16 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status; - status = VSI_FAILURE; + vsi_status status = VSI_SUCCESS; + vsi_nn_kernel_node_t n = NULL; - self->n = vxActivationLayer( - self->graph->g, - inputs[0]->t, - VX_CONVOLUTIONAL_NETWORK_ACTIVATION_RELU6, - 0, - 0, - outputs[0]->t - ); - - if( NULL != self->n ) + n = vsi_nn_kernel_selector( self->graph, "relu6", inputs, 1, outputs, 1, NULL ); + if ( n == NULL ) { - status = VSI_SUCCESS; + status = VSI_FAILURE; } + self->n = (vx_node)n; + return status; } /* op_compute() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_relun.c b/src/tim/vx/internal/src/ops/vsi_nn_op_relun.c index 1cbf229..ea54ce4 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_relun.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_relun.c @@ -32,7 +32,7 @@ #include "vsi_nn_log.h" #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" - +#include "kernel/vsi_nn_kernel.h" static vsi_status op_compute ( @@ -41,18 +41,18 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status = VSI_FAILURE; + vsi_status status = VSI_SUCCESS; float top = self->nn_param.relun.relu_clamp_top; float bottom = self->nn_param.relun.relu_clamp_bottom; - vsi_enum func = -1; + vsi_nn_kernel_node_t n = NULL; - if(top == 1 && bottom == -1) + if (top == 1 && bottom == -1) { - func = VX_CONVOLUTIONAL_NETWORK_ACTIVATION_RELU1; + n = vsi_nn_kernel_selector( self->graph, "relu1", inputs, 1, outputs, 1, NULL ); } - else if(top == 6) + else if (top == 6) { - func = 
VX_CONVOLUTIONAL_NETWORK_ACTIVATION_RELU6; + n = vsi_nn_kernel_selector( self->graph, "relu6", inputs, 1, outputs, 1, NULL ); } else { @@ -60,19 +60,13 @@ static vsi_status op_compute return VSI_FAILURE; } - self->n = vxActivationLayer( - self->graph->g, - inputs[0]->t, - func, - 0, - 0, - outputs[0]->t - ); - - if( NULL != self->n ) + if ( n == NULL ) { - status = VSI_SUCCESS; + status = VSI_FAILURE; } + + self->n = (vx_node)n; + return status; } /* op_compute() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_repeat.c b/src/tim/vx/internal/src/ops/vsi_nn_op_repeat.c index 295b6ee..2a77c5c 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_repeat.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_repeat.c @@ -34,7 +34,7 @@ #include "vsi_nn_tensor_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" -#include "libnnext/vsi_nn_vxkernel.h" +#include "vsi_nn_error.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" @@ -53,6 +53,7 @@ static vsi_status _create_local_tensor vsi_nn_repeat_lcl_data *local = self->nn_param.repeat.local; vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1}; uint32_t i = 0; + vsi_status status = VSI_FAILURE; if (axis == -1) { @@ -63,6 +64,7 @@ static vsi_status _create_local_tensor } local->reshaped_input = vsi_nn_reshape_tensor(self->graph, inputs[0], shape, 1); + CHECK_PTR_FAIL_GOTO( local->reshaped_input, "create tensor fail.", final ); shape[0] = 1; for(i = 0; i < outputs[0]->attr.dim_num; i++) @@ -70,6 +72,7 @@ static vsi_status _create_local_tensor shape[0] *= outputs[0]->attr.size[i]; } local->reshaped_output = vsi_nn_reshape_tensor(self->graph, outputs[0], shape, 1); + CHECK_PTR_FAIL_GOTO( local->reshaped_output, "create tensor fail.", final ); } if (repeat_host) @@ -103,9 +106,12 @@ static vsi_status _create_local_tensor attr.dim_num = 2; local->repeat_tensor = vsi_nn_CreateTensorFromData(self->graph, (uint8_t*)repeat_host, &attr); + CHECK_PTR_FAIL_GOTO( local->repeat_tensor, "create tensor fail.", final ); } - return VSI_SUCCESS; + status = VSI_SUCCESS; +final: + return status; } static vsi_status op_compute diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c index 6ea0fc0..e1cfdaa 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c @@ -36,6 +36,7 @@ #include "utils/vsi_nn_dtype_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" +#include "vsi_nn_error.h" VSI_NN_SUPPRESS_DEPRECATED_BEGIN @@ -79,6 +80,7 @@ static vsi_status op_compute vsi_nn_tensor_t *tmp_tensor = NULL; tmp_tensor = vsi_nn_reshape_tensor( self->graph, outputs[0], inputs[0]->attr.size, inputs[0]->attr.dim_num ); + CHECK_PTR_FAIL_GOTO( tmp_tensor, "create tensor fail.", final ); self->n = vxTensorCopyNode(self->graph->g, inputs[0]->t, tmp_tensor->t); @@ -88,7 +90,7 @@ static vsi_status op_compute status = VSI_FAILURE; } VSILOGD("Create a copy node for reshape"); - +final: vsi_safe_release_tensor(tmp_tensor); #endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c index fd544a8..002b39b 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c @@ -135,6 +135,16 @@ static vsi_status op_optimize } else { + int32_t half_pixel_centers = self->nn_param.resize.half_pixel_centers; + vsi_size_t * input_size = inputs[0]->attr.size; + vsi_size_t * output_size = outputs[0]->attr.size; + + if ( (output_size[0] % input_size[0] == 0) && 
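/* integer scale factor in both dimensions: for nearest-neighbour resize the half-pixel-centers flag is cleared below */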
(output_size[1] % input_size[1] == 0) && + half_pixel_centers == TRUE && self->nn_param.resize.type == VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR ) + { + self->nn_param.resize.half_pixel_centers = FALSE; + } + return VSI_SUCCESS; } } /* op_optimize() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_rnncell_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_rnncell_ovxlib.c index d5f3e54..282de4e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_rnncell_ovxlib.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_rnncell_ovxlib.c @@ -157,18 +157,13 @@ static vsi_bool op_setup vsi_bool use_virtual_tensor = TRUE; uint32_t kernel_h = 1; uint32_t kernel_w = 1; - vsi_bool ret = FALSE; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_node_wksp( self ); - p->local = (vsi_nn_rnncell_ovxlib_lcl_data_t*) - malloc(sizeof(vsi_nn_rnncell_ovxlib_lcl_data_t)); - CHECK_PTR_FAIL_GOTO( p->local, "Create buffer fail.", final ); - ret = TRUE; memset(p->local, 0x00, sizeof(vsi_nn_rnncell_ovxlib_lcl_data_t)); memset(&attr, 0x00, sizeof(attr)); - p->local->multi_batch = (vsi_bool)(inputs[RNNCELL_INPUT_INPUT]->attr.size[1]); + p->local->multi_batch = (inputs[RNNCELL_INPUT_INPUT]->attr.size[1]>1); if( inputs[RNNCELL_INPUT_INPUT]->attr.dtype.qnt_type != inputs[RNNCELL_INPUT_WEIGHT_I]->attr.dtype.qnt_type) @@ -199,9 +194,6 @@ static vsi_bool op_setup { is_input_fc_on_tp = TRUE; } - /* TODO: now, all fc on tp because can't fetch the HW feature */ - is_input_fc_on_tp = TRUE; - is_hstate_fc_on_tp = TRUE; setup_op_shapes(self, inputs, outputs); @@ -212,7 +204,7 @@ static vsi_bool op_setup input_gate_fc_outputs = vsi_nn_rnn_create_tp_fc(self, inputs[RNNCELL_INPUT_INPUT], inputs[RNNCELL_INPUT_WEIGHT_I], - inputs[RNNCELL_INPUT_BIAS], + inputs[RNNCELL_INPUT_BIAS_I], &p->internal_dtype[RNNCELL_QUANTIZE_PARAM_I], use_virtual_tensor); if (inputs[RNNCELL_INPUT_AUX_INPUT] != NULL) @@ -237,7 +229,7 @@ static vsi_bool op_setup tmp = vsi_nn_rnn_create_nn_fc(self, input_tensor->t, inputs[RNNCELL_INPUT_WEIGHT_I], - inputs[RNNCELL_INPUT_BIAS], + inputs[RNNCELL_INPUT_BIAS_I], kernel_h, kernel_w, &p->internal_dtype[RNNCELL_QUANTIZE_PARAM_I], use_virtual_tensor); @@ -273,7 +265,7 @@ static vsi_bool op_setup hstate_gate_fc_outputs = vsi_nn_rnn_create_tp_fc(self, inputs[RNNCELL_INPUT_H_STATE], inputs[RNNCELL_INPUT_WEIGHT_H], - NULL, + inputs[RNNCELL_INPUT_BIAS_H], &p->internal_dtype[RNNCELL_QUANTIZE_PARAM_H], use_virtual_tensor); } @@ -289,7 +281,7 @@ static vsi_bool op_setup tmp = vsi_nn_rnn_create_nn_fc(self, hstate_input_tensor->t, inputs[RNNCELL_INPUT_WEIGHT_H], - NULL, + inputs[RNNCELL_INPUT_BIAS_H], kernel_h, kernel_w, &p->internal_dtype[RNNCELL_QUANTIZE_PARAM_H], use_virtual_tensor); @@ -331,8 +323,7 @@ static vsi_bool op_setup vsi_nn_internal_setup_node(self, curr); } -final: - return ret; + return TRUE; } /* op_setup() */ static vsi_status op_deinit diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_roi_pool.c b/src/tim/vx/internal/src/ops/vsi_nn_op_roi_pool.c index 8121363..12668f0 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_roi_pool.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_roi_pool.c @@ -101,6 +101,7 @@ static vsi_bool op_check IO_TYPE(D_F32, D_F32, D_F16) IO_TYPE(D_F32, D_F32, D_F32) IO_TYPE(D_BF16, D_BF16, D_F32) + IO_TYPE(D_BF16, D_BF16, D_BF16) IO_TYPE(D_F32, D_F32, D_BF16) /* HW 9.0 */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_rsqrt.c b/src/tim/vx/internal/src/ops/vsi_nn_op_rsqrt.c index b4c8666..e6ba3bf 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_rsqrt.c +++ 
b/src/tim/vx/internal/src/ops/vsi_nn_op_rsqrt.c @@ -34,6 +34,7 @@ #include "vsi_nn_log.h" #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_constraint_check.h" +#include "kernel/vsi_nn_kernel.h" static vsi_status op_compute ( @@ -42,22 +43,16 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status; - status = VSI_FAILURE; + vsi_status status = VSI_SUCCESS; + vsi_nn_kernel_node_t n = NULL; - self->n = vxActivationLayer( - self->graph->g, - inputs[0]->t, - VX_CONVOLUTIONAL_NETWORK_ACTIVATION_RSQRT, - 0, - 0, - outputs[0]->t - ); - - if( NULL != self->n ) + n = vsi_nn_kernel_selector( self->graph, "rsqrt", inputs, 1, outputs, 1, NULL ); + if ( n == NULL ) { - status = VSI_SUCCESS; + status = VSI_FAILURE; } + self->n = (vx_node)n; + return status; } /* op_compute() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_elements.c b/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_elements.c new file mode 100644 index 0000000..99f8e40 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_elements.c @@ -0,0 +1,171 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_math.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_constraint_check.h" + +typedef struct _scatter_elements_local_data_t { + int32_t placeholder; +} scatter_elements_local_data_t; + +/* + Declare number of input and output. 
+ */ +#define _INPUT_NUM (3) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; + vsi_nn_scatter_elements_param * p = NULL; + + if ( NULL == self ) + { + return VSI_FAILURE; + } + status = VSI_FAILURE; + + p = &(self->nn_param.scatter_elements); + + // Add params + param = vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_int32( param, "axis", p->axis ); + vsi_nn_kernel_param_add_int32( param, "reduction", p->reduction ); + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + "scatter_elements", + inputs, 3, + outputs, 1, param ); + + vsi_nn_kernel_param_release( ¶m ); + + if ( self->n ) + { + status = VSI_SUCCESS; + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(SCATTER_ELEMENTS, 3, 1) + IO_TYPE(D_I32, D_I32, D_I32, D_I32) + IO_TYPE(D_F32, D_I32, D_F32, D_F32) + IO_TYPE(D_F16, D_I32, D_F16, D_F16) + IO_TYPE(D_BF16, D_I32, D_BF16, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_I32, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I32, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_I32, D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I32, D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I16|Q_DFP, D_I32, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_I32, D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I32, D_I16|Q_DFP, D_I16|Q_SYM) + END_IO_TYPE_DECL(SCATTER_ELEMENTS) + if (!VALIDATE_OP_IO_TYPES(SCATTER_ELEMENTS, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t i = 0; + uint32_t indices_dims = inputs[1]->attr.dim_num; + + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + for (i = 0; i < inputs[0]->attr.dim_num; i++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; + } + } + + for (i = 0; i < indices_dims; i++) + { + if (inputs[1]->attr.size[i] != inputs[2]->attr.size[i]) + { + VSILOGE("Indices vs updates dimensions differs at position=%d, %d vs %d", i, + inputs[1]->attr.size[i], inputs[2]->attr.size[i]); + return FALSE; + } + } + + return TRUE; +} /* op_setup() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ SCATTER_ELEMENTS, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_select.c b/src/tim/vx/internal/src/ops/vsi_nn_op_select.c index c95d75e..94e0110 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_select.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_select.c @@ -142,6 +142,29 @@ static vsi_bool op_check IO_TYPE(D_I8, D_F16, D_F16, D_F16) IO_TYPE(D_I8, D_I32, D_I32, D_I32) IO_TYPE(D_I8, D_F32, D_F32, D_F32) + IO_TYPE(D_I8, D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8, D_F16, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8, D_F16, D_I8|Q_ASYM, D_I8|Q_ASYM) + 
IO_TYPE(D_I8, D_F16, D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I8, D_F16, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8, D_F16, D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I8, D_F16, D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_I8, D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_I8, D_I8|Q_DFP, D_F16, D_I8|Q_DFP) + IO_TYPE(D_I8, D_I8|Q_ASYM, D_F16, D_I8|Q_ASYM) + IO_TYPE(D_I8, D_I8|Q_SYM, D_F16, D_I8|Q_SYM) + IO_TYPE(D_I8, D_I16|Q_DFP, D_F16, D_I16|Q_DFP) + IO_TYPE(D_I8, D_I16|Q_ASYM, D_F16, D_I16|Q_ASYM) + IO_TYPE(D_I8, D_I16|Q_SYM, D_F16, D_I16|Q_SYM) + IO_TYPE(D_I8, D_I8|Q_DFP, D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8, D_I8|Q_ASYM, D_I8|Q_ASYM, D_F16) + IO_TYPE(D_I8, D_I8|Q_SYM, D_I8|Q_SYM, D_F16) + IO_TYPE(D_I8, D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_I8, D_I16|Q_DFP, D_I16|Q_DFP, D_F16) + IO_TYPE(D_I8, D_I16|Q_ASYM, D_I16|Q_ASYM, D_F16) + IO_TYPE(D_I8, D_I16|Q_SYM, D_I16|Q_SYM, D_F16) + IO_TYPE(D_I8, D_BF16, D_BF16, D_BF16) + IO_TYPE(D_BOOL8, D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) IO_TYPE(D_BOOL8, D_I8|Q_ASYM, D_I8|Q_ASYM, D_I8|Q_ASYM) IO_TYPE(D_BOOL8, D_I8|Q_SYM, D_I8|Q_SYM, D_I8|Q_SYM) @@ -155,7 +178,7 @@ static vsi_bool op_check IO_TYPE(D_BOOL8, D_F16, D_I8|Q_SYM, D_F16) IO_TYPE(D_BOOL8, D_F16, D_I16|Q_DFP, D_F16) IO_TYPE(D_BOOL8, D_F16, D_I16|Q_ASYM, D_F16) - IO_TYPE(D_BOOL8, D_I16|Q_SYM, D_F16, D_F16) + IO_TYPE(D_BOOL8, D_F16, D_I16|Q_SYM, D_F16) IO_TYPE(D_BOOL8, D_U8|Q_ASYM, D_F16, D_F16) IO_TYPE(D_BOOL8, D_I8|Q_DFP, D_F16, D_F16) IO_TYPE(D_BOOL8, D_I8|Q_ASYM, D_F16, D_F16) @@ -170,6 +193,28 @@ static vsi_bool op_check IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_BOOL8, D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_BOOL8, D_F16, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_BOOL8, D_F16, D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_BOOL8, D_F16, D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_BOOL8, D_F16, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_BOOL8, D_F16, D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_BOOL8, D_F16, D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_BOOL8, D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_BOOL8, D_I8|Q_DFP, D_F16, D_I8|Q_DFP) + IO_TYPE(D_BOOL8, D_I8|Q_ASYM, D_F16, D_I8|Q_ASYM) + IO_TYPE(D_BOOL8, D_I8|Q_SYM, D_F16, D_I8|Q_SYM) + IO_TYPE(D_BOOL8, D_I16|Q_DFP, D_F16, D_I16|Q_DFP) + IO_TYPE(D_BOOL8, D_I16|Q_ASYM, D_F16, D_I16|Q_ASYM) + IO_TYPE(D_BOOL8, D_I16|Q_SYM, D_F16, D_I16|Q_SYM) + IO_TYPE(D_BOOL8, D_I8|Q_DFP, D_I8|Q_DFP, D_F16) + IO_TYPE(D_BOOL8, D_I8|Q_ASYM, D_I8|Q_ASYM, D_F16) + IO_TYPE(D_BOOL8, D_I8|Q_SYM, D_I8|Q_SYM, D_F16) + IO_TYPE(D_BOOL8, D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_BOOL8, D_I16|Q_DFP, D_I16|Q_DFP, D_F16) + IO_TYPE(D_BOOL8, D_I16|Q_ASYM, D_I16|Q_ASYM, D_F16) + IO_TYPE(D_BOOL8, D_I16|Q_SYM, D_I16|Q_SYM, D_F16) + IO_TYPE(D_BOOL8, D_BF16, D_BF16, D_BF16) END_IO_TYPE_DECL(SELECT) if (!VALIDATE_OP_IO_TYPES(SELECT, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_softrelu.c b/src/tim/vx/internal/src/ops/vsi_nn_op_softrelu.c index bf7566c..d9c0246 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_softrelu.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_softrelu.c @@ -32,7 +32,7 @@ #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" - +#include "kernel/vsi_nn_kernel.h" static vsi_status op_compute ( @@ -41,22 +41,16 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status; - status = VSI_FAILURE; + 
vsi_status status = VSI_SUCCESS; + vsi_nn_kernel_node_t n = NULL; - self->n = vxActivationLayer( - self->graph->g, - inputs[0]->t, - VX_CONVOLUTIONAL_NETWORK_ACTIVATION_SOFTRELU, - 0, - 0, - outputs[0]->t - ); - - if( NULL != self->n ) + n = vsi_nn_kernel_selector( self->graph, "softrelu", inputs, 1, outputs, 1, NULL ); + if ( n == NULL ) { - status = VSI_SUCCESS; + status = VSI_FAILURE; } + self->n = (vx_node)n; + return status; } /* op_compute() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_space2batch.c b/src/tim/vx/internal/src/ops/vsi_nn_op_space2batch.c index 599c78a..a15c4dc 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_space2batch.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_space2batch.c @@ -34,6 +34,7 @@ #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_dtype_util.h" #include "utils/vsi_nn_math.h" +#include "vsi_nn_error.h" #include "utils/vsi_nn_constraint_check.h" static vsi_status op_compute @@ -47,7 +48,37 @@ static vsi_status op_compute vx_nn_reorg_params_ext_t param; vsi_nn_tensor_t *block_size_tensor = NULL; vsi_nn_tensor_t *pad_tensor = NULL; + vsi_nn_tensor_t *input_tensor = NULL; + vsi_nn_tensor_t *output_tensor = NULL; vsi_nn_tensor_attr_t attr; + vsi_bool need_release_tensor = TRUE; + int32_t block_size[2] = {1, 1}; + + block_size[0] = self->nn_param.space2batch.block_size[0]; + if (vsi_nn_is_3d_tensor(inputs[0])) + { + vsi_size_t shape[2][VSI_NN_MAX_DIM_NUM] = {{1}}; + memcpy(shape[0], inputs[0]->attr.size, sizeof(shape[0])); + memcpy(shape[1], outputs[0]->attr.size, sizeof(shape[1])); + shape[0][3] = shape[0][2]; + shape[0][2] = shape[0][1]; + shape[0][1] = 1; + shape[1][3] = shape[1][2]; + shape[1][2] = shape[1][1]; + shape[1][1] = 1; + + input_tensor = vsi_nn_reshape_tensor(self->graph, inputs[0], shape[0], 4); + CHECK_PTR_FAIL_GOTO( input_tensor, "craete tensor fail.", final ); + output_tensor = vsi_nn_reshape_tensor(self->graph, outputs[0], shape[1], 4); + CHECK_PTR_FAIL_GOTO( output_tensor, "craete tensor fail.", final ); + } + else + { + block_size[1] = self->nn_param.space2batch.block_size[1]; + need_release_tensor = FALSE; + input_tensor = inputs[0]; + output_tensor = outputs[0]; + } memset(¶m, 0, sizeof(vx_nn_reorg_params_t)); memset(&attr, 0, sizeof(attr)); @@ -58,13 +89,9 @@ static vsi_status op_compute attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; block_size_tensor = vsi_nn_CreateTensorFromData( self->graph, - (uint8_t *)self->nn_param.space2batch.block_size, + (uint8_t *)block_size, &attr); - if( NULL == block_size_tensor ) - { - VSILOGE("Create block_size_tensor fail.(space2batch)"); - return VSI_FAILURE; - } + CHECK_PTR_FAIL_GOTO( block_size_tensor, "craete tensor fail.", final ); memset(&attr, 0, sizeof(attr)); attr.size[0] = 4; @@ -76,31 +103,32 @@ static vsi_status op_compute self->graph, (uint8_t *)self->nn_param.space2batch.pad, &attr); - if( NULL == pad_tensor ) - { - VSILOGE("Create pad_tensor fail.(space2batch)"); - vsi_nn_ReleaseTensor(&block_size_tensor); - block_size_tensor = NULL; - return VSI_FAILURE; - } + CHECK_PTR_FAIL_GOTO( pad_tensor, "craete tensor fail.", final ); - self->nn_param.space2batch.local.block_size_tensor = block_size_tensor; - self->nn_param.space2batch.local.pad_tensor = pad_tensor; param.base.block_size = REQUIRED_IO(block_size_tensor); param.pad = OPTIONAL_IO(pad_tensor); param.base.type = VX_REORG_SPACE_TO_BATCH_ND; self->n = vxReorgLayer2( self->graph->g, - inputs[0]->t, + input_tensor->t, (vx_nn_reorg_params_t *)¶m, sizeof(vx_nn_reorg_params_ext_t), - outputs[0]->t); + output_tensor->t); - if( NULL 
!= self->n ) + if ( NULL != self->n ) { status = VSI_SUCCESS; } +final: + if (need_release_tensor) + { + vsi_safe_release_tensor(input_tensor); + vsi_safe_release_tensor(output_tensor); + } + vsi_safe_release_tensor(block_size_tensor); + vsi_safe_release_tensor(pad_tensor); + return status; } /* op_compute() */ @@ -113,14 +141,13 @@ static vsi_bool op_check { vsi_bool ret = FALSE; - if (inputs[0]->attr.dim_num != 4) + if (inputs[0]->attr.dim_num < 3) { - VSILOGE("The input tensor shape must be 4-D!(space2batch)"); + VSILOGE("The input tensor shape must be 3D or 4D!(space2batch)"); return FALSE; } - if(self->nn_param.space2batch.block_size[0] < 0 - || self->nn_param.space2batch.block_size[1] < 0 + if (self->nn_param.space2batch.block_size[0] < 0 || self->nn_param.space2batch.pad[0] < 0 || self->nn_param.space2batch.pad[1] < 0 || self->nn_param.space2batch.pad[2] < 0 @@ -145,38 +172,45 @@ static vsi_bool op_setup vsi_nn_space2batch_param * p; p = (vsi_nn_space2batch_param *)&(self->nn_param.space2batch); - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { - outputs[0]->attr.size[3] = - inputs[0]->attr.size[3] * p->block_size[0] * p->block_size[1]; - outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; - outputs[0]->attr.size[1] = - (p->pad[2] + p->pad[3] + inputs[0]->attr.size[1]) / p->block_size[1]; - outputs[0]->attr.size[0] = - (p->pad[0] + p->pad[1] + inputs[0]->attr.size[0]) / p->block_size[0]; - outputs[0]->attr.dim_num = 4; + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + + if (vsi_nn_is_3d_tensor(inputs[0])) + { + outputs[0]->attr.size[2] = + inputs[0]->attr.size[2] * p->block_size[0]; + outputs[0]->attr.size[1] = inputs[0]->attr.size[1]; + outputs[0]->attr.size[0] = + (p->pad[0] + p->pad[1] + inputs[0]->attr.size[0]) / p->block_size[0]; + } + else + { + outputs[0]->attr.size[3] = + inputs[0]->attr.size[3] * p->block_size[0] * p->block_size[1]; + outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; + outputs[0]->attr.size[1] = + (p->pad[2] + p->pad[3] + inputs[0]->attr.size[1]) / p->block_size[1]; + outputs[0]->attr.size[0] = + (p->pad[0] + p->pad[1] + inputs[0]->attr.size[0]) / p->block_size[0]; + } } return TRUE; } /* op_setup() */ -static vsi_status op_deinit +static vsi_status op_init ( vsi_nn_node_t * self ) { - if (self->nn_param.space2batch.local.block_size_tensor != NULL) - { - vsi_nn_ReleaseTensor(&(self->nn_param.space2batch.local.block_size_tensor)); - } - if (self->nn_param.space2batch.local.pad_tensor != NULL) - { - vsi_nn_ReleaseTensor(&(self->nn_param.space2batch.local.pad_tensor)); - } - vsi_nn_op_common_deinit(self); + vsi_status status = VSI_SUCCESS; + vsi_nn_space2batch_param *p = &self->nn_param.space2batch; - return VSI_SUCCESS; -} /* op_deinit() */ + memset(p->pad, 0, sizeof(p->pad)); + + return status; +} /* op_init() */ #ifdef __cplusplus extern "C" { @@ -185,9 +219,9 @@ extern "C" { DEF_OP_REG ( /* op_name */ SPACE2BATCH, - /* init */ NULL, + /* init */ op_init, /* compute */ op_compute, - /* deinit */ op_deinit, + /* deinit */ vsi_nn_op_common_deinit, /* check */ op_check, /* setup */ op_setup, /* optimize */ NULL, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_sqrt.c b/src/tim/vx/internal/src/ops/vsi_nn_op_sqrt.c index e711b48..5ae3844 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_sqrt.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_sqrt.c @@ -32,7 +32,7 @@ #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" - +#include "kernel/vsi_nn_kernel.h" static 
vsi_status op_compute ( @@ -41,22 +41,16 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status; - status = VSI_FAILURE; + vsi_status status = VSI_SUCCESS; + vsi_nn_kernel_node_t n = NULL; - self->n = vxActivationLayer( - self->graph->g, - inputs[0]->t, - VX_CONVOLUTIONAL_NETWORK_ACTIVATION_SQRT, - 0, - 0, - outputs[0]->t - ); - - if( NULL != self->n ) + n = vsi_nn_kernel_selector( self->graph, "sqrt", inputs, 1, outputs, 1, NULL ); + if ( n == NULL ) { - status = VSI_SUCCESS; + status = VSI_FAILURE; } + self->n = (vx_node)n; + return status; } /* op_compute() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_squeeze.c b/src/tim/vx/internal/src/ops/vsi_nn_op_squeeze.c index 5fe93f7..3609aad 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_squeeze.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_squeeze.c @@ -88,7 +88,7 @@ static vsi_bool op_check } } - ret = vsi_nn_OpCheck(VSI_NN_OP_RSQRT, self, inputs, outputs); + ret = vsi_nn_OpCheck(VSI_NN_OP_DATACONVERT, self, inputs, outputs); return ret; } /* op_check() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_unidirectional_sequence_rnn.c b/src/tim/vx/internal/src/ops/vsi_nn_op_unidirectional_sequence_rnn.c index fad9cbc..bf12b96 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_unidirectional_sequence_rnn.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_unidirectional_sequence_rnn.c @@ -53,6 +53,7 @@ static vsi_bool setup_op_shapes vsi_size_t num_units = 0; vsi_size_t output_size = 0; vsi_size_t batch_size = 0; + vsi_bool use_virtual_tensor = TRUE; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); @@ -82,6 +83,17 @@ static vsi_bool setup_op_shapes inputs[RNN_INPUT_H_STATE] = output_tensor->t; } + if( !outputs[RNN_OUTPUT_H_STATE] ) + { + memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); + attr.dim_num = VSI_NN_DIM_AUTO; + memcpy( &attr.dtype, &outputs[RNN_OUTPUT_OUTPUT]->attr.dtype, sizeof( attr.dtype ) ); + attr.vtl = use_virtual_tensor; + attr.is_const = FALSE; + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + outputs[RNN_OUTPUT_H_STATE] = output_tensor->t; + } + /* output */ if( VSI_NN_DIM_AUTO == outputs[RNN_OUTPUT_OUTPUT]->attr.dim_num ) { @@ -91,6 +103,14 @@ static vsi_bool setup_op_shapes outputs[RNN_OUTPUT_OUTPUT]->attr.dim_num = 3; } + /* output_state_out */ + if( VSI_NN_DIM_AUTO == outputs[RNN_OUTPUT_H_STATE]->attr.dim_num ) + { + outputs[RNN_OUTPUT_H_STATE]->attr.size[0] = output_size; + outputs[RNN_OUTPUT_H_STATE]->attr.size[1] = batch_size; + outputs[RNN_OUTPUT_H_STATE]->attr.dim_num = 2; + } + return TRUE; } @@ -207,7 +227,7 @@ static vsi_bool op_setup /* rnncell output h_state */ vsi_nn_internal_init_tensor_attr(&attr, - &outputs[RNNCELL_OUTPUT_H_STATE]->attr.dtype, use_virtual_tensor); + &outputs[RNN_OUTPUT_H_STATE]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); rnncell_out1 = output_tensor->t; @@ -221,8 +241,8 @@ static vsi_bool op_setup curr->inputs[RNNCELL_INPUT_WEIGHT_I] = inputs[RNN_INPUT_WEIGHT_I]; curr->inputs[RNNCELL_INPUT_WEIGHT_H] = inputs[RNN_INPUT_WEIGHT_H]; - curr->inputs[RNNCELL_INPUT_BIAS] = inputs[RNN_INPUT_BIAS]; - + curr->inputs[RNNCELL_INPUT_BIAS_I] = inputs[RNN_INPUT_BIAS_I]; + curr->inputs[RNNCELL_INPUT_BIAS_H] = inputs[RNN_INPUT_BIAS_H]; curr->outputs[RNNCELL_OUTPUT_OUTPUT] = rnncell_out0; curr->outputs[RNNCELL_OUTPUT_H_STATE] = rnncell_out1; @@ -246,6 +266,14 @@ static vsi_bool op_setup tensor = output_tensor->t; } + if (outputs[RNN_OUTPUT_H_STATE] != NULL) + { + curr = 
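/*
 * Sketch of the kernel-selector dispatch that now backs the softrelu and
 * sqrt op_compute paths above, written against the signatures visible in
 * this patch (vsi_nn_kernel_selector returning a vsi_nn_kernel_node_t
 * that is stored back into self->n). The kernel_name parameter and the
 * helper name are illustrative, not part of the patch.
 */
#include "vsi_nn_node.h"
#include "vsi_nn_tensor.h"
#include "kernel/vsi_nn_kernel.h"

static vsi_status unary_compute_via_selector
    (
    vsi_nn_node_t   * self,
    vsi_nn_tensor_t ** inputs,
    vsi_nn_tensor_t ** outputs,
    const char      * kernel_name   /* e.g. "sqrt" or "softrelu" */
    )
{
    /* The kernel framework picks the best available backend for the op. */
    vsi_nn_kernel_node_t n = vsi_nn_kernel_selector( self->graph, kernel_name,
                                                     inputs, 1, outputs, 1, NULL );
    if ( n == NULL )
    {
        return VSI_FAILURE;
    }
    self->n = (vx_node)n;
    return VSI_SUCCESS;
}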
vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + curr->inputs[0] = last_step_h_state; + curr->outputs[0] = outputs[RNN_OUTPUT_H_STATE]; + vsi_nn_internal_setup_node(self, curr); + } + /* concat rnncell output, the rnn's output is 3-dims */ curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, (uint32_t)time_step, 1 ); curr->node->nn_param.concat.axis = 2; diff --git a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c index b782511..46a5409 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c @@ -451,6 +451,10 @@ static _op_param_gen_t s_op_gen[] = /* CUMSUM */ NULL, /* MAXPOOLWITHARGMAX */ NULL, /* MOD */ NULL, + /* LPPOOL */ NULL, + /* SCATTER_ELEMENTS */ NULL, + /* PRE_PROCESS_YUV422 */ NULL, + /* BUCKETIZE */ NULL, }; _compiler_assert( _cnt_of_array(s_op_gen) == VSI_NN_OP_NUM, vsi_nn_code_generator_c ); diff --git a/src/tim/vx/internal/src/utils/vsi_nn_constraint_check.c b/src/tim/vx/internal/src/utils/vsi_nn_constraint_check.c index f170bcf..95f5cc7 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_constraint_check.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_constraint_check.c @@ -45,6 +45,8 @@ static const char* _get_dtype_name(vsi_nn_type_e type) switch(type) { case D_NONE: return "Optional"; + case D_I4: return "INT4"; + case D_U4: return "UINT4"; case D_I8: return "INT8"; case D_I16: return "INT16"; case D_I32: return "INT32"; @@ -73,6 +75,7 @@ static const char* _get_qtype_name(vsi_nn_qnt_type_e type) case VSI_NN_QNT_TYPE_NONE: return ""; case VSI_NN_QNT_TYPE_DFP: return "DFP"; case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: return "ASYM"; + case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: return "SYM"; case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC: return "SYMM PC"; default: VSILOGE("Unknown quant type: %d\n", type); @@ -234,14 +237,14 @@ char* generate_op_io_types_desc memset(desc, 0x00, sizeof(char) * total_sz); for(i = 0; i < inputs_num; i++) { - if(inputs[i]) { + if(inputs[i] && total_sz >= used_sz) { used_sz += snprintf(desc + used_sz, total_sz - used_sz, "%s %s, ", _get_qtype_name(inputs[i]->attr.dtype.qnt_type), _get_dtype_name(inputs[i]->attr.dtype.vx_type)); } } for(i = 0; i < outputs_num; i++) { - if(outputs[i]) { + if(outputs[i] && total_sz >= used_sz) { used_sz += snprintf(desc + used_sz, total_sz - used_sz, "%s %s, ", _get_qtype_name(outputs[i]->attr.dtype.qnt_type), _get_dtype_name(outputs[i]->attr.dtype.vx_type)); diff --git a/src/tim/vx/internal/src/utils/vsi_nn_dtype.c b/src/tim/vx/internal/src/utils/vsi_nn_dtype.c index 2e6b26e..6f69616 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_dtype.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_dtype.c @@ -338,20 +338,21 @@ vsi_bool vsi_nn_dtype_convert_float_to_quantize_symm_perchannel void * out_buffer ) { + vsi_bool status; switch( dtype ) { case I8: - vsi_nn_dtype_convert_float_to_quantize_symm8_perchannel( - buffer, size, shape, rank, - scale, scale_size, zero_point, zero_point_size, - channel_dim, (int8_t*)out_buffer ); + status = vsi_nn_dtype_convert_float_to_quantize_symm8_perchannel( + buffer, size, shape, rank, + scale, scale_size, zero_point, zero_point_size, + channel_dim, (int8_t*)out_buffer ); break; default: VSILOGE("Don't support convert float to symm perchannel quant %d.", dtype); - return FALSE; + status = FALSE; } - return TRUE; + return status; } /* vsi_nn_dtype_convert_float_to_quantize_symm_perchannel() */ vsi_bool vsi_nn_dtype_convert_dtype_to_float @@ -496,17 
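/*
 * Standalone sketch of the bounded-append guard added above in
 * generate_op_io_types_desc() (and in the tensor text dumpers later in
 * this patch): each snprintf() advance is re-checked against the
 * remaining space, so an overlong type list truncates instead of writing
 * past the end of the buffer. Names here are illustrative.
 */
#include <stdio.h>

static size_t append_item(char *buf, size_t buf_sz, size_t used, const char *item)
{
    int written;

    if (used >= buf_sz)  /* no room left: keep the running count, skip the write */
    {
        return used;
    }
    /* snprintf never stores more than buf_sz - used bytes (including the
     * terminating NUL), but it returns the length it wanted to write, so
     * 'used' can exceed buf_sz and must be re-checked before the next call. */
    written = snprintf(buf + used, buf_sz - used, "%s, ", item);
    if (written > 0)
    {
        used += (size_t)written;
    }
    return used;
}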
+497,18 @@ vsi_bool vsi_nn_dtype_convert_quantize_symm_perchannel_to_float float * out_buffer ) { + vsi_bool status; switch( dtype ) { case I8: - vsi_nn_dtype_convert_quantize_symm8_perchannel_to_float( - (const int8_t*)buffer, size, shape, rank, - scale, scale_size, zero_point, zero_point_size, - channel_dim, out_buffer ); + status = vsi_nn_dtype_convert_quantize_symm8_perchannel_to_float( + (const int8_t*)buffer, size, shape, rank, + scale, scale_size, zero_point, zero_point_size, + channel_dim, out_buffer ); break; default: VSILOGE("Don't support convert symm perchannel quant %d to float.", dtype); - return FALSE; + status = FALSE; } - return TRUE; + return status; } /* vsi_nn_dtype_convert_quantize_symm_perchannel_to_float() */ diff --git a/src/tim/vx/internal/src/utils/vsi_nn_util.c b/src/tim/vx/internal/src/utils/vsi_nn_util.c index 25ffab7..21c8498 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_util.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_util.c @@ -201,6 +201,33 @@ static vsi_size_t _compute_padding return vsi_nn_max(padding, 0); } /* _compute_padding() */ +int32_t vsi_nn_get_vx_pad_mode + ( + vsi_nn_pad_mode_e mode + ) +{ + int32_t pad_mode = 0; + switch (mode) { + case VSI_NN_PAD_MODE_CONSTANT: + pad_mode = VX_PAD_CONSTANT; + break; + case VSI_NN_PAD_MODE_REPLICATE: + pad_mode = VX_PAD_REPLICATE; + break; + case VSI_NN_PAD_MODE_SYMMETRIC: + pad_mode = VX_PAD_MIRROR_SYMMETRIC; + break; + case VSI_NN_PAD_MODE_REFLECT: + pad_mode = VX_PAD_MIRROR_REFLECT; + break; + default: + VSILOGE("Wrong pad_mode value"); + break; + } + + return pad_mode; +} + uint8_t * vsi_nn_LoadBinaryData ( const char * filename, @@ -1486,3 +1513,18 @@ vsi_status vsi_nn_Unpack4bitData } return status; } /* vsi_nn_Unpack4bitData() */ + +vsi_bool vsi_nn_is_3d_tensor + ( + vsi_nn_tensor_t * tensor + ) +{ + if (3 == tensor->attr.dim_num) + { + return TRUE; + } + else + { + return FALSE; + } +} diff --git a/src/tim/vx/internal/src/vsi_nn_context.c b/src/tim/vx/internal/src/vsi_nn_context.c index cbddf2d..a8a99d4 100644 --- a/src/tim/vx/internal/src/vsi_nn_context.c +++ b/src/tim/vx/internal/src/vsi_nn_context.c @@ -63,7 +63,11 @@ static vsi_status query_hardware_caps context->config.support_stream_processor = paramExt.supportStreamProcessor; context->config.sp_exec_count = paramExt2.streamProcessorExecCount; context->config.sp_vector_depth = paramExt2.streamProcessorVectorSize; - context->config.sp_per_core_vector_depth = context->config.sp_vector_depth / context->config.sp_exec_count; + if (context->config.sp_exec_count > 0) + { + context->config.sp_per_core_vector_depth = + context->config.sp_vector_depth / context->config.sp_exec_count; + } #endif #endif @@ -130,6 +134,13 @@ static vsi_status vsi_nn_initOptions options->enable_asymi8_to_u8 = atoi(env_s); } + env_s = NULL; + options->enable_dataconvert_optimize = 1; + if (vsi_nn_getEnv("VSI_NN_ENABLE_DATACONVERT_OPTIMIZE", &env_s) && env_s) + { + options->enable_dataconvert_optimize = atoi(env_s); + } + return VSI_SUCCESS; } diff --git a/src/tim/vx/internal/src/vsi_nn_graph.c b/src/tim/vx/internal/src/vsi_nn_graph.c index 535f595..cf44888 100644 --- a/src/tim/vx/internal/src/vsi_nn_graph.c +++ b/src/tim/vx/internal/src/vsi_nn_graph.c @@ -39,6 +39,7 @@ #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_vdata.h" #include "utils/vsi_nn_map.h" +#include "utils/vsi_nn_dtype_util.h" #include "vsi_nn_graph_optimization.h" #include "vsi_nn_error.h" @@ -2251,3 +2252,126 @@ vsi_bool vsi_nn_IsGraphFastMode { return NULL == graph ? 
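/*
 * Usage sketch for the two helpers introduced above, assuming the
 * signatures added by this patch: vsi_nn_get_vx_pad_mode() maps the
 * framework pad enum onto the OpenVX VX_PAD_* values, and
 * vsi_nn_is_3d_tensor() is a dim_num == 3 predicate. The wrapper names
 * are illustrative.
 */
#include <stdint.h>
#include "vsi_nn_tensor.h"
#include "utils/vsi_nn_util.h"

static int32_t pick_vx_pad_mode(vsi_nn_pad_mode_e mode)
{
    /* e.g. VSI_NN_PAD_MODE_SYMMETRIC -> VX_PAD_MIRROR_SYMMETRIC;
     * unknown values fall back to 0 after an error log. */
    return vsi_nn_get_vx_pad_mode(mode);
}

static vsi_bool needs_4d_lift(vsi_nn_tensor_t *t)
{
    /* space2batch uses this to decide whether to view [W,H,C] as [W,1,H,C]. */
    return vsi_nn_is_3d_tensor(t);
}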
FALSE : graph->isAllowFastMode; } + +vsi_status vsi_nn_CopyTensorViaGraphs + ( + vsi_nn_graph_t *src_graph, + vsi_nn_tensor_id_t src_tensor_id, + vsi_nn_graph_t *dst_graph, + vsi_nn_tensor_id_t dst_tensor_id + ) +{ + vsi_status status = VSI_FAILURE; + uint8_t *data = NULL; + vsi_nn_tensor_t *src_tensor = NULL; + vsi_nn_tensor_t *dst_tensor = NULL; + vsi_size_t i; + + src_tensor = vsi_nn_GetTensor(src_graph, src_tensor_id); + TEST_CHECK_PTR(src_tensor, final); + dst_tensor = vsi_nn_GetTensor(dst_graph, dst_tensor_id); + TEST_CHECK_PTR(dst_tensor, final); + + /* Check shape and dtype */ + if(src_tensor->attr.dim_num != dst_tensor->attr.dim_num) + { + VSILOGE("The dim_num of src_tensor and dst_tensor don't match."); + return status; + } + for(i=0; i<src_tensor->attr.dim_num; i++) + { + if(src_tensor->attr.size[i] != dst_tensor->attr.size[i]) + { + VSILOGE("The shape of src_tensor and dst_tensor don't match."); + return status; + } + } + if(vsi_nn_DtypeCompare(&src_tensor->attr.dtype, &dst_tensor->attr.dtype) == FALSE) + { + VSILOGE("The dtype of src_tensor and dst_tensor don't match."); + return status; + } + + data = vsi_nn_ConvertTensorToData(src_graph, src_tensor); + TEST_CHECK_PTR(data, final); + + status = vsi_nn_CopyDataToTensor(dst_graph, dst_tensor, data); + TEST_CHECK_STATUS(status, final); + +final: + vsi_nn_safe_free(data); + return status; +} /* vsi_nn_CopyTensorViaGraphs() */ + +vsi_status vsi_nn_ExecuteGraphLoop + ( + vsi_nn_graph_t *graph, + vsi_nn_tensor_t *max_iteration_tensor + ) +{ + int32_t i,j,loop_var_num,max_iteration; + vsi_status status = VSI_FAILURE; + vsi_nn_tensor_t *iteration_index = NULL; + vsi_nn_tensor_t *iteration_cond_out = NULL; + uint8_t *data = NULL; + int8_t cond = 0; + vsi_size_t sz = 0; + + sz = vsi_nn_ShapeProduct(max_iteration_tensor->attr.size, max_iteration_tensor->attr.dim_num); + if(1 != sz) // its shape should be 1. + { + VSILOGE("Invalid max_iteration_tensor."); + return status; + } + + loop_var_num = graph->input.num - 2; + iteration_index = vsi_nn_GetTensor(graph, graph->input.tensors[0]); + iteration_cond_out = vsi_nn_GetTensor(graph, graph->output.tensors[0]); + + data = vsi_nn_ConvertTensorToData(NULL, max_iteration_tensor); + TEST_CHECK_PTR(data, final); + max_iteration = ((int32_t *)data)[0]; + vsi_nn_safe_free(data); + + for(i=0; ioutput.tensors[0], + graph, graph->input.tensors[1] + ); + TEST_CHECK_STATUS(status, final); + for(j=0; j<loop_var_num; j++) + { + status = vsi_nn_CopyTensorViaGraphs( + graph, graph->output.tensors[j + 1], + graph, graph->input.tensors[j + 2] + ); + TEST_CHECK_STATUS(status, final); + } + } + +final: + vsi_nn_safe_free(data); + return status; +} /* vsi_nn_ExecuteGraphLoop() */ diff --git a/src/tim/vx/internal/src/vsi_nn_graph_optimization.c b/src/tim/vx/internal/src/vsi_nn_graph_optimization.c index 7a0d809..855189b 100644 --- a/src/tim/vx/internal/src/vsi_nn_graph_optimization.c +++ b/src/tim/vx/internal/src/vsi_nn_graph_optimization.c @@ -574,7 +574,7 @@ static vx_tensor _create_const_raw_tensor if( TRUE == attr.is_created_from_handle ) { - vx_tensor_addressing addr; + vx_tensor_addressing addr = NULL; vsi_size_t stride_size[VSI_NN_MAX_DIM_NUM]; vsi_size_t buf_sz; @@ -649,7 +649,15 @@ static vx_tensor _create_const_raw_tensor addr, data, VX_MEMORY_TYPE_HOST); #endif //memset(data, 0x5A, buf_sz); - vxReleaseTensorAddressing( &addr ); + if (addr) + { + vxReleaseTensorAddressing( &addr ); + } + if ( NULL == tensor ) + { + VSILOGE( "Create vx tensor fail."
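/*
 * Usage sketch for vsi_nn_CopyTensorViaGraphs() defined above: it checks
 * that the two tensors agree in rank, shape and dtype before copying the
 * source data into the destination graph's tensor. The wrapper name is
 * illustrative; the prototype is assumed to live in vsi_nn_graph.h, which
 * this patch extends.
 */
#include "vsi_nn_graph.h"

static vsi_status forward_loop_state
    (
    vsi_nn_graph_t    * producer,
    vsi_nn_tensor_id_t  producer_out,
    vsi_nn_graph_t    * consumer,
    vsi_nn_tensor_id_t  consumer_in
    )
{
    /* Returns VSI_FAILURE if the shapes or dtypes do not match. */
    return vsi_nn_CopyTensorViaGraphs( producer, producer_out,
                                       consumer, consumer_in );
}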
); + goto final; + } vxFlushHandle( (vx_reference)tensor ); } } @@ -664,6 +672,8 @@ static vx_tensor _create_const_raw_tensor tensor = vxCreateVirtualTensor2( graph->g, &params, sizeof( vx_tensor_create_params_t ) ); } + +final: if( NULL == tensor ) { VSILOGE( "Create vx tensor fail." ); diff --git a/src/tim/vx/internal/src/vsi_nn_node_attr_template.c b/src/tim/vx/internal/src/vsi_nn_node_attr_template.c index 1845bc7..0c870bc 100644 --- a/src/tim/vx/internal/src/vsi_nn_node_attr_template.c +++ b/src/tim/vx/internal/src/vsi_nn_node_attr_template.c @@ -198,6 +198,8 @@ static _node_template s_template[] = /* CUMSUM */ NULL, /* MAXPOOLWITHARGMAX */ NULL, /* MOD */ NULL, + /* LPPOOL */ NULL, + /* PRE_PROCESS_YUV422 */ NULL, }; //_compiler_assert( _cnt_of_array(s_template) == VSI_NN_OP_NUM, vsi_nn_node_attr_template_c ); diff --git a/src/tim/vx/internal/src/vsi_nn_tensor.c b/src/tim/vx/internal/src/vsi_nn_tensor.c index 54236c0..c931dd6 100644 --- a/src/tim/vx/internal/src/vsi_nn_tensor.c +++ b/src/tim/vx/internal/src/vsi_nn_tensor.c @@ -433,7 +433,7 @@ static vsi_bool _init_tensor #endif if( TRUE == tensor->attr.is_created_from_handle ) { - vx_tensor_addressing addr; + vx_tensor_addressing addr = NULL; vsi_size_t stride_size[VSI_NN_MAX_DIM_NUM]; vsi_size_t buf_sz; @@ -529,7 +529,16 @@ static vsi_bool _init_tensor #endif //memset(data, 0x5A, buf_sz); - vxReleaseTensorAddressing( &addr ); + if (addr) + { + vxReleaseTensorAddressing( &addr ); + } + + if ( NULL == tensor->t ) + { + ret = FALSE; + goto final; + } vxFlushHandle( (vx_reference)tensor->t ); } } @@ -544,10 +553,11 @@ static vsi_bool _init_tensor tensor->t = vxCreateVirtualTensor2( graph->g, &params, sizeof( vx_tensor_create_params_t ) ); } - if( NULL == tensor->t ) + if ( NULL == tensor->t ) { VSILOGE( "Create vx tensor fail." ); ret = FALSE; + goto final; } if( !tensor->attr.vtl && !tensor->attr.is_const ) @@ -565,6 +575,7 @@ static vsi_bool _init_tensor ret = _try_set_const_tensor( tensor ); +final: if( scales ) { free(scales); @@ -1243,6 +1254,11 @@ void vsi_nn_SaveTensorToTextByFp32 count += snprintf( (char *)&buf[count], _TENSOR_TMPBUF_SZ - count, "%f%s", write_data, seperator ); + if ( count > _TENSOR_TMPBUF_SZ ) + { + VSILOGW( "tensor buffer overflow!" ); + break; + } if( ((float)count / _TENSOR_TMPBUF_SZ) > c_flush_th ) { fwrite( buf, count, 1, fp ); @@ -1335,11 +1351,21 @@ void vsi_nn_SaveDataToText { count += snprintf( (char *)&buf[count], _TENSOR_TMPBUF_SZ - count, "%d%s", (int32_t)write_data, seperator ); + if ( count > _TENSOR_TMPBUF_SZ ) + { + VSILOGW( "tensor buffer overflow!" ); + break; + } } else { count += snprintf( (char *)&buf[count], _TENSOR_TMPBUF_SZ - count, "%f%s", write_data, seperator ); + if ( count > _TENSOR_TMPBUF_SZ ) + { + VSILOGW( "tensor buffer overflow!"
); + break; + } } if( ((float) count / _TENSOR_TMPBUF_SZ ) > c_flush_th ) { @@ -1358,8 +1384,8 @@ void vsi_nn_SaveTensorToBinary const char * filename ) { - uint8_t * data; - FILE * fp; + uint8_t * data = NULL; + FILE * fp = NULL; vsi_size_t sz; uint32_t i; uint8_t * packed_data = NULL; @@ -1391,6 +1417,12 @@ void vsi_nn_SaveTensorToBinary packed_size = vsi_nn_GetTensorSize( tensor->attr.size, tensor->attr.dim_num, tensor->attr.dtype.vx_type); packed_data = (uint8_t*)malloc(packed_size); + if ( NULL == packed_data ) + { + VSILOGW( "malloc packed data failed" ); + goto final; + } + vsi_nn_Pack4bitData(tensor, data, packed_data); fwrite( packed_data, packed_size, 1, fp ); if( packed_data ) @@ -1407,9 +1439,14 @@ void vsi_nn_SaveTensorToBinary } fwrite( data, sz, 1, fp ); } - fclose( fp ); + final: + if (fp) + { + fclose( fp ); + } vsi_nn_safe_free( data ); + vsi_nn_safe_free( packed_data ); } /* vsi_nn_SaveTensorToBinary() */ vsi_nn_tensor_t * vsi_nn_CreateTensorFromData
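/*
 * Standalone sketch of the single-exit cleanup style the
 * vsi_nn_SaveTensorToBinary() fixes above move towards: resources start as
 * NULL, failures jump to 'final', and the cleanup block only releases what
 * was actually acquired. Plain C; the file name and sizes are illustrative.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int save_blob(const char *path, size_t sz)
{
    int   ok   = 0;
    FILE *fp   = NULL;
    void *data = NULL;

    fp = fopen(path, "wb");
    if (fp == NULL) goto final;

    data = malloc(sz);
    if (data == NULL) goto final;   /* mirrors the new packed_data NULL check */
    memset(data, 0, sz);

    ok = (fwrite(data, 1, sz, fp) == sz);

final:
    if (fp)                         /* fclose() only when the handle exists */
    {
        fclose(fp);
    }
    free(data);                     /* free(NULL) is a no-op */
    return ok;
}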