diff --git a/prebuilt-sdk/x86_64_linux/VERSION b/prebuilt-sdk/x86_64_linux/VERSION index 40da7fc..2524731 100644 --- a/prebuilt-sdk/x86_64_linux/VERSION +++ b/prebuilt-sdk/x86_64_linux/VERSION @@ -1 +1 @@ -REL/6.4.8 +REL/6.4.9 diff --git a/prebuilt-sdk/x86_64_linux/include/CL/cl_viv_vx_ext.h b/prebuilt-sdk/x86_64_linux/include/CL/cl_viv_vx_ext.h index 02286d8..3b85e85 100644 --- a/prebuilt-sdk/x86_64_linux/include/CL/cl_viv_vx_ext.h +++ b/prebuilt-sdk/x86_64_linux/include/CL/cl_viv_vx_ext.h @@ -349,75 +349,74 @@ enum eVXC_ERROR #define VXC_OP1(Op, Dest, Src0) _viv_asm(INTRINSIC, Dest, VXC_OP_##Op, Src0) #define VXC_OP2(Op, Dest, Src0, Src1) \ - do { \ + { \ int _t1; \ - _viv_asm(PARAM_CHAIN, _t1, Src0, Src1); \ - _viv_asm(INTRINSIC, Dest, VXC_OP_##Op, _t1); \ - } while(0) + _viv_asm(PARAM_CHAIN, _t1, Src0, Src1); \ + _viv_asm(INTRINSIC, Dest, VXC_OP_##Op, _t1); \ + } #define VXC_OP3(Op, Dest, Src0, Src1, Src2) \ - do { \ + { \ int _t1, _t2; \ _viv_asm(PARAM_CHAIN, _t1, Src0, Src1); \ _viv_asm(PARAM_CHAIN, _t2, _t1, Src2); \ _viv_asm(INTRINSIC, Dest, VXC_OP_##Op, _t2); \ - } while(0) + } #define VXC_OP3_NoDest(Op, Src0, Src1, Src2) \ - do { \ + { \ int _t1, _t2, _t3; \ _viv_asm(PARAM_CHAIN, _t1, Src0, Src1); \ _viv_asm(PARAM_CHAIN, _t2, _t1, Src2); \ _viv_asm(INTRINSIC_ST, _t3, VXC_OP_##Op, _t2); \ - } while(0) - + } #define VXC_OP4(Op, Dest, Src0, Src1, Src2, Src3) \ - do { \ + { \ int _t1, _t2, _t3; \ _viv_asm(PARAM_CHAIN, _t1, Src0, Src1); \ _viv_asm(PARAM_CHAIN, _t2, _t1, Src2); \ _viv_asm(PARAM_CHAIN, _t3, _t2, Src3); \ _viv_asm(INTRINSIC, Dest, VXC_OP_##Op, _t3); \ - } while(0) + } #define VXC_OP4_NoDest(Op, Src0, Src1, Src2, Src3) \ - do { \ + { \ int _t1, _t2, _t3, _t4; \ _viv_asm(PARAM_CHAIN, _t1, Src0, Src1); \ _viv_asm(PARAM_CHAIN, _t2, _t1, Src2); \ _viv_asm(PARAM_CHAIN, _t3, _t2, Src3); \ _viv_asm(INTRINSIC_ST, _t4, VXC_OP_##Op, _t3); \ - } while(0) + } #define VXC_OP4_ST(Op, Dest, Src0, Src1, Src2, Src3) \ - do { \ + { \ int _t1, _t2, _t3; \ _viv_asm(PARAM_CHAIN, _t1, Src0, Src1); \ _viv_asm(PARAM_CHAIN, _t2, _t1, Src2); \ _viv_asm(PARAM_CHAIN, _t3, _t2, Src3); \ _viv_asm(INTRINSIC_ST, Dest, VXC_OP_##Op, _t3);\ - } while(0) + } #define VXC_OP5(Op, Dest, Src0, Src1, Src2, Src3, Src4) \ - do { \ + { \ int _t1, _t2, _t3, _t4; \ _viv_asm(PARAM_CHAIN, _t1, Src0, Src1); \ _viv_asm(PARAM_CHAIN, _t2, _t1, Src2); \ _viv_asm(PARAM_CHAIN, _t3, _t2, Src3); \ _viv_asm(PARAM_CHAIN, _t4, _t3, Src4); \ _viv_asm(INTRINSIC, Dest, VXC_OP_##Op, _t4); \ - } while(0) + } #define VXC_OP5_NoDest(Op, Src0, Src1, Src2, Src3, Src4) \ - do { \ + { \ int _t1, _t2, _t3, _t4, _t5; \ _viv_asm(PARAM_CHAIN, _t1, Src0, Src1); \ _viv_asm(PARAM_CHAIN, _t2, _t1, Src2); \ _viv_asm(PARAM_CHAIN, _t3, _t2, Src3); \ _viv_asm(PARAM_CHAIN, _t4, _t3, Src4); \ _viv_asm(INTRINSIC_ST, _t5, VXC_OP_##Op, _t4); \ - } while(0) + } /* make sure the immediate value offsetX and offsetY are in range of [-16, 15] */ #define VXC_5BITOFFSET_XY(offsetX, offsetY) ((((offsetY) & 0x1F) << 5) | ((offsetX) & 0x1F)) @@ -515,41 +514,34 @@ enum eVXC_ERROR * Offset should be composed by using VXC_5BITOFFSET_XY(x, y) * Coord must be type of int4 or float4 */ -#define VXC_ReadImage2DArray(Dest, Image, Coord, Offset, Info) \ - do { \ - int8 desc; \ - _viv_asm(COPY, desc, Image, sizeof(desc)); \ - _viv_asm(CLAMP0MAX, (Coord).w, (Coord).z, desc.s5 - 1); \ - int baseAddr = (int)(Coord).w *desc.s4 + desc.s0; \ - _viv_asm(MOV, (Coord).w, baseAddr); \ - VXC_OP4(img_load_3d, Dest, Image, (Coord).xyww, Offset, Info); \ - } while (0) -#define 
VXC_WriteImage2DArray(Image, Coord, Color, Info) \ - do { \ - int8 desc; \ - _viv_asm(COPY, desc, Image, sizeof(desc)); \ - _viv_asm(CLAMP0MAX, (Coord).w, (Coord).z, desc.s5 - 1); \ - int baseAddr = (int)(Coord).w *(desc).s4 + desc.s0; \ - _viv_asm(MOV, (Coord).w, baseAddr); \ - VXC_OP4_NoDest(img_store_3d, Image, (Coord).xyww, Color, Info); \ - } while (0) +#define VXC_ReadImage2DArray(Dest, Image, OrigCoord, Offset, Info) \ + { \ + int8 desc; \ + int4 tempCoord = (int4)(OrigCoord.xyzz); \ + _viv_asm(COPY, desc, Image, sizeof(desc)); \ + _viv_asm(CLAMP0MAX, tempCoord.z, tempCoord.z, desc.s5 - 1); \ + tempCoord.z = tempCoord.z *desc.s4 + desc.s0; \ + VXC_OP4(img_load_3d, Dest, Image, tempCoord, Offset, Info); \ + } +#define VXC_WriteImage2DArray(Image, OrigCoord, Color, Info) \ + { \ + int8 desc; \ + int4 tempCoord = (int4)(OrigCoord.xyzz); \ + _viv_asm(COPY, desc, Image, sizeof(desc)); \ + _viv_asm(CLAMP0MAX, tempCoord.z, tempCoord.z, desc.s5 - 1); \ + tempCoord.z = tempCoord.z *desc.s4 + desc.s0; \ + VXC_OP4_NoDest(img_store_3d, Image, tempCoord, Color, Info); \ + } -/* image load/store for image3d_t, - * offset should be composed by using VXC_5BITOFFSET_XY(x, y) - * Coord must be type of int4 or float4 - */ -#define VXC_ReadImage3D(Dest, Image, Coord, Offset, Info) VXC_OP4(img_read_3d, Dest, Image, Coord, Offset, Info) -#define VXC_WriteImage3D(Image, Coord, Color, Info) VXC_OP4_NoDest(img_write_3d, Image, Coord, Color, Info) +#define VXC_Vload2(Dest, Pointer, Offset) { int byteOffset = ((int)sizeof((Dest)))*(Offset); VXC_OP2(vload2, Dest, Pointer, byteOffset); } +#define VXC_Vload4(Dest, Pointer, Offset) { int byteOffset = ((int)sizeof((Dest)))*(Offset); VXC_OP2(vload4, Dest, Pointer, byteOffset); } +#define VXC_Vload8(Dest, Pointer, Offset) { int byteOffset = ((int)sizeof((Dest)))*(Offset); VXC_OP2(vload8, Dest, Pointer, byteOffset); } +#define VXC_Vload16(Dest, Pointer, Offset) { int byteOffset = ((int)sizeof((Dest)))*(Offset); VXC_OP2(vload16, Dest, Pointer, byteOffset); } -#define VXC_Vload2(Dest, Pointer, Offset) do { int byteOffset = ((int)sizeof((Dest)))*(Offset); VXC_OP2(vload2, Dest, Pointer, byteOffset); } while(0) -#define VXC_Vload4(Dest, Pointer, Offset) do { int byteOffset = ((int)sizeof((Dest)))*(Offset); VXC_OP2(vload4, Dest, Pointer, byteOffset); } while(0) -#define VXC_Vload8(Dest, Pointer, Offset) do { int byteOffset = ((int)sizeof((Dest)))*(Offset); VXC_OP2(vload8, Dest, Pointer, byteOffset); } while(0) -#define VXC_Vload16(Dest, Pointer, Offset) do { int byteOffset = ((int)sizeof((Dest)))*(Offset); VXC_OP2(vload16, Dest, Pointer, byteOffset); } while(0) - -#define VXC_Vstore2(Pointer, Offset, Data) do { int byteOffset = ((int)sizeof((Data)))*(Offset); VXC_OP3_NoDest(vstore2, Pointer, byteOffset, Data); } while(0) -#define VXC_Vstore4(Pointer, Offset, Data) do { int byteOffset = ((int)sizeof((Data)))*(Offset); VXC_OP3_NoDest(vstore4, Pointer, byteOffset, Data); } while(0) -#define VXC_Vstore8(Pointer, Offset, Data) do { int byteOffset = ((int)sizeof((Data)))*(Offset); VXC_OP3_NoDest(vstore8, Pointer, byteOffset, Data); } while(0) -#define VXC_Vstore16(Pointer, Offset, Data) do { int byteOffset = ((int)sizeof((Data)))*(Offset); VXC_OP3_NoDest(vstore16, Pointer, byteOffset, Data); } while(0) +#define VXC_Vstore2(Pointer, Offset, Data) { int byteOffset = ((int)sizeof((Data)))*(Offset); VXC_OP3_NoDest(vstore2, Pointer, byteOffset, Data); } +#define VXC_Vstore4(Pointer, Offset, Data) { int byteOffset = ((int)sizeof((Data)))*(Offset); VXC_OP3_NoDest(vstore4, Pointer, 
byteOffset, Data); } +#define VXC_Vstore8(Pointer, Offset, Data) { int byteOffset = ((int)sizeof((Data)))*(Offset); VXC_OP3_NoDest(vstore8, Pointer, byteOffset, Data); } +#define VXC_Vstore16(Pointer, Offset, Data) { int byteOffset = ((int)sizeof((Data)))*(Offset); VXC_OP3_NoDest(vstore16, Pointer, byteOffset, Data); } /* VX2 only instructions*/ #define VXC_IndexAdd(Dest, Src0, Src1, Src2, Info) VXC_OP4(index_add, Dest, Src0, Src1, Src2, Info) @@ -562,7 +554,7 @@ enum eVXC_ERROR #if (VX_VERSION == 2) #define VXC_BiLinear(Dest, Src0, Src1, Src2, Info) \ - do { \ + { \ int endBin = ((Info) & VXC_END_BIN_BITMASK) >> 8; \ int roundMode = ((Info) & VXC_ROUNDING_MODE_BITMASK) >> 2; \ int clamp = ((Info) & VXC_CLAMP_BITMASK) >> 22; \ @@ -576,7 +568,7 @@ enum eVXC_ERROR _viv_asm(PARAM_CHAIN, bi4, bi3, 8); \ _viv_asm(INTRINSIC, bi2, OP_bit_extract, bi4); \ VXC_Lerp(Dest, bi2!, bi2.y!, (Src2).x, Info); \ - } while (0) + } #define VXC_BitReplace(Dest, Src0, Src1, Src2, Info) /* BitReplace definition here */ #define VXC_IAdd(Dest, Src0, Src1, Src2, Info) /* IAdd definition here */ @@ -592,7 +584,8 @@ enum eVXC_ERROR #define VXC_Filter_Max(Dest, Src0, Src1, Src2, Info) /* Max filter definition here */ #define VXC_Filter_Min(Dest, Src0, Src1, Src2, Info) /* Min filter definition here */ #define VXC_Filter_Median(Dest, Src0, Src1, Src2, Info) /* Median filter definition here */ -#define VXC_Filter(Dest, Src0, Src1, Src2, Info) do { \ +#define VXC_Filter(Dest, Src0, Src1, Src2, Info) \ + { \ int filter = (((Info) >> 16)&0x0F); \ if (filter == VXC_FM_BOX) { VXC_Filter_Box(Dest, Src0, Src1, Src2, Info); } \ if (filter == VXC_FM_Guassian) { VXC_Filter_Guassian(Dest, Src0, Src1, Src2, Info); } \ @@ -603,7 +596,7 @@ enum eVXC_ERROR if (filter == VXC_FM_Max) { VXC_Filter_Max(Dest, Src0, Src1, Src2, Info); } \ if (filter == VXC_FM_Min) { VXC_Filter_Min(Dest, Src0, Src1, Src2, Info); } \ if (filter == VXC_FM_Median) { VXC_Filter_Median(Dest, Src0, Src1, Src2, Info); } \ - } while (0) + } #else /* VX1 */ diff --git a/prebuilt-sdk/x86_64_linux/include/VX/viv_nn_compatibility.h b/prebuilt-sdk/x86_64_linux/include/VX/viv_nn_compatibility.h index 6c1e9f5..b6cc5be 100644 --- a/prebuilt-sdk/x86_64_linux/include/VX/viv_nn_compatibility.h +++ b/prebuilt-sdk/x86_64_linux/include/VX/viv_nn_compatibility.h @@ -98,7 +98,9 @@ vxCreateTensor_11( vx_enum data_format, vx_int8 fixed_point_pos ); +#if !VX_VA40_EXT_SUPPORT #define vxCreateTensor vxCreateTensor_11 +#endif /* keep the backward compatibility with spec 1.1 for vxCreateVirtualTensor */ VX_API_ENTRY vx_tensor VX_API_CALL @@ -108,8 +110,11 @@ vxCreateVirtualTensor_11( vx_uint32 *sizes, vx_enum data_format, vx_int8 fixed_point_pos -); +); + +#if !VX_VA40_EXT_SUPPORT #define vxCreateVirtualTensor vxCreateVirtualTensor_11 +#endif /* keep the backward compatibility with spec 1.1 for vxCreateTensorFromView */ VX_API_ENTRY vx_tensor VX_API_CALL diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_kernels.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_kernels.h index 6c3671e..782961c 100644 --- a/prebuilt-sdk/x86_64_linux/include/VX/vx_kernels.h +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_kernels.h @@ -496,6 +496,8 @@ enum vx_kernel_e { VX_KERNEL_NN_BATCH_GEMM = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x2F, + VX_KERNEL_NN_CONV_3D_LAYER = VX_KERNEL_BASE(VX_ID_VIVANTE, VX_LIBRARY_KHR_BASE) + 0x30, + VX_KERNEL_MAX_1_2, /*!< \internal Used for VX1.2 bounds checking in the conformance test. 
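The restyled VXC_Vload*/VXC_Vstore* macros in cl_viv_vx_ext.h above take an element-count Offset and convert it to a byte offset internally (byteOffset = sizeof(Dest) * Offset, i.e. the offset counts whole vectors). A minimal, hypothetical VX C (OpenCL-style) kernel sketch using them, assuming the vxc_short8 vector typedef that the same header provides; the kernel name and buffer arguments are illustrative only:

#include "cl_viv_vx_ext.h"

/* Copies 8 shorts per work-item with the vector load/store helpers. */
__kernel void copy_short8(__global short *src, __global short *dst)
{
    int gid = get_global_id(0);
    vxc_short8 v;
    VXC_Vload8(v, src, gid);   /* reads vector number 'gid' (elements 8*gid .. 8*gid+7) */
    VXC_Vstore8(dst, gid, v);  /* writes it back to the same element position */
}
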
*/ }; diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_compatible.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_compatible.h index e3baa23..d6d9b93 100644 --- a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_compatible.h +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_compatible.h @@ -33,44 +33,58 @@ 0: weight_layout is whnc 1: weight_layout is whcn */ +#ifndef VX_DECONVOLUTION_WEIGHT_LAYOUT_COMPATIBLE_KHRONOS #define VX_DECONVOLUTION_WEIGHT_LAYOUT_COMPATIBLE_KHRONOS 1 +#endif /* VX_CONVERT_POLICY_WRAP_ENABLE is used to differentiate two overflow_policys(VX_CONVERT_POLICY_WRAP and VX_CONVERT_POLICY_SAT) [value] 0: both overflow_policys considered as VX_CONVERT_POLICY_SAT 1: overflow_policy is determined by arguments. */ +#ifndef VX_CONVERT_POLICY_WRAP_ENABLE #define VX_CONVERT_POLICY_WRAP_ENABLE 1 +#endif +#ifndef VX_13_NN_COMPATIBLITY #define VX_13_NN_COMPATIBLITY 1 +#endif /* VX_L2NORM_AXIS_PARAMETER_SUPPORT is used to declare that L2NORMALIZE can support axis parameter [value] 0: not support 1: support */ +#ifndef VX_L2NORM_AXIS_PARAMETER_SUPPORT #define VX_L2NORM_AXIS_PARAMETER_SUPPORT 1 +#endif /* VX_SOFTMAX_AXIS_PARAMETER_SUPPORT is used to declare that SOFTAMX can support axis parameter [value] 0: not support 1: support */ +#ifndef VX_SOFTMAX_AXIS_PARAMETER_SUPPORT #define VX_SOFTMAX_AXIS_PARAMETER_SUPPORT 1 +#endif /* VX_NORMALIZATION_AXIS_PARAMETER_SUPPORT is used to declare that NORMALIZATION can support axis parameter [value] 0: not support 1: support */ +#ifndef VX_NORMALIZATION_AXIS_PARAMETER_SUPPORT #define VX_NORMALIZATION_AXIS_PARAMETER_SUPPORT 1 +#endif /* VX_ACTIVATION_EXT_SUPPORT is used to declare that ACTIVATION can support swish and hswish [value] 0: not support 1: support */ +#ifndef VX_ACTIVATION_EXT_SUPPORT #define VX_ACTIVATION_EXT_SUPPORT 1 +#endif /* VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT is used to query more hardware parameter such as shader sub-group size. @@ -78,7 +92,19 @@ 0: not support 1: support */ +#ifndef VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT #define VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT 1 +#endif + +/* + VX_VA40_EXT_SUPPORT is used to declare that openvx can support VA40. + [value] + 0: not support + 1: support +*/ +#ifndef VX_VA40_EXT_SUPPORT +#define VX_VA40_EXT_SUPPORT 0 +#endif /* VX_USER_LOOKUP_TABLE_SUPPORT is used to declare that openvx can support user lookuptable. @@ -86,7 +112,9 @@ 0: not support 1: support */ +#ifndef VX_USER_LOOKUP_TABLE_SUPPORT #define VX_USER_LOOKUP_TABLE_SUPPORT 1 +#endif /* VX_PRELOAD_CONST_TENSOR_SUPPORT is used to declare that openvx can support preload weight/bias and const tensor @@ -94,7 +122,9 @@ VX_PRELOAD_CONST_TENSOR_SUPPORT is used to declare that openvx can support prelo 0: not support 1: support(NN conv and TP FC weightbias, and SH const tensor) */ +#ifndef VX_PRELOAD_CONST_TENSOR_SUPPORT #define VX_PRELOAD_CONST_TENSOR_SUPPORT 1 +#endif /* VX_CREATE_TENSOR_SUPPORT_PHYSICAL is used to declare that openvx can support physical address for vxCreateTensorFromHandle @@ -102,7 +132,9 @@ VX_CREATE_TENSOR_SUPPORT_PHYSICAL is used to declare that openvx can support phy 0: not support 1: support */ +#ifndef VX_CREATE_TENSOR_SUPPORT_PHYSICAL #define VX_CREATE_TENSOR_SUPPORT_PHYSICAL 1 +#endif /* VX_GRAPH_PREEMPTION_SUPPORT is used to declare that openvx can support different graph preemption function. 
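With the feature macros in vx_khr_compatible.h now wrapped in #ifndef guards (above), a build can pre-set them instead of editing the header. A small hedged illustration, assuming the usual VX/ include layout; VX_VA40_EXT_SUPPORT is the new 40-bit virtual-address switch that defaults to 0:

/* Either define the macro before including the header ... */
#define VX_VA40_EXT_SUPPORT 1
#include <VX/vx_khr_compatible.h>

/* ... or pass -DVX_VA40_EXT_SUPPORT=1 on the compiler command line;
 * the header's own default is then skipped by the #ifndef guard. */
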
@@ -110,7 +142,9 @@ VX_CREATE_TENSOR_SUPPORT_PHYSICAL is used to declare that openvx can support phy 0: not support 1: support */ +#ifndef VX_GRAPH_PREEMPTION_SUPPORT #define VX_GRAPH_PREEMPTION_SUPPORT 1 +#endif /* VX_BATCH_GEMM_API_SUPPORT is used to declare that vsi openvx driver can support vxBatchGemmNode API to transform gemm to convolution @@ -118,6 +152,18 @@ VX_BATCH_GEMM_API_SUPPORT is used to declare that vsi openvx driver can support 0: not support 1: support */ +#ifndef VX_BATCH_GEMM_API_SUPPORT #define VX_BATCH_GEMM_API_SUPPORT 1 +#endif + +/* +VX_CONV_3D_API_SUPPORT is used to declare that vsi openvx driver can support conv3d by vxConv3dLayer API. + [value] + 0: not support + 1: support +*/ +#ifndef VX_CONV_3D_API_SUPPORT +#define VX_CONV_3D_API_SUPPORT 1 +#endif #endif /* __VX_KHR_COMPATIBLE_H__ */ diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn.h index 6f1c478..88a9967 100644 --- a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn.h +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn.h @@ -29,6 +29,7 @@ #define OPENVX_KHR_NN "vx_khr_nn" #include +#include #include @@ -310,10 +311,47 @@ enum vx_tensor_lifetime_type_e VX_TENSOR_LIFE_TIME_DYNAMIC, }; +typedef struct _vx_nn_convolution_3d_params_t +{ + vx_int32 padding_w_left; /*!< \brief Number of elements added at each side in the left of w dimension of the input. */ + vx_int32 padding_w_right; /*!< \brief Number of elements added at each side in the right of w dimension of the input. */ + vx_int32 padding_h_top; /*!< \brief Number of elements added at each side in the top of h dimension of the input. */ + vx_int32 padding_h_bottom; /*!< \brief Number of elements added at each side in the bottom of h dimension of the input. */ + vx_int32 padding_d_front; /*!< \brief Number of elements added at each side in the front of d dimension of the input. */ + vx_int32 padding_d_rear; /*!< \brief Number of elements added at each side in the rear of d dimension of the input. */ + + vx_int32 stride_w; /*!< \brief skip w jump for down scale. */ + vx_int32 stride_h; /*!< \brief skip h jump for down scale. */ + vx_int32 stride_d; /*!< \brief skip d jump for down scale. */ + vx_int32 dilation_w; /*!< \brief "inflate" the kernel by inserting zeros between the kernel elements in the w direction. The value is the number of zeros to insert.*/ + vx_int32 dilation_h; /*!< \brief "inflate" the kernel by inserting zeros between the kernel elements in the h direction. The value is the number of zeros to insert.*/ + vx_int32 dilation_d; /*!< \brief "inflate" the kernel by inserting zeros between the kernel elements in the d direction. The value is the number of zeros to insert.*/ + + vx_enum pad_mode; /*!< \brief A VX_TYPE_ENUM of the \ref vx_pad_mode_e enumeration. */ + vx_scalar pad_const; /*!< \brief pad const value if setting pad mode to const, the const value is base value, not quantized value. */ + + vx_enum overflow_policy; /*!< \brief A VX_TYPE_ENUM of the vx_convert_policy_e enumeration. */ + vx_enum rounding_policy; /*!< \brief A VX_TYPE_ENUM of the vx_round_policy_e enumeration. */ + vx_enum down_scale_size_rounding; /*!< \brief Rounding method for calculating output dimensions. See \ref vx_nn_rounding_type_e */ + + vx_int32 depth_multiplier; /*!< \brief depthwise multiplier value, if 0, means convolution, elsewise(>=1), the convolution is depthwiseconvolution. 
*/ +}vx_nn_convolution_3d_params_t; + /*============================================================================== TENSOR DATA FUNCTIONS =============================================================================*/ - +#if VX_VA40_EXT_SUPPORT +/*! \brief Create an opaque reference to a tensor view object. + * \details Not guaranteed to exist until the vx_graph containing it has been verified. + * \param [in] context The reference to the implementation context. + * \param [in] view_array_start a vx_size array of start values of the view. + * \param [in] view_array_end a vx_size array of end values of the view. + * \param [in] numViewDimensions number of dimensions of view_array_start and view_array_end. + * \return A tensor data view reference or zero when an error is encountered. + * \ingroup group_tensor + */ +VX_API_ENTRY vx_tensor_view VX_API_CALL vxCreateTensorView(vx_context context, vx_size* view_array_start, vx_size* view_array_end, vx_size numViewDimensions); +#else /*! \brief Create an opaque reference to a tensor view object. * \details Not guaranteed to exist until the vx_graph containing it has been verified. * \param [in] context The reference to the implementation context. @@ -324,6 +362,7 @@ enum vx_tensor_lifetime_type_e * \ingroup group_tensor */ VX_API_ENTRY vx_tensor_view VX_API_CALL vxCreateTensorView(vx_context context, vx_uint32 *view_array_start, vx_uint32 * view_array_end, vx_uint8 numViewDimensions); +#endif /*! \brief Releases a reference to a tensor data view object. * The object may not be garbage collected until its total reference count is zero. @@ -337,6 +376,18 @@ VX_API_ENTRY vx_tensor_view VX_API_CALL vxCreateTensorView(vx_context context, v */ VX_API_ENTRY vx_status VX_API_CALL vxReleaseTensorView(vx_tensor_view *tensor_view); +#if VX_VA40_EXT_SUPPORT +/*! \brief Create an opaque reference to a tensor addressing object. +* \details Not guaranteed to exist until the vx_graph containing it has been verified. +* \param [in] context The reference to the implementation context. +* \param [in] addressing_array_dimension a vx_size array of sLength of patch in all dimensions in elements. +* \param [in] addressing_array_stride a vx_size arrayStride in all dimensions in bytes. +* \param [in] numViewDimensions number of dimensions of view_array_start and view_array_end. +* \return A tensor data view reference or zero when an error is encountered. +* \ingroup group_tensor +*/ +VX_API_ENTRY vx_tensor_addressing VX_API_CALL vxCreateTensorAddressing(vx_context context, vx_size* addressing_array_dimension, vx_size* addressing_array_stride, vx_size numViewDimensions); +#else /*! \brief Create an opaque reference to a tensor addressing object. * \details Not guaranteed to exist until the vx_graph containing it has been verified. * \param [in] context The reference to the implementation context. @@ -346,7 +397,8 @@ VX_API_ENTRY vx_status VX_API_CALL vxReleaseTensorView(vx_tensor_view *tensor_vi * \return A tensor data view reference or zero when an error is encountered. * \ingroup group_tensor */ -VX_API_ENTRY vx_tensor_addressing VX_API_CALL vxCreateTensorAddressing(vx_context context, vx_uint32 *addressing_array_dimension, vx_uint32 * addressing_array_stride, vx_uint8 numViewDimensions); +VX_API_ENTRY vx_tensor_addressing VX_API_CALL vxCreateTensorAddressing(vx_context context, vx_uint32 * addressing_array_dimension, vx_uint32 * addressing_array_stride, vx_uint8 numViewDimensions); +#endif /*! \brief Releases a reference to a tensor data addressing object. 
* The object may not be garbage collected until its total reference count is zero. @@ -402,7 +454,11 @@ typedef union _vx_tensor_quant_param typedef struct _vx_tensor_create_params_t { vx_uint32 num_of_dims; /*!< \brief The number of dimensions specified in *sizes*/ +#if VX_VA40_EXT_SUPPORT + vx_size * sizes; /*!< \brief The pointer to an array of dimension */ +#else vx_uint32 * sizes; /*!< \brief The pointer to an array of dimension */ +#endif vx_enum data_format; /*!< \brief Data format for the tensor */ vx_enum quant_format; /*!< \brief Quantized format \ref vx_quantized_format_e . */ vx_tensor_quant_param quant_data; @@ -482,7 +538,19 @@ VX_API_ENTRY vx_tensor VX_API_CALL vxCreateTensorFromHandle2( */ VX_API_ENTRY vx_status VX_API_CALL vxFlushHandle(vx_reference ref); - +#if VX_VA40_EXT_SUPPORT +/*! \brief Return a new tensor referencing the same memory location but with different shape. +* \param [in] tensor The input tensor data to reshape. +* \param [in] num_of_dims Size of each dimension. If one component is special value -1, +* the size of that dimension is computed so that the total size remains the same as input tensor. +* If is is [-1], then flatten is performed which turns tensor into 1-D. +* \param [in] sizes The size of the container to which \a num_of_dims points. +* \return a vx_tensor that has shaped. +* \return VX_NULL if an error occurred. +* \ingroup group_tensor +*/ +VX_API_ENTRY vx_tensor VX_API_CALL vxReshapeTensor(vx_tensor tensor, vx_size* num_of_dims, vx_size sizes); +#else /*! \brief Return a new tensor referencing the same memory location but with different shape. * \param [in] tensor The input tensor data to reshape. * \param [in] num_of_dims Size of each dimension. If one component is special value -1, @@ -494,6 +562,7 @@ VX_API_ENTRY vx_status VX_API_CALL vxFlushHandle(vx_reference ref); * \ingroup group_tensor */ VX_API_ENTRY vx_tensor VX_API_CALL vxReshapeTensor(vx_tensor tensor, vx_int32* num_of_dims, vx_uint32 sizes); +#endif /*! \brief Allows setting attributes on the tensor. * \param [in] tensor The reference to the tensor on which to set the attribute. @@ -1961,6 +2030,7 @@ typedef struct _vx_hardware_caps_params_ext_t { vx_hardware_caps_params_t base; vx_uint32 subGroupSize; /*!< \brief shader sub-group size.*/ + vx_bool supportVA40; /*!< \brief support 40bit virtual address.*/ } vx_hardware_caps_params_ext_t; /*! \brief Queries hardware caps information. @@ -1979,6 +2049,29 @@ VX_API_ENTRY vx_status VX_API_CALL vxQueryHardwareCaps( vx_size size_of_hardware_caps_param ); +/*! \brief [Graph] Creates a Convolutional-3d Network Convolution Layer Node. + * \details This function implement Convolutional-3d Network Convolution layer. + * For fixed-point data types, a fixed point calculation is performed with round and saturate according to the number of accumulator bits. The number of the accumulator bits are implementation defined, + * and should be at least 16.\n + * round: rounding according the vx_round_policy_e enumeration. \n + * saturate: A saturation according the vx_convert_policy_e enumeration. + * \param [in] graph The handle to the graph. + * \param [in] inputs The input tensor data. 4 lower dimensions represent a single input, all following dimensions represent number of batches, possibly nested. + * The dimension order is [width, height, depth, #IFM, #batches].\n + * \param [in] weights [*static] Weights are 5d tensor with dimensions [kernel_x, kernel_y, kernel_d, #IFM, #OFM]. 
+ * see \ref vxCreateTensor2 and \ref vxCreateVirtualTensor2 \n Weights data type must match the data type of the inputs. (Kernel parameter #1) + * \param [in] biases [*static] Optional, ignored if NULL. The biases, which may be shared (one per ofm) or unshared (one per ofm * output location). The possible layouts are + * either [#OFM] or [width, height, #OFM]. Biases data type must match the data type of the inputs. + * \param [in] convolution_params [static] Pointer to parameters of type \ref vx_nn_convolution_3d_params_t. + * \param [in] size_of_convolution_params [static] Size in bytes of convolution_params. Note that this parameter is not counted as one of the kernel parameters. + * \param [out] outputs The output tensor data. Output will have the same number and structure of dimensions as input. Output tensor data type must be same as the inputs. + * \return vx_node. + * \returns A node reference \ref vx_node. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * \ingroup group_cnn + */ +VX_API_ENTRY vx_node VX_API_CALL vxConv3dLayer(vx_graph graph, vx_tensor inputs, vx_tensor weights, vx_tensor biases, const vx_nn_convolution_3d_params_t *convolution_params, vx_size size_of_convolution_params, vx_tensor outputs); + #ifdef __cplusplus } #endif diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn_internal.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn_internal.h index 41e1653..506938f 100644 --- a/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn_internal.h +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_khr_nn_internal.h @@ -289,6 +289,169 @@ typedef struct _vx_weights_biases_parameter_optimizations_ext2_t { vx_int8 output_fpp_dw; /*depthwise conv output fix-point*/ } vx_weights_biases_parameter_optimizations_ext2_t; +#if VX_VA40_EXT_SUPPORT +/*! + * \brief Creates a reference to a vx_weights_biases_parameter opaque object. + * + * \param [in] layer_type The network type of objects to hold. Types allowed are: + * \arg VX_CONVOLUTIONAL_NETWORK_CONVOLUTION_LAYER for convolution layer. + * \arg VX_CONVOLUTIONAL_NETWORK_FULLYCONNECTED_LAYER for fullyconnected layer. + * \param [in] num_of_dims The dimention number of input & output image tensor. + * \param [in] inputs_dims The input tensor's dimension size. + * \param [in] pad_x The number of elements subtracted at each side in the x dimension of the input. + * \param [in] pad_y The number of elements subtracted at each side in the y dimension of the input. + * \param [in] pooling_size_x The size of the pooling region in the x dimension, 0 means no pooling operation. + * \param [in] pooling_size_y The size of the pooling region in the y dimension, 0 means no pooling operation. + * \param [in] down_scale_size_rounding A VX_TYPE_ENUM of the vx_round_policy_e enumeration. + * \param [in] convolution_outputs_dims The output's dimension size after covolution operation. + * \param [in] pool_outputs_dims The output's dimension size after pooling operation. + * \param [in] optimizations A optional param for \ref vx_weights_biases_parameter_optimizations_t. + * \param [in] weights The weights tensor which need be compressed. + * \param [in] biases The biases tensor which need be compressed. + * + * \returns An opaque vx_weights_biases_parameter reference with compressed kernel data. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. 
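A hedged usage sketch for the vxConv3dLayer entry point declared above; the graph and the input/weights/biases/output tensors are assumed to already exist, and only the parameter-struct setup and node creation are shown:

#include <string.h>
#include <VX/vx.h>
#include <VX/vx_khr_nn.h>

vx_node make_conv3d_node(vx_graph graph, vx_tensor input, vx_tensor weights,
                         vx_tensor biases, vx_tensor output)
{
    vx_nn_convolution_3d_params_t p;
    memset(&p, 0, sizeof(p));                 /* zero padding and dilation */
    p.stride_w = 1;
    p.stride_h = 1;
    p.stride_d = 1;
    p.overflow_policy = VX_CONVERT_POLICY_SATURATE;
    p.rounding_policy = VX_ROUND_POLICY_TO_ZERO;
    p.down_scale_size_rounding = VX_NN_DS_SIZE_ROUNDING_FLOOR;
    p.depth_multiplier = 0;                   /* 0 = regular convolution, >=1 = depthwise */
    /* pad_mode/pad_const are left zeroed in this sketch; a real caller sets them
     * from vx_pad_mode_e and a vx_scalar constant as documented above. */

    return vxConv3dLayer(graph, input, weights, biases, &p, sizeof(p), output);
}
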
+ * + * \ingroup group_cnn + */ +VX_API_ENTRY vx_weights_biases_parameter VX_API_CALL +vxCreateWeightsBiasesParameterFromTensors( + vx_enum layer_type, + vx_size num_of_dims, + vx_size * inputs_dims, + vx_uint32 pad_x, + vx_uint32 pad_y, + vx_uint32 pooling_size_x, + vx_uint32 pooling_size_y, + vx_enum down_scale_size_rounding, + vx_size * convolution_outputs_dims, + vx_size * pool_outputs_dims, + vx_weights_biases_parameter_optimizations_t *optimizations, + vx_tensor weights, + vx_tensor biases); + +/*! + * \brief Creates a reference to an opaque vx_weights_biases_parameter object. + * + * \param [in] layer_type The network type of objects to hold. Types allowed are: + * \arg VX_CONVOLUTIONAL_NETWORK_CONVOLUTION_LAYER for convolution layer. + * \arg VX_CONVOLUTIONAL_NETWORK_FULLYCONNECTED_LAYER for fullyconnected layer. + * \param [in] num_of_dims The dimention number of input & output image tensor. + * \param [in] inputs_dims The input tensor's dimension size. + * \param [in] convolution_outputs_dims The output's dimension size after covolution operation. + * \param [in] pool_outputs_dims The output's dimension size after pooling operation. + * \param [in] output_format The output tensor element type. + * \param [in] convolution_relu_pooling_params The convolution_relu_pooling_params Pointer to parameters of type \ref vx_nn_convolution_relu_pooling_params_t + * \param [in] size_of_convolution_relu_pooling_params The size in bytes of convolution_relu_pooling_params. + * \param [in] optimizations A optional param for \ref vx_weights_biases_parameter_optimizations_t. + * \param [in] weights The weights tensor which need be compressed. + * \param [in] biases The biases tensor which need be compressed. + * + * \returns An opaque vx_weights_biases_parameter reference with compressed kernel data. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * + * \ingroup group_cnn + */ +VX_API_ENTRY vx_weights_biases_parameter VX_API_CALL vxCreateWeightsBiasesParameterFromTensors2( + vx_enum layer_type, + vx_size num_of_dims, + vx_size * inputs_dims, + vx_size * convolution_outputs_dims, + vx_size * pool_outputs_dims, + vx_enum output_format, + const vx_nn_convolution_relu_pooling_params convolution_relu_pooling_params, + vx_size size_of_convolution_relu_pooling_params, + vx_weights_biases_parameter_optimizations_t *optimizations, + vx_tensor weights, + vx_tensor biases); + +/*! + * \brief Creates a reference to an opaque vx_weights_biases_parameter object. + * + * \param [in] layer_type The network type of objects to hold. Types allowed are: + * \arg VX_CONVOLUTIONAL_NETWORK_CONVOLUTION_LAYER for convolution layer. + * \arg VX_CONVOLUTIONAL_NETWORK_FULLYCONNECTED_LAYER for fullyconnected layer. + * \param [in] inputs_dims The input tensor's dimension size. + * \param [in] convolution_outputs_dims The output's dimension size after covolution operation. + * \param [in] pool_outputs_dims The output's dimension size after pooling operation. + * \param [in] convolution_relu_pooling_params The convolution_relu_pooling_params Pointer to parameters of type \ref vx_nn_convolution_relu_pooling_params_t + * \param [in] size_of_convolution_relu_pooling_params The size in bytes of convolution_relu_pooling_params. + * \param [in] optimizations A optional param for \ref vx_weights_biases_parameter_optimizations_t. + * \param [in] size_of_optimizations The size in bytes of optimizations. + * \param [in] weights The weights tensor which need be compressed. 
+ * \param [in] biases The biases tensor which need be compressed. + * + * \returns An opaque vx_weights_biases_parameter reference with compressed kernel data. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. + * + * \ingroup group_cnn + */ +VX_API_ENTRY vx_weights_biases_parameter VX_API_CALL vxCreateWeightsBiasesParameterFromTensors3( + vx_enum layer_type, + vx_size * inputs_dims, + vx_size * convolution_outputs_dims, + vx_size * pool_outputs_dims, + const vx_nn_convolution_relu_pooling_params convolution_relu_pooling_params, + vx_size size_of_convolution_relu_pooling_params, + vx_weights_biases_parameter_optimizations_t *optimizations, + vx_size size_of_optimizations, + vx_tensor weights, + vx_tensor biases); + +/*! + * \brief Creates a reference to an vx_weights_biases_parameter object. + * \param [in] context The OpenVX context object. + * \param [in] layer_type The network type of objects to hold. Types allowed are: + * \arg VX_CONVOLUTIONAL_NETWORK_CONVOLUTION_LAYER for convolution layer. + * \arg VX_CONVOLUTIONAL_NETWORK_FULLYCONNECTED_LAYER for fullyconnected layer. + * \param [in] num_of_dims The dimention number of input & output image tensor. + * \param [in] inputs_dims The input tensor's dimension size. + * \param [in] pad_x The number of elements subtracted at each side in the x dimension of the input. + * \param [in] pad_y The number of elements subtracted at each side in the y dimension of the input. + * \param [in] pooling_size_x The size of the pooling region in the x dimension, 0 means no pooling operation. + * \param [in] pooling_size_y The size of the pooling region in the y dimension, 0 means no pooling operation. + * \param [in] down_scale_size_rounding A VX_TYPE_ENUM of the vx_round_policy_e enumeration. + * \param [in] convolution_outputs_dims The output's dimension size after covolution operation. + * \param [in] pool_outputs_dims The output's dimension size after pooling operation. + * \param [in] weights_num_of_dims The dimention number of weights tensor. + * \param [in] weights_dims The dimention size of weights tensor. + * \param [in] weights_data_format The format of weights tensor. + * \param [in] weights_fixed_point_pos The fixed point position when the weights element type is int16/int8, if 0 calculations are performed in integer math. + * \param [in] biases_num_of_dims The dimention number of biases tensor. + * \param [in] biases_dims The dimention size of biases tensor. + * \param [in] biases_data_format The format of biases tensor. + * \param [in] biases_fixed_point_pos The fixed point position when the biases element type is int16/int8, if 0 calculations are performed in integer math. + * \param [in] raw_data_size The data size of compressed data. + * + * \returns A weightsbiases reference without compressed kernel data vx_weights_biases_parameter. Any possible errors preventing a + * successful creation should be checked using \ref vxGetStatus. 
+ * + * \ingroup group_cnn + */ +VX_API_ENTRY vx_weights_biases_parameter VX_API_CALL +vxCreateWeightsBiasesParameter( + vx_context context, + vx_enum layer_type, + vx_size num_of_dims, + vx_size * inputs_dims, + vx_uint32 pad_x, + vx_uint32 pad_y, + vx_uint32 pooling_size_x, + vx_uint32 pooling_size_y, + vx_enum down_scale_size_rounding, + vx_size * convolution_outputs_dims, + vx_size * pool_outputs_dims, + vx_size weights_num_of_dims, + vx_size * weights_dims, + vx_enum weights_data_format, + vx_int8 weights_fixed_point_pos, + vx_size biases_num_of_dims, + vx_size * biases_dims, + vx_enum biases_data_format, + vx_int8 biases_fixed_point_pos, + vx_uint32 raw_data_size + ); +#else /*! * \brief Creates a reference to a vx_weights_biases_parameter opaque object. * @@ -397,17 +560,6 @@ VX_API_ENTRY vx_weights_biases_parameter VX_API_CALL vxCreateWeightsBiasesParame vx_tensor weights, vx_tensor biases); -/*! \brief Releases the OpenVX object vx_weights_biases_parameter. - * \param [in] weights_bias The pointer to the reference to the vx_weights_biases_parameter. - * \post After returning from this function the reference is zeroed. - * \return A \ref vx_status_e enumeration. - * \retval VX_SUCCESS No errors. - * \retval VX_ERROR_INVALID_REFERENCE If weights_bias is not a vx_weights_biases_parameter. - * \pre \ref vxCreateWeightsBiasesParameterFromTensors / vxCreateWeightsBiasesParameterFromTensors2/ vxCreateWeightsBiasesParameter / vxCreateWeightsBiasesParameterFromStream - * \ingroup group_cnn - */ -VX_API_ENTRY vx_status VX_API_CALL vxReleaseWeightsBiasesParameter(vx_weights_biases_parameter *weights_bias); - /*! * \brief Creates a reference to an vx_weights_biases_parameter object. * \param [in] context The OpenVX context object. @@ -461,7 +613,18 @@ vxCreateWeightsBiasesParameter( vx_int8 biases_fixed_point_pos, vx_uint32 raw_data_size ); +#endif +/*! \brief Releases the OpenVX object vx_weights_biases_parameter. + * \param [in] weights_bias The pointer to the reference to the vx_weights_biases_parameter. + * \post After returning from this function the reference is zeroed. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS No errors. + * \retval VX_ERROR_INVALID_REFERENCE If weights_bias is not a vx_weights_biases_parameter. + * \pre \ref vxCreateWeightsBiasesParameterFromTensors / vxCreateWeightsBiasesParameterFromTensors2/ vxCreateWeightsBiasesParameter / vxCreateWeightsBiasesParameterFromStream + * \ingroup group_cnn + */ +VX_API_ENTRY vx_status VX_API_CALL vxReleaseWeightsBiasesParameter(vx_weights_biases_parameter *weights_bias); /*! \brief Input parameters for a gru operation. * \ingroup group_cnn * \version 0.5 @@ -900,6 +1063,7 @@ VX_API_ENTRY vx_node VX_API_CALL vxTensorTableLookupLayer( vx_lut InLut, vx_lut OutLut, vx_tensor output); + #ifdef __cplusplus } #endif diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_types.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_types.h index 0881c15..51bf129 100644 --- a/prebuilt-sdk/x86_64_linux/include/VX/vx_types.h +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_types.h @@ -444,6 +444,11 @@ enum vx_type_e { * \ingroup group_basic_features */ enum vx_status_e { + VX_ERROR_VENDOR_VSI_END = -2000, /*!< \brief A vendor defined error status end base. */ + /* add new error here*/ + VX_ERROR_CANCEL_JOB = -1001, /*!< \brief Indicates that a VIP job was cancelled. */ + VX_ERROR_VENDOR_VSI_START = -1000, /*!< \brief A vendor defined error status start base. 
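The vendor status range added above includes VX_ERROR_CANCEL_JOB, which pairs with the vxSysCancelJob entry point and the VX_GRAPH_STATE_CANCELLED graph state declared later in this diff (vx_viv_sys.h and vx_types.h). A hedged two-thread sketch; error handling is elided:

#include <VX/vx.h>
#include <VX/vx_viv_sys.h>

/* Worker thread: a cancelled VIP job surfaces as VX_ERROR_CANCEL_JOB here. */
vx_status run_graph(vx_graph graph)
{
    return vxProcessGraph(graph);
}

/* Control thread: ask the driver to abort all outstanding VIP jobs on this context. */
vx_status cancel_all_jobs(vx_context context)
{
    return vxSysCancelJob(context);
}
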
*/ + VX_STATUS_MIN = -25,/*!< \brief Indicates the lower bound of status codes in VX. Used for bounds checks only. */ /* add new codes here */ VX_ERROR_REFERENCE_NONZERO = -24,/*!< \brief Indicates that an operation did not complete due to a reference count being non-zero. */ @@ -718,6 +723,8 @@ enum vx_graph_state_e { VX_GRAPH_STATE_ABANDONED = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_GRAPH_STATE) + 0x3, /*! \brief The graph execution is completed and the graph is not scheduled for execution */ VX_GRAPH_STATE_COMPLETED = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_GRAPH_STATE) + 0x4, + /*! \brief The graph execution was cancelled */ + VX_GRAPH_STATE_CANCELLED = VX_ENUM_BASE(VX_ID_KHRONOS, VX_ENUM_GRAPH_STATE) + 0x5, }; /*! \brief The graph attributes list. diff --git a/prebuilt-sdk/x86_64_linux/include/VX/vx_viv_sys.h b/prebuilt-sdk/x86_64_linux/include/VX/vx_viv_sys.h index f97512f..e31ba0d 100644 --- a/prebuilt-sdk/x86_64_linux/include/VX/vx_viv_sys.h +++ b/prebuilt-sdk/x86_64_linux/include/VX/vx_viv_sys.h @@ -53,6 +53,19 @@ VX_API_ENTRY vx_status VX_API_CALL vxSysSetVipFrequency( vx_uint32 shaderFscaleValue ); +/*! \brief cancel all VIP processing jobs. + * \param [in] context The reference to the implementation context. + * \return A \ref vx_status_e enumeration. + * \retval VX_SUCCESS Cancelled all VIP processing job successfully + * and user can check return of vxProcessGraph() to get cancelled status. + * \retval VX_ERROR_INVAID_PARAMETERS Invalid context reference. + * \retval VX_ERROR_NOT_SUPPORTED Hardware does not support job cancellation. + * \retval VX_FAILURE Failed to cancel VIP proccessing job. + */ +VX_API_ENTRY vx_status VX_API_CALL vxSysCancelJob( + vx_context context + ); + #ifdef __cplusplus } #endif diff --git a/prebuilt-sdk/x86_64_linux/lib/libArchModelSw.so b/prebuilt-sdk/x86_64_linux/lib/libArchModelSw.so index 575b344..4831755 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libArchModelSw.so and b/prebuilt-sdk/x86_64_linux/lib/libArchModelSw.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libCLC.so b/prebuilt-sdk/x86_64_linux/lib/libCLC.so index ff87c25..e9101a5 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libCLC.so and b/prebuilt-sdk/x86_64_linux/lib/libCLC.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libEmulator.so b/prebuilt-sdk/x86_64_linux/lib/libEmulator.so index dbd7197..2d30e1e 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libEmulator.so and b/prebuilt-sdk/x86_64_linux/lib/libEmulator.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libGAL.so b/prebuilt-sdk/x86_64_linux/lib/libGAL.so index 0439666..690ba12 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libGAL.so and b/prebuilt-sdk/x86_64_linux/lib/libGAL.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libNNArchPerf.so b/prebuilt-sdk/x86_64_linux/lib/libNNArchPerf.so index 9a4e15c..6a2cefc 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libNNArchPerf.so and b/prebuilt-sdk/x86_64_linux/lib/libNNArchPerf.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1.3.0 b/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1.3.0 index 99ec9c8..29fffa4 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1.3.0 and b/prebuilt-sdk/x86_64_linux/lib/libOpenVX.so.1.3.0 differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libOpenVXC.so b/prebuilt-sdk/x86_64_linux/lib/libOpenVXC.so index 44e37de..e33fc05 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libOpenVXC.so and b/prebuilt-sdk/x86_64_linux/lib/libOpenVXC.so differ diff --git 
a/prebuilt-sdk/x86_64_linux/lib/libOpenVXU.so b/prebuilt-sdk/x86_64_linux/lib/libOpenVXU.so index 50c2a10..0d2a6c0 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libOpenVXU.so and b/prebuilt-sdk/x86_64_linux/lib/libOpenVXU.so differ diff --git a/prebuilt-sdk/x86_64_linux/lib/libVSC.so b/prebuilt-sdk/x86_64_linux/lib/libVSC.so index 07646f8..e8b7c99 100755 Binary files a/prebuilt-sdk/x86_64_linux/lib/libVSC.so and b/prebuilt-sdk/x86_64_linux/lib/libVSC.so differ diff --git a/src/tim/vx/internal/BUILD b/src/tim/vx/internal/BUILD index 186c6a9..392f1ec 100644 --- a/src/tim/vx/internal/BUILD +++ b/src/tim/vx/internal/BUILD @@ -25,7 +25,6 @@ filegroup( srcs = glob([ "include/kernel/cl/*.h", "include/kernel/evis/*.h", - "include/kernel/cpu/*.h", ]) ) @@ -34,7 +33,6 @@ filegroup( srcs = glob([ "src/kernel/cl/*.c", "src/kernel/evis/*.c", - "src/kernel/cpu/*.c", "src/kernel/vx/*.c", ]) ) @@ -137,6 +135,7 @@ cc_library( "include/kernel/vsi_nn_kernel_eltwise.h", "include/kernel/vsi_nn_kernel_node.h", "include/kernel/vsi_nn_kernel_gpu_shape_optimize.h", + "include/kernel/vsi_nn_kernel_lut.h", "include/vsi_nn_error.h", # libnnext @@ -193,6 +192,7 @@ cc_library( "src/kernel/vsi_nn_kernel_selector.c", "src/kernel/vsi_nn_kernel_node.c", "src/kernel/vsi_nn_kernel_param.c", + "src/kernel/vsi_nn_kernel_lut.c", "src/kernel/vsi_nn_gpu.c", "src/kernel/vsi_nn_kernel_gpu_shape_optimize.c", "src/libnnext/vsi_nn_libnnext_resource.c", diff --git a/src/tim/vx/internal/include/interface/ops.def b/src/tim/vx/internal/include/interface/ops.def index 0590aad..cf5bebb 100644 --- a/src/tim/vx/internal/include/interface/ops.def +++ b/src/tim/vx/internal/include/interface/ops.def @@ -163,3 +163,5 @@ DEF_OP(CONV2D_LSTM_CELL) DEF_OP(GRU) DEF_OP(GRUCELL) DEF_OP(GRUCELL_ACTIVATION) +DEF_OP(RESHAPE2) +DEF_OP(CONV3D) diff --git a/src/tim/vx/internal/include/internal/internal_ops.def b/src/tim/vx/internal/include/internal/internal_ops.def index ab04552..06dbc61 100644 --- a/src/tim/vx/internal/include/internal/internal_ops.def +++ b/src/tim/vx/internal/include/internal/internal_ops.def @@ -17,3 +17,5 @@ DEF_OP(GRUCELL_ACTIVATION_INTERNAL_SMA) DEF_OP(RESIZE_1D_BILINEAR_INTERNAL) DEF_OP(RESIZE_1D_NEAREST_INTERNAL) DEF_OP(SPACE2DEPTH_INTERNAL) +DEF_OP(GRUCELL_H_TIMES_ACTIVATION_R) +DEF_OP(GRUCELL_ACTIVATION_Z_H) diff --git a/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h b/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h index 73cfcd7..05222b2 100644 --- a/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h +++ b/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h @@ -48,6 +48,7 @@ typedef enum VSI_NN_KERNEL_TYPE_EVIS, VSI_NN_KERNEL_TYPE_CL, VSI_NN_KERNEL_TYPE_VX, + VSI_NN_KERNEL_TYPE_SP, VSI_NN_KERNEL_TYPE_NUM, VSI_NN_KERNEL_TYPE_NONE = VSI_NN_KERNEL_TYPE_NUM } vsi_nn_kernel_type_e; @@ -75,7 +76,9 @@ typedef enum F32, F64, BF16, - BOOL8 + BOOL8, + I4, + U4, } vsi_nn_kernel_dtype_e; typedef enum @@ -303,6 +306,8 @@ const void * vsi_nn_kernel_param_get_const_buffer REGISTER_KERNEL_BACKEND(operation, CPU, func) #define REGISTER_BACKEND_OPENVX(operation, func) \ REGISTER_KERNEL_BACKEND(operation, VX, func) +#define REGISTER_BACKEND_STREAM_PROCESSOR(operation, func) \ + REGISTER_KERNEL_BACKEND(operation, SP, func) #define DEF_KERNEL_BASE_CALLBACK( NAME ) \ static vsi_status NAME##_impl( vsi_nn_kernel_node_t node, \ @@ -478,6 +483,10 @@ static inline vsi_nn_kernel_dtype_e vsi_nn_kernel_map_dtype { switch( dtype ) { + case VSI_NN_TYPE_INT4: + return I4; + case VSI_NN_TYPE_UINT4: + return U4; case VSI_NN_TYPE_INT8: return I8; case 
VSI_NN_TYPE_BOOL8: @@ -514,6 +523,10 @@ static inline vsi_nn_type_e vsi_nn_dtype_map_kernel { switch( dtype ) { + case I4: + return VSI_NN_TYPE_INT4; + case U4: + return VSI_NN_TYPE_UINT4; case I8: return VSI_NN_TYPE_INT8; case BOOL8: @@ -572,6 +585,38 @@ static inline size_t vsi_nn_kernel_dtype_get_bytes return 0; } /* vsi_nn_kernel_dtype_get_bytes() */ +static inline vsi_size_t vsi_nn_kernel_dtype_get_bits + ( + vsi_nn_kernel_dtype_e dtype + ) +{ + switch( dtype ) + { + case I4: + case U4: + return 4; + case I8: + case U8: + case BOOL8: + return 8; + case I16: + case U16: + case F16: + case BF16: + return 16; + case I32: + case U32: + case F32: + return 32; + case I64: + return 64; + default: + VSILOGE("Error data type %d", dtype); + break; + } + return 0; +} /* vsi_nn_kernel_dtype_get_bits() */ + static inline vsi_nn_kernel_quant_type_e vsi_nn_kernel_map_quant_type ( vsi_nn_qnt_type_e quant_type ) { @@ -615,6 +660,12 @@ static inline void vsi_nn_kernel_scalar_release } } /* vsi_nn_kernel_scalar_relase() */ +vsi_status vsi_nn_kernel_scalar_read_uint4 + ( vsi_nn_kernel_scalar_t scalar, uint8_t * out_data ); + +vsi_status vsi_nn_kernel_scalar_read_int4 + ( vsi_nn_kernel_scalar_t scalar, int8_t * out_data ); + vsi_status vsi_nn_kernel_scalar_read_int8 ( vsi_nn_kernel_scalar_t scalar, int8_t * out_data ); @@ -751,25 +802,90 @@ static inline vsi_size_t vsi_nn_kernel_tensor_attr_get_size static inline vsi_size_t vsi_nn_kernel_tensor_attr_get_bytes ( const vsi_nn_kernel_tensor_attr_t * attr ) { - vsi_size_t size; - vsi_size_t type_bytes; + vsi_size_t i = 0; + vsi_size_t bytes; + vsi_size_t bits_num; + vsi_size_t * shape = NULL; if( !attr ) { return 0; } - size = vsi_nn_kernel_tensor_attr_get_size( attr ); - type_bytes = (vsi_size_t)vsi_nn_kernel_dtype_get_bytes( attr->dtype ); - return size * type_bytes; + + shape = attr->shape->data; + + bits_num = vsi_nn_kernel_dtype_get_bits( attr->dtype ); + if ( bits_num < BITS_PER_BYTE ) + { + if (shape[0] % 2 == 0) + { + bytes = shape[0] / 2; + } + else + { + bytes = shape[0] / 2 + shape[0] % 2; + } + } + else + { + bytes = shape[0] * bits_num / BITS_PER_BYTE; + } + for ( i = 1; i < (vsi_size_t)attr->shape->size; i ++ ) + { + bytes *= shape[i]; + } + + return bytes; } /* vsi_nn_kernel_tensor_attr_get_bytes() */ static inline void vsi_nn_kernel_tensor_attr_get_stride ( const vsi_nn_kernel_tensor_attr_t * attr, vsi_size_t * out_stride) { + vsi_size_t type_bits; + vsi_size_t total_bytes; + vsi_size_t * shape = NULL; + if( !attr || !out_stride ) { return; } - vsi_nn_shape_get_stride( attr->shape->data, (vsi_size_t)attr->shape->size, out_stride ); + + shape = attr->shape->data; + type_bits = vsi_nn_kernel_dtype_get_bits( attr->dtype ); + + if ( type_bits < BITS_PER_BYTE ) + { + vsi_size_t i; + + out_stride[0] = type_bits / BITS_PER_BYTE; + total_bytes = out_stride[0]; + + total_bytes = 1; + if ( shape[0] % (BITS_PER_BYTE / type_bits) == 0 ) + { + out_stride[1] = shape[0] * type_bits / BITS_PER_BYTE; + } + else + { + out_stride[1] = shape[0] * type_bits / BITS_PER_BYTE + 1; + } + + total_bytes *= out_stride[1]; + for (i = 2; i < (vsi_size_t)attr->shape->size; i++) + { + out_stride[i] = shape[i - 1] * out_stride[i - 1]; + total_bytes *= shape[i]; + } + total_bytes *= shape[1]; + + for( i = (vsi_size_t)attr->shape->size; i < VSI_NN_MAX_DIM_NUM; i ++ ) + { + out_stride[i] = total_bytes; + } + } + else + { + vsi_nn_shape_get_stride( attr->shape->data, (vsi_size_t)attr->shape->size, out_stride ); + } } /* vsi_nn_kernel_tensor_attr_get_size() */ static inline 
vsi_bool vsi_nn_kernel_tensor_attr_is_quantized @@ -903,12 +1019,115 @@ static inline const char* vsi_nn_kernel_type_str return "CL"; case VSI_NN_KERNEL_TYPE_VX: return "OPENVX"; + case VSI_NN_KERNEL_TYPE_SP: + return "STERAM_PROCESSOR"; default: break; } return "None"; } /* vsi_nn_kernel_type_str() */ +static inline vsi_status vsi_nn_kernel_unpack_4bit_data + ( + const vsi_nn_kernel_tensor_attr_t * attr, + uint8_t * src, + uint8_t * dest, + vsi_nn_kernel_dtype_e dtype + ) +{ + vsi_status status; + uint32_t i = 0, j = 0; + uint8_t high = 0, low = 0; + vsi_size_t stride[VSI_NN_MAX_DIM_NUM] = {0}; + vsi_size_t src_size; + + status = VSI_SUCCESS; + vsi_nn_kernel_tensor_attr_get_stride( attr, stride ); + + src_size = stride[attr->shape->size]; + + for ( i = 0 ; i < src_size; i++) + { + high = src[i] >> 4; + low = src[i] & 0x0F; + if ( dtype == I4 ) + { + if( high > 7) + { + high = high | 0xF0; + } + if( low > 7) + { + low = low | 0xF0; + } + } + if ( attr->shape->data[0] % stride[1] == 0 ) + { + if ( attr->shape->data[0] == 1 ) + { + dest[j] = low; + j++; + } + else + { + dest[j] = low; + dest[j+1] = high; + j += 2; + } + } + else + { + if ( (i+1) % stride[1] == 0 ) + { + dest[j] = low; + j++; + } + else + { + dest[j] = low; + dest[j+1] = high; + j += 2; + } + } + } + + return status; +} + +static inline vsi_status vsi_nn_kernel_pack_4bit_data + ( + const vsi_nn_kernel_tensor_attr_t * attr, + uint8_t * src, + uint8_t * dest + ) +{ + vsi_status status; + uint32_t i = 0, j = 0; + uint8_t high = 0, low = 0; + vsi_size_t src_size; + + status = VSI_SUCCESS; + src_size = vsi_nn_kernel_tensor_attr_get_size( attr ); + for ( i = 0; i < src_size; i++ ) + { + if ( (i+1) % attr->shape->data[0] == 0) + { + high = 0; + low = src[i]; + } + else + { + high = src[i+1]; + low = src[i]; + i++; + } + dest[j] = (high << 4) | (low & 0xF); + j++; + } + + return status; +} + __END_DECLS #endif diff --git a/src/tim/vx/internal/include/kernel/vsi_nn_kernel_lut.h b/src/tim/vx/internal/include/kernel/vsi_nn_kernel_lut.h new file mode 100644 index 0000000..f5da0f1 --- /dev/null +++ b/src/tim/vx/internal/include/kernel/vsi_nn_kernel_lut.h @@ -0,0 +1,75 @@ +/**************************************************************************** +* +* Copyright (c) 2021 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
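The new I4/U4 handling above stores two elements per byte and rounds the innermost dimension up to whole bytes (see vsi_nn_kernel_tensor_attr_get_bytes and the pack/unpack helpers). A small standalone sketch of the same byte-count arithmetic, independent of the kernel attribute structs:

#include <stddef.h>

/* Bytes needed for a packed 4-bit tensor: two nibbles per byte in the innermost
 * dimension, padded up to a whole byte; outer dimensions simply multiply. */
static size_t packed_4bit_bytes(const size_t *shape, size_t rank)
{
    size_t bytes = shape[0] / 2 + shape[0] % 2;   /* innermost row, in bytes */
    size_t i;
    for (i = 1; i < rank; i++)
    {
        bytes *= shape[i];
    }
    return bytes;
}

/* Example: an INT4 tensor of shape {5, 3} needs (5/2 + 1) * 3 = 9 bytes. */
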
+* +*****************************************************************************/ + +#ifndef _VSI_NN_KERNEL_LUT_H +#define _VSI_NN_KERNEL_LUT_H + +#include + +__BEGIN_DECLS + +typedef int32_t vsi_nn_kernel_lut_act_e; enum +{ + VSI_NN_KERNEL_LUT_NONE = 0, + VSI_NN_KERNEL_LUT_MISH = 1, + VSI_NN_KERNEL_LUT_LOG = 2, + VSI_NN_KERNEL_LUT_EXP = 3, + VSI_NN_KERNEL_LUT_ELU = 4, + VSI_NN_KERNEL_LUT_NEG = 5, + VSI_NN_KERNEL_LUT_HSIGMOID = 6, + VSI_NN_KERNEL_LUT_SOFT_PLUS = 7, + VSI_NN_KERNEL_LUT_ERF = 8, + VSI_NN_KERNEL_LUT_GELU = 9, + VSI_NN_KERNEL_LUT_HGELU = 10, + VSI_NN_KERNEL_LUT_RELU_KERAS = 11, + VSI_NN_KERNEL_LUT_CLIP = 12, + VSI_NN_KERNEL_LUT_SQUARE = 13, +}; + +#define VSI_NN_KERNEL_LUT_MAX_SIZE (1024) +#define VSI_NN_KERNEL_LUT_FP16_MAX (57344) +#define VSI_NN_KERNEL_LUT_FP16_MIN (-57344) + +typedef struct _vsi_nn_kernel_lut_ +{ + float index; + float val; +} vsi_nn_kernel_lut_t; + +typedef struct _vsi_nn_kernel_lut_params +{ + vsi_enum act_type; + float params[16]; +} vsi_nn_kernel_lut_params; + +vsi_status vsi_nn_kernel_lut + ( + vx_lut index_lut, + vx_lut output_lut, + vsi_nn_kernel_lut_params *param + ); + +__END_DECLS + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_argmin.h b/src/tim/vx/internal/include/ops/vsi_nn_op_argmin.h index 7f43ec8..e9d1b70 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_argmin.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_argmin.h @@ -26,6 +26,9 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif #define VSI_NN_ARGMIN_SH_KERNEL_IDX(_AXIS, _INPUT_TYPE, _OUTPUT_TYPE, _IMAGE_DIMS) \ VSI_NN_ARGMIN_AXIS##_AXIS##_##_INPUT_TYPE##TO##_OUTPUT_TYPE##_##_IMAGE_DIMS##_KERNEL, @@ -110,5 +113,8 @@ typedef struct _vsi_nn_argmin_param int32_t axis; } vsi_nn_argmin_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_axis_aligned_bbox_transform.h b/src/tim/vx/internal/include/ops/vsi_nn_op_axis_aligned_bbox_transform.h index fcdd425..fb88141 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_axis_aligned_bbox_transform.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_axis_aligned_bbox_transform.h @@ -26,10 +26,17 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _vsi_nn_axis_aligned_bbox_transform_param { vsi_enum type; } vsi_nn_axis_aligned_bbox_transform_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_batchnorm_single.h b/src/tim/vx/internal/include/ops/vsi_nn_op_batchnorm_single.h index 36ccbfc..f4a4ffe 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_batchnorm_single.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_batchnorm_single.h @@ -27,6 +27,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + /* enum for inputs/outputs */ enum { @@ -50,5 +54,8 @@ typedef struct _vsi_nn_batchnorm_single_param float eps; } vsi_nn_batchnorm_single_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_bidirectional_sequence_lstm.h b/src/tim/vx/internal/include/ops/vsi_nn_op_bidirectional_sequence_lstm.h index b183d9a..8a4e7cb 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_bidirectional_sequence_lstm.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_bidirectional_sequence_lstm.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + enum { BI_LSTM_INPUT_INPUT = 0, @@ -132,5 +136,8 @@ typedef struct _vsi_nn_bidirectional_sequence_lstm_param vsi_nn_dtype_t *internal_dtype; } 
vsi_nn_bidirectional_sequence_lstm_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_bidirectional_sequence_rnn.h b/src/tim/vx/internal/include/ops/vsi_nn_op_bidirectional_sequence_rnn.h index 1c59ee3..2bf8c77 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_bidirectional_sequence_rnn.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_bidirectional_sequence_rnn.h @@ -27,6 +27,10 @@ #include "vsi_nn_types.h" #include "vsi_nn_op_rnn.h" +#ifdef __cplusplus +extern "C" { +#endif + /* enum for inputs/outputs */ enum { @@ -62,5 +66,8 @@ typedef struct _vsi_nn_bidirectional_sequence_rnn_param vsi_nn_dtype_t* internal_dtype; } vsi_nn_bidirectional_sequence_rnn_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_box_with_nms_limit.h b/src/tim/vx/internal/include/ops/vsi_nn_op_box_with_nms_limit.h index b4af7e4..505ae8e 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_box_with_nms_limit.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_box_with_nms_limit.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _vsi_nn_box_with_nms_limit_param { float score_threshold; @@ -36,5 +40,8 @@ typedef struct _vsi_nn_box_with_nms_limit_param float nms_score_threshold; } vsi_nn_box_with_nms_limit_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_cast.h b/src/tim/vx/internal/include/ops/vsi_nn_op_cast.h index 86fa568..fd9d3d0 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_cast.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_cast.h @@ -27,11 +27,18 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _vsi_nn_cast_param { // Add parameters here int32_t nothing; } vsi_nn_cast_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_conv2d_lstm.h b/src/tim/vx/internal/include/ops/vsi_nn_op_conv2d_lstm.h index e0eac95..919413e 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_conv2d_lstm.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_conv2d_lstm.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + enum { CONV2D_LSTM_IN_INPUT = 0, @@ -73,4 +77,8 @@ typedef struct _vsi_nn_conv2d_lstm_param vsi_nn_conv2d_param conv2d; } vsi_nn_conv2d_lstm_param; +#ifdef __cplusplus +} +#endif + #endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_conv2d_lstm_cell.h b/src/tim/vx/internal/include/ops/vsi_nn_op_conv2d_lstm_cell.h index bd306ad..9b83aad 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_conv2d_lstm_cell.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_conv2d_lstm_cell.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + #define CONV2D_LSTM_CELL_GATE_NUM 4 // i,f,c,o enum @@ -73,4 +77,8 @@ typedef struct _vsi_nn_conv2d_lstm_cell_param vsi_nn_conv2d_param conv2d; } vsi_nn_conv2d_lstm_cell_param; +#ifdef __cplusplus +} +#endif + #endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_conv3d.h b/src/tim/vx/internal/include/ops/vsi_nn_op_conv3d.h new file mode 100644 index 0000000..bf8bf2b --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_conv3d.h @@ -0,0 +1,58 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files 
(the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_CONV3D_H +#define _VSI_NN_OP_CONV3D_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_conv3d_param +{ + struct _conv3d_local_data_t* local; + // Add parameters here + /*w, h, d*/ + int32_t ksize[3]; + int32_t stride[3]; + int32_t dilation[3]; + + /* Pad left, right, top, bottom, front, rear*/ + int32_t pad[6]; + /* Pad type default value shall be AUTO */ + vsi_nn_pad_e pad_type; + int32_t weights; + + int32_t multiplier; +} vsi_nn_conv3d_param; +_compiler_assert(offsetof(vsi_nn_conv3d_param, local) == 0, \ + vsi_nn_conv3d_h ); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_detection_postprocess.h b/src/tim/vx/internal/include/ops/vsi_nn_op_detection_postprocess.h index 90fa87e..c37e50f 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_detection_postprocess.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_detection_postprocess.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _vsi_nn_detection_postprocess_param { float dy; @@ -41,5 +45,8 @@ typedef struct _vsi_nn_detection_postprocess_param int32_t is_bg_in_label; } vsi_nn_detection_postprocess_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_exp.h b/src/tim/vx/internal/include/ops/vsi_nn_op_exp.h index be7de22..4b5c16e 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_exp.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_exp.h @@ -26,6 +26,9 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif #define _VSI_NN_EXP_LOCAL_TENSOR_NUM 2 @@ -42,5 +45,8 @@ typedef struct _vsi_nn_exp_param vsi_nn_exp_lcl_data local; } vsi_nn_exp_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_extra_ending.h b/src/tim/vx/internal/include/ops/vsi_nn_op_extra_ending.h index 4eff2d0..38e132d 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_extra_ending.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_extra_ending.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + #define _VSI_NN_EXTRA_ENDING_LOCAL_TENSOR_NUM 3 typedef struct _vsi_nn_extra_ending_lcl_data @@ -44,5 +48,8 @@ typedef struct _vsi_nn_extra_ending_param int length; } vsi_nn_extra_ending_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_floor.h b/src/tim/vx/internal/include/ops/vsi_nn_op_floor.h index 
4066939..ae70b9c 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_floor.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_floor.h @@ -26,10 +26,17 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _vsi_nn_floor_param { vsi_enum type; } vsi_nn_floor_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_gelu.h b/src/tim/vx/internal/include/ops/vsi_nn_op_gelu.h index 5cb011c..dad8b37 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_gelu.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_gelu.h @@ -27,11 +27,17 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _vsi_nn_gelu_param { vsi_bool approximate; } vsi_nn_gelu_param; - +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_generate_proposals.h b/src/tim/vx/internal/include/ops/vsi_nn_op_generate_proposals.h index 1d5a365..cbe786b 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_generate_proposals.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_generate_proposals.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _vsi_nn_generate_proposals_param { float height_stride; @@ -37,5 +41,8 @@ typedef struct _vsi_nn_generate_proposals_param int32_t type; } vsi_nn_generate_proposals_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_grouped_conv1d.h b/src/tim/vx/internal/include/ops/vsi_nn_op_grouped_conv1d.h index f9470ee..fa571e9 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_grouped_conv1d.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_grouped_conv1d.h @@ -27,11 +27,14 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _grouped_conv1d_local_data_t { vsi_nn_tensor_t* input; vsi_nn_tensor_t* weight; vsi_nn_tensor_t* output; - } grouped_conv1d_local_data_t; typedef struct _vsi_nn_grouped_conv1d_param @@ -50,6 +53,8 @@ typedef struct _vsi_nn_grouped_conv1d_param int32_t multiplier; } vsi_nn_grouped_conv1d_param; - +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_grouped_conv2d.h b/src/tim/vx/internal/include/ops/vsi_nn_op_grouped_conv2d.h index 721ebbc..59858c0 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_grouped_conv2d.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_grouped_conv2d.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _vsi_nn_grouped_conv2d_param { uint32_t ksize[2]; @@ -41,5 +45,8 @@ typedef struct _vsi_nn_grouped_conv2d_param void* local; } vsi_nn_grouped_conv2d_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_gru.h b/src/tim/vx/internal/include/ops/vsi_nn_op_gru.h index fc2c24d..4985192 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_gru.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_gru.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + /* Define the inputs and outputs for GRU Layer */ enum { @@ -74,5 +78,8 @@ typedef struct _vsi_nn_gru_param _compiler_assert(offsetof(vsi_nn_gru_param, local) == 0, \ vsi_nn_gru_h ); +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_gru_ovxlib.h b/src/tim/vx/internal/include/ops/vsi_nn_op_gru_ovxlib.h index b4da1fc..19e4172 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_gru_ovxlib.h +++ 
b/src/tim/vx/internal/include/ops/vsi_nn_op_gru_ovxlib.h @@ -27,6 +27,10 @@ #include "vsi_nn_types.h" #include "vsi_nn_op_grucell_ovxlib.h" +#ifdef __cplusplus +extern "C" { +#endif + /* enum for inputs/outputs */ enum { @@ -74,5 +78,8 @@ typedef struct _vsi_nn_gru_ovxlib_param uint32_t cudnn_implementation_version; } vsi_nn_gru_ovxlib_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_grucell.h b/src/tim/vx/internal/include/ops/vsi_nn_op_grucell.h index 8407bda..da0c08e 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_grucell.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_grucell.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + enum { GRUCELL_GATES_Z = 0, @@ -81,4 +85,8 @@ typedef struct _vsi_nn_grucell_param _compiler_assert(offsetof(vsi_nn_grucell_param, local) == 0, \ vsi_nn_conv1d_h ); +#ifdef __cplusplus +} +#endif + #endif \ No newline at end of file diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_activation.h b/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_activation.h index 67a25e5..5eef114 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_activation.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_activation.h @@ -26,11 +26,18 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + enum { - GRUCELL_ACT_IN_H_STATE = 0, - GRUCELL_ACT_IN_INPUT_FC_H = 1, - GRUCELL_ACT_IN_H_T = 2, - GRUCELL_ACT_IN_Z_T = 3, + GRUCELL_ACT_H_STATE = 0, + GRUCELL_ACT_I_FC_Z = 1, + GRUCELL_ACT_I_FC_R = 2, + GRUCELL_ACT_I_FC_H = 3, + GRUCELL_ACT_H_FC_Z = 4, + GRUCELL_ACT_H_FC_R = 5, + GRUCELL_ACT_H_FC_H = 6, GRUCELL_ACT_IN_CNT, @@ -45,8 +52,13 @@ typedef struct _vsi_nn_grucell_activation_param struct _vsi_nn_grucell_activation_local * local; vsi_nn_activation_e activation; + vsi_nn_activation_e recurrent_activation; } vsi_nn_grucell_activation_param; _compiler_assert(offsetof(vsi_nn_grucell_activation_param, local) == 0, \ vsi_nn_grucell_activation_h ); +#ifdef __cplusplus +} +#endif + #endif \ No newline at end of file diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_activation_internal.h b/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_activation_internal.h index fe11a36..7b73d5a 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_activation_internal.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_activation_internal.h @@ -27,6 +27,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + enum { GRUCELL_ACTIVATION_INPUT_ZT_ = 0, GRUCELL_ACTIVATION_INPUT_HT__ = 1, @@ -83,5 +87,8 @@ typedef struct _vsi_nn_grucell_activation_internal_param grucell_activation_input_layout_e input_layout; } vsi_nn_grucell_activation_internal_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_activation_internal_sma.h b/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_activation_internal_sma.h index 51d76a4..555c81d 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_activation_internal_sma.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_activation_internal_sma.h @@ -27,6 +27,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + enum { GRUCELL_ACTIVATION_SMA_INPUT_H_STATE = 0, GRUCELL_ACTIVATION_SMA_INPUT_H_T_ = 1, @@ -47,5 +51,8 @@ typedef struct _vsi_nn_grucell_activation_internal_sma_param vsi_nn_grucell_activation_internal_sma_local* local; } vsi_nn_grucell_activation_internal_sma_param; +#ifdef __cplusplus +} 
#endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_activation_z_h.h b/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_activation_z_h.h new file mode 100644 index 0000000..70dc295 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_activation_z_h.h @@ -0,0 +1,63 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_GRUCELL_ACTIVATION_Z_H_H +#define _VSI_NN_OP_GRUCELL_ACTIVATION_Z_H_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +enum { + GRUCELL_ACT_Z_H_HSTATE = 0, + GRUCELL_ACT_Z_H_I_FC_Z = 1, + GRUCELL_ACT_Z_H_I_FC_H = 2, + GRUCELL_ACT_Z_H_H_FC_Z = 3, + GRUCELL_ACT_Z_H_H_FC_H = 4, + + GRUCELL_ACT_Z_H_IN_CNT, + + GRUCELL_ACT_Z_H_OUT_OUTPUT = 0, + GRUCELL_ACT_Z_H_OUT_HSTATE = 1, + + GRUCELL_ACT_Z_H_OUT_CNT +}; + +typedef struct _vsi_nn_grucell_activation_z_h_param +{ + struct _grucell_activation_z_h_local_data_t* local; + // Add parameters here + vsi_nn_activation_e activation; + vsi_nn_activation_e recurrent_activation; +} vsi_nn_grucell_activation_z_h_param; +_compiler_assert(offsetof(vsi_nn_grucell_activation_z_h_param, local) == 0, \ + vsi_nn_grucell_activation_z_h_h ); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_h_times_activation_r.h b/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_h_times_activation_r.h new file mode 100644 index 0000000..84695f2 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_h_times_activation_r.h @@ -0,0 +1,47 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_GRUCELL_H_TIMES_ACTIVATION_R_H +#define _VSI_NN_OP_GRUCELL_H_TIMES_ACTIVATION_R_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_grucell_h_times_activation_r_param +{ + struct _grucell_h_times_activation_r_local_data_t* local; + + vsi_nn_activation_e recurrent_activation; +} vsi_nn_grucell_h_times_activation_r_param; +_compiler_assert(offsetof(vsi_nn_grucell_h_times_activation_r_param, local) == 0, \ + vsi_nn_grucell_h_times_activation_r_h ); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_ovxlib.h b/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_ovxlib.h index 6006952..d53ee6b 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_ovxlib.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_ovxlib.h @@ -27,6 +27,10 @@ #include "vsi_nn_types.h" #include "vsi_nn_op_grucell_ovxlib.h" +#ifdef __cplusplus +extern "C" { +#endif + #define GRUCELL_RZ_GATE_COUNT 2 /* enum for inputs/outputs */ @@ -103,4 +107,8 @@ typedef struct _vsi_nn_grucell_ovxlib_param _compiler_assert(offsetof(vsi_nn_grucell_ovxlib_param, local) == 0, \ vsi_nn_vsi_nn_grucell_ovxlib_h ); +#ifdef __cplusplus +} +#endif + #endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_hard_sigmoid.h b/src/tim/vx/internal/include/ops/vsi_nn_op_hard_sigmoid.h new file mode 100644 index 0000000..c16d04d --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_hard_sigmoid.h @@ -0,0 +1,46 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_HARD_SIGMOID_H +#define _VSI_NN_OP_HARD_SIGMOID_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_hard_sigmoid_param +{ + void* local; + // Add parameters here + float alpha; + float beta; +} vsi_nn_hard_sigmoid_param; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_heatmap_max_keypoint.h b/src/tim/vx/internal/include/ops/vsi_nn_op_heatmap_max_keypoint.h index d1bdf04..4da1c79 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_heatmap_max_keypoint.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_heatmap_max_keypoint.h @@ -26,10 +26,17 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _vsi_nn_heatmap_max_keypoint_param { vsi_enum type; } vsi_nn_heatmap_max_keypoint_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_interp.h b/src/tim/vx/internal/include/ops/vsi_nn_op_interp.h index 5f1bfb2..5f52eb5 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_interp.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_interp.h @@ -27,6 +27,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _vsi_nn_interp_param { struct _interp_local_data_t* local; @@ -38,7 +42,8 @@ typedef struct _vsi_nn_interp_param int32_t pad_end; //padding at end of intput } vsi_nn_interp_param; - - +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_log.h b/src/tim/vx/internal/include/ops/vsi_nn_op_log.h index 362f4da..8def574 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_log.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_log.h @@ -26,6 +26,9 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif #define VSI_NN_LOG_SH_KERNEL_IDX(_INPUT_TYPE, _OUTPUT_TYPE, _IMAGE_DIMS) \ VSI_NN_LOG_##_INPUT_TYPE##TO##_OUTPUT_TYPE##_##_IMAGE_DIMS##_KERNEL, @@ -113,6 +116,8 @@ typedef struct _vsi_nn_log_param vsi_nn_log_lcl_data local; } vsi_nn_log_param; - +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_log_softmax.h b/src/tim/vx/internal/include/ops/vsi_nn_op_log_softmax.h index 26f3baf..913b5ce 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_log_softmax.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_log_softmax.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + #define VSI_NN_LOGSOFTMAX_SH_KERNEL_IDX(_AXIS, _INPUT_TYPE, _OUTPUT_TYPE, _IMAGE_DIMS) \ VSI_NN_LOGSOFTMAX_AXIS##_AXIS##_##_INPUT_TYPE##TO##_OUTPUT_TYPE##_##_IMAGE_DIMS##_KERNEL, @@ -150,5 +154,8 @@ typedef struct _vsi_nn_log_softmax_param int32_t axis; } vsi_nn_log_softmax_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_lsh_projection.h b/src/tim/vx/internal/include/ops/vsi_nn_op_lsh_projection.h index 099c645..6df03f5 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_lsh_projection.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_lsh_projection.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef enum { VSI_NN_LSH_PROJECTION_SPARSE = 1, @@ -37,5 +41,8 @@ typedef struct _vsi_nn_lsh_projection_param vsi_nn_lsh_projection_type_e type; } vsi_nn_lsh_projection_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_lstm_ovxlib.h 
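vsi_nn_op_hard_sigmoid.h above exposes only alpha and beta, so the sketch below uses the conventional hard-sigmoid definition, clamp(alpha * x + beta, 0, 1); the exact clamp range applied by the kernel is an assumption, not stated in the header.

/* Conventional hard sigmoid; illustration only. */
static inline float hard_sigmoid_sketch(float x, float alpha, float beta)
{
    float y = alpha * x + beta;
    if (y < 0.0f) y = 0.0f;
    if (y > 1.0f) y = 1.0f;
    return y;
}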
b/src/tim/vx/internal/include/ops/vsi_nn_op_lstm_ovxlib.h index cf0ed9f..29c8cd1 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_lstm_ovxlib.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_lstm_ovxlib.h @@ -27,6 +27,10 @@ #include "vsi_nn_types.h" #include "vsi_nn_op_lstmunit.h" +#ifdef __cplusplus +extern "C" { +#endif + enum { LSTM_INPUT_INPUT = 0, @@ -100,5 +104,8 @@ typedef struct _vsi_nn_lstm_ovxlib_param uint32_t weights; /* compatible with LSTM, NOT used */ } vsi_nn_lstm_ovxlib_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_lstmunit_activation.h b/src/tim/vx/internal/include/ops/vsi_nn_op_lstmunit_activation.h index 08a9254..fa1389b 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_lstmunit_activation.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_lstmunit_activation.h @@ -27,6 +27,10 @@ #include "vsi_nn_types.h" #include "vsi_nn_op_lstmunit.h" +#ifdef __cplusplus +extern "C" { +#endif + /* c -> cifg, l -> layer norm, p -> projection, h -> peephole, b -> hybrid bias fp32, s -> standard*/ enum { @@ -96,5 +100,8 @@ typedef struct _vsi_nn_lstmunit_activation_param vsi_nn_activation_e recurrent_activation; } vsi_nn_lstmunit_activation_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_lstmunit_ovxlib.h b/src/tim/vx/internal/include/ops/vsi_nn_op_lstmunit_ovxlib.h index eaac01d..cc53d4c 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_lstmunit_ovxlib.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_lstmunit_ovxlib.h @@ -28,6 +28,10 @@ #include "vsi_nn_types.h" #include "vsi_nn_op_lstmunit.h" +#ifdef __cplusplus +extern "C" { +#endif + #define LSTMUNIT_IFCO_GATE_COUNT 4 /* enum for inputs/outputs */ @@ -274,4 +278,8 @@ typedef struct _vsi_nn_lstmunit_ovxlib_param vsi_nn_dtype_t *internal_dtype_aux; } vsi_nn_lstmunit_ovxlib_param; +#ifdef __cplusplus +} +#endif + #endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_neg.h b/src/tim/vx/internal/include/ops/vsi_nn_op_neg.h index ea85174..5ede2ad 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_neg.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_neg.h @@ -26,6 +26,9 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif #define _VSI_NN_ELU_LOCAL_TENSOR_NUM 2 @@ -34,5 +37,8 @@ typedef struct _vsi_nn_neg_param vx_tensor local_tensor[_VSI_NN_ELU_LOCAL_TENSOR_NUM]; } vsi_nn_neg_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_nms.h b/src/tim/vx/internal/include/ops/vsi_nn_op_nms.h index 174bb10..9f4b18c 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_nms.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_nms.h @@ -27,6 +27,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _vsi_nn_nms_param { int32_t max_output_size; @@ -35,4 +39,8 @@ typedef struct _vsi_nn_nms_param float soft_nms_sigma; } vsi_nn_nms_param; +#ifdef __cplusplus +} +#endif + #endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_one_hot.h b/src/tim/vx/internal/include/ops/vsi_nn_op_one_hot.h index 5cad574..28f3c64 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_one_hot.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_one_hot.h @@ -27,6 +27,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _vsi_nn_one_hot_param { struct _one_hot_local_data_t* local; @@ -39,4 +43,8 @@ typedef struct _vsi_nn_one_hot_param _compiler_assert(offsetof(vsi_nn_one_hot_param, local) == 0, \ 
vsi_nn_one_hot_h ); +#ifdef __cplusplus +} +#endif + #endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_post_process.h b/src/tim/vx/internal/include/ops/vsi_nn_op_post_process.h index 3ffe93f..160bc06 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_post_process.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_post_process.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + enum { POST_PROCESS_INPUT = 0, @@ -53,5 +57,8 @@ typedef struct _vsi_nn_post_process_param vsi_nn_post_process_lcl_data local; } vsi_nn_post_process_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process.h index 035320a..3f61413 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process.h @@ -27,6 +27,10 @@ #include "vsi_nn_types.h" #include "vsi_nn_pre_post_process.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef vsi_nn_preprocess_source_format_e vsi_nn_pre_process_type_e; enum @@ -80,5 +84,9 @@ typedef struct _vsi_nn_pre_process_param vsi_nn_pre_process_lcl_data *local; } vsi_nn_pre_process_param; + +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_bgra.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_bgra.h index 6b7add6..d01fba8 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_bgra.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_bgra.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _vsi_nn_pre_process_bgra_lcl_data { int32_t scale_x; @@ -65,5 +69,8 @@ typedef struct _vsi_nn_pre_process_bgra_param vsi_nn_pre_process_bgra_lcl_data local; } vsi_nn_pre_process_bgra_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_gray.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_gray.h index 459e25d..604184f 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_gray.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_gray.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + enum { PRE_PROCESS_GRAY_INPUT = 0, @@ -67,5 +71,8 @@ typedef struct _vsi_nn_pre_process_gray_param vsi_nn_pre_process_gray_lcl_data local; } vsi_nn_pre_process_gray_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_rgb.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_rgb.h index 63e9335..da52fa0 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_rgb.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_rgb.h @@ -26,6 +26,9 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif enum { @@ -77,5 +80,8 @@ typedef struct _vsi_nn_pre_process_rgb_param vsi_nn_pre_process_rgb_lcl_data local; } vsi_nn_pre_process_rgb_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_tensor.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_tensor.h index b70094f..efe64e4 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_tensor.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_tensor.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + enum { PRE_PROCESS_TENSOR_INPUT = 0, @@ -53,5 +57,8 @@ typedef struct 
_vsi_nn_pre_process_tensor_param vsi_nn_pre_process_tensor_lcl_data local; } vsi_nn_pre_process_tensor_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_quantized_16bit_lstm.h b/src/tim/vx/internal/include/ops/vsi_nn_op_quantized_16bit_lstm.h index ec127d2..8e178d6 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_quantized_16bit_lstm.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_quantized_16bit_lstm.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + enum { Q16_LSTM_INPUT_INPUT = 0, @@ -60,5 +64,8 @@ typedef struct _vsi_nn_quantized_16bit_lstm_param void* local; } vsi_nn_quantized_16bit_lstm_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_random_multinomial.h b/src/tim/vx/internal/include/ops/vsi_nn_op_random_multinomial.h index 34b7769..cd862a6 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_random_multinomial.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_random_multinomial.h @@ -26,10 +26,17 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _vsi_nn_random_multinomial_param { int32_t sample_num; } vsi_nn_random_multinomial_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_reduceall_internal.h b/src/tim/vx/internal/include/ops/vsi_nn_op_reduceall_internal.h index 24cca15..0df389e 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_reduceall_internal.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_reduceall_internal.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + #define VSI_NN_REDUCEALL_SH_KERNEL_IDX(_AXIS, _INPUT_TYPE, _OUTPUT_TYPE, _IMAGE_DIMS) \ VSI_NN_REDUCEALL_AXIS##_AXIS##_##_INPUT_TYPE##TO##_OUTPUT_TYPE##_##_IMAGE_DIMS##_KERNEL, @@ -68,5 +72,8 @@ typedef struct _vsi_nn_reduceall_internal_param vx_bool keep_dim; } vsi_nn_reduceall_internal_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_reduceany_internal.h b/src/tim/vx/internal/include/ops/vsi_nn_op_reduceany_internal.h index a316c82..babdb69 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_reduceany_internal.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_reduceany_internal.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + #define VSI_NN_REDUCEANY_SH_KERNEL_IDX(_AXIS, _INPUT_TYPE, _OUTPUT_TYPE, _IMAGE_DIMS) \ VSI_NN_REDUCEANY_AXIS##_AXIS##_##_INPUT_TYPE##TO##_OUTPUT_TYPE##_##_IMAGE_DIMS##_KERNEL, @@ -68,5 +72,8 @@ typedef struct _vsi_nn_reduceany_internal_param vx_bool keep_dim; } vsi_nn_reduceany_internal_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_reducemax_internal.h b/src/tim/vx/internal/include/ops/vsi_nn_op_reducemax_internal.h index 9219983..b2ff2cb 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_reducemax_internal.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_reducemax_internal.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + #define VSI_NN_REDUCEMAX_SH_KERNEL_IDX(_AXIS, _INPUT_TYPE, _OUTPUT_TYPE, _IMAGE_DIMS) \ VSI_NN_REDUCEMAX_AXIS##_AXIS##_##_INPUT_TYPE##TO##_OUTPUT_TYPE##_##_IMAGE_DIMS##_KERNEL, @@ -113,5 +117,8 @@ typedef struct _vsi_nn_reducemax_internal_param vx_bool keep_dim; } vsi_nn_reducemax_internal_param; +#ifdef __cplusplus +} #endif +#endif diff --git 
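The *_SH_KERNEL_IDX macros in these reduce headers build enumerator names by token pasting. With illustrative arguments (the concrete axis/type/dims spellings are supplied elsewhere, not in these headers), the expansion looks like this:

/* Illustrative expansion only; I8 and IMAGE_2D are placeholder spellings. */
VSI_NN_REDUCEMAX_SH_KERNEL_IDX(0, I8, I8, IMAGE_2D)
/* expands to the single enumerator: */
VSI_NN_REDUCEMAX_AXIS0_I8TOI8_IMAGE_2D_KERNEL,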
a/src/tim/vx/internal/include/ops/vsi_nn_op_reducemin_internal.h b/src/tim/vx/internal/include/ops/vsi_nn_op_reducemin_internal.h index ee32dd1..5f4ae52 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_reducemin_internal.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_reducemin_internal.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + #define VSI_NN_REDUCEMIN_SH_KERNEL_IDX(_AXIS, _INPUT_TYPE, _OUTPUT_TYPE, _IMAGE_DIMS) \ VSI_NN_REDUCEMIN_AXIS##_AXIS##_##_INPUT_TYPE##TO##_OUTPUT_TYPE##_##_IMAGE_DIMS##_KERNEL, @@ -113,5 +117,8 @@ typedef struct _vsi_nn_reducemin_internal_param vx_bool keep_dim; } vsi_nn_reducemin_internal_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_reduceprod_internal.h b/src/tim/vx/internal/include/ops/vsi_nn_op_reduceprod_internal.h index b2c830d..2a7f8a7 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_reduceprod_internal.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_reduceprod_internal.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + #define VSI_NN_REDUCEPROD_SH_KERNEL_IDX(_AXIS, _INPUT_TYPE, _OUTPUT_TYPE, _IMAGE_DIMS) \ VSI_NN_REDUCEPROD_AXIS##_AXIS##_##_INPUT_TYPE##TO##_OUTPUT_TYPE##_##_IMAGE_DIMS##_KERNEL, @@ -118,5 +122,8 @@ typedef struct _vsi_nn_reduceprod_internal_param vx_bool keep_dim; } vsi_nn_reduceprod_internal_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_reducesum_internal.h b/src/tim/vx/internal/include/ops/vsi_nn_op_reducesum_internal.h index 69ca355..337df79 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_reducesum_internal.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_reducesum_internal.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _vsi_nn_reducesum_lcl_data_t { vsi_nn_tensor_t *reshaped_input; @@ -40,5 +44,8 @@ typedef struct _vsi_nn_reducesum_internal_param vsi_nn_reducesum_lcl_data_t* local; } vsi_nn_reducesum_internal_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_relu_keras.h b/src/tim/vx/internal/include/ops/vsi_nn_op_relu_keras.h index fdc582d..02fb3ba 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_relu_keras.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_relu_keras.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _vsi_nn_relu_keras_param { float alpha; @@ -33,5 +37,8 @@ typedef struct _vsi_nn_relu_keras_param float threshold; } vsi_nn_relu_keras_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_relu_keras_internal.h b/src/tim/vx/internal/include/ops/vsi_nn_op_relu_keras_internal.h index 4e30fb9..b7bccda 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_relu_keras_internal.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_relu_keras_internal.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + #define _VSI_NN_RELU_KERAS_INTERNAL_LOCAL_TENSOR_NUM 2 typedef struct _vsi_nn_relu_keras_internal_lcl_data @@ -44,5 +48,8 @@ typedef struct _vsi_nn_relu_keras_internal_param float threshold; } vsi_nn_relu_keras_internal_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_reshape.h b/src/tim/vx/internal/include/ops/vsi_nn_op_reshape.h index a41377a..1b5ca0b 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_reshape.h 
+++ b/src/tim/vx/internal/include/ops/vsi_nn_op_reshape.h @@ -37,7 +37,7 @@ typedef struct _vsi_nn_reshape_lcl_data typedef struct _vsi_nn_reshape_param { - const vsi_size_t * size; + const uint32_t * size; uint32_t dim_num; /* reshape layer local data structure */ diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_reshape2.h b/src/tim/vx/internal/include/ops/vsi_nn_op_reshape2.h new file mode 100644 index 0000000..863a0ff --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_reshape2.h @@ -0,0 +1,53 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_RESHAPE2_H +#define _VSI_NN_OP_RESHAPE2_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_reshape2_local_data +{ + vsi_bool initialized; +} vsi_nn_reshape2_local_data; + +typedef struct _vsi_nn_reshape2_param +{ + vsi_nn_reshape2_local_data* local; + // Add parameters here + const vsi_size_t * size; + uint32_t dim_num; +} vsi_nn_reshape2_param; +_compiler_assert(offsetof(vsi_nn_reshape2_param, local) == 0, \ + vsi_nn_reshape2_h ); + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_resize.h b/src/tim/vx/internal/include/ops/vsi_nn_op_resize.h index 50270a1..aaa72c6 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_resize.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_resize.h @@ -41,10 +41,15 @@ typedef uint32_t vsi_nn_interpolation_type_t; enum VSI_NN_INTERPOLATION_AREA }; -typedef struct _vsi_nn_resize_lcl_data +typedef uint32_t vsi_nn_resize_layout_type_t; enum { - vx_tensor local_tensor[_VSI_NN_RESIZE_LOCAL_TENSOR_NUM]; -} vsi_nn_resize_lcl_data; + VSI_NN_RESIZE_LAYOUT_NCHW = 0, + VSI_NN_RESIZE_LAYOUT_NHWC +}; + +typedef struct _vsi_nn_resize_local_data { + vsi_bool use_internal_node; +} vsi_nn_resize_local_data; typedef struct _vsi_nn_resize_param { @@ -53,9 +58,16 @@ typedef struct _vsi_nn_resize_param int32_t size[2]; /* resize layer local data structure */ - vsi_nn_resize_lcl_data local; + union + { + vsi_nn_resize_local_data *lcl_data; + struct { + vx_tensor local_tensor[_VSI_NN_RESIZE_LOCAL_TENSOR_NUM]; + } reserved; + }; vsi_bool align_corners; vsi_bool half_pixel_centers; + vsi_enum layout; } vsi_nn_resize_param; #ifdef __cplusplus diff --git 
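In the vsi_nn_op_resize.h hunk above, the old embedded local_tensor array is kept in a reserved arm of a union beside the new lcl_data pointer, which reads as a way to change the local-data representation without disturbing the struct layout for existing users; that intent is an inference, not stated in the header. A minimal usage sketch of the updated fields, assuming the usual TRUE/FALSE macros, zero initialization, and <string.h>:

static void resize_param_usage_sketch(void)
{
    vsi_nn_resize_param p;
    memset(&p, 0, sizeof(p));                /* zero init is an assumption */
    p.size[0] = 224;
    p.size[1] = 224;
    p.align_corners = FALSE;
    p.half_pixel_centers = TRUE;
    p.layout = VSI_NN_RESIZE_LAYOUT_NHWC;    /* field and enum added by this patch */
    (void)p;
}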
a/src/tim/vx/internal/include/ops/vsi_nn_op_resize_1d.h b/src/tim/vx/internal/include/ops/vsi_nn_op_resize_1d.h index e85aa74..d996d04 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_resize_1d.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_resize_1d.h @@ -27,6 +27,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _vsi_nn_resize_1d_param { struct _resize_1d_local_data_t* local; @@ -40,5 +44,8 @@ typedef struct _vsi_nn_resize_1d_param _compiler_assert(offsetof(vsi_nn_resize_1d_param, local) == 0, \ vsi_nn_resize_1d_h ); +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_resize_1d_bilinear_internal.h b/src/tim/vx/internal/include/ops/vsi_nn_op_resize_1d_bilinear_internal.h index 4e119c8..6948db3 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_resize_1d_bilinear_internal.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_resize_1d_bilinear_internal.h @@ -27,6 +27,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _vsi_nn_resize_1d_bilinear_internal_param { struct _resize_1d_bilinear_internal_local_data_t* local; @@ -38,5 +42,8 @@ typedef struct _vsi_nn_resize_1d_bilinear_internal_param _compiler_assert(offsetof(vsi_nn_resize_1d_bilinear_internal_param, local) == 0, \ vsi_nn_resize_1d_bilinear_internal_h ); +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_resize_1d_nearest_internal.h b/src/tim/vx/internal/include/ops/vsi_nn_op_resize_1d_nearest_internal.h index cc94051..a18af71 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_resize_1d_nearest_internal.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_resize_1d_nearest_internal.h @@ -27,6 +27,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _vsi_nn_resize_1d_nearest_internal_param { struct _resize_1d_nearest_internal_local_data_t* local; @@ -38,5 +42,8 @@ typedef struct _vsi_nn_resize_1d_nearest_internal_param _compiler_assert(offsetof(vsi_nn_resize_1d_nearest_internal_param, local) == 0, \ vsi_nn_resize_1d_nearest_internal_h ); +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_resize_internal.h b/src/tim/vx/internal/include/ops/vsi_nn_op_resize_internal.h index 578d943..6adc896 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_resize_internal.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_resize_internal.h @@ -26,6 +26,9 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif typedef struct _vsi_nn_resize_in_lcl_data { @@ -38,8 +41,12 @@ typedef struct _vsi_nn_resize_internal_param vsi_nn_resize_in_lcl_data *lcl_data_ptr; vsi_bool align_corners; vsi_bool half_pixel_centers; - float factor; + float factor; + vsi_enum layout; } vsi_nn_resize_internal_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_resize_nearest_internal.h b/src/tim/vx/internal/include/ops/vsi_nn_op_resize_nearest_internal.h index b700334..3f29b1c 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_resize_nearest_internal.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_resize_nearest_internal.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _vsi_nn_resize_nearest_in_lcl_data { uint32_t hash_idx; @@ -40,6 +44,8 @@ typedef struct _vsi_nn_resize_nearest_internal_param float factor; } vsi_nn_resize_nearest_internal_param; - +#ifdef __cplusplus +} #endif +#endif diff --git 
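The resize-internal parameter blocks above carry align_corners and half_pixel_centers flags. The sketch below shows the source-coordinate mappings these flags conventionally select; this is general resize math, not a transcription of the ovxlib kernels.

/* Conventional destination-to-source coordinate mapping; illustration only. */
static inline float resize_src_coord_sketch
    (
    int dst, int in_size, int out_size,
    int align_corners, int half_pixel_centers
    )
{
    if (align_corners && out_size > 1)
        return (float)dst * (float)(in_size - 1) / (float)(out_size - 1);
    if (half_pixel_centers)
        return ((float)dst + 0.5f) * (float)in_size / (float)out_size - 0.5f;
    return (float)dst * (float)in_size / (float)out_size;
}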
a/src/tim/vx/internal/include/ops/vsi_nn_op_rnncell_ovxlib.h b/src/tim/vx/internal/include/ops/vsi_nn_op_rnncell_ovxlib.h index 1f48989..b1da46d 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_rnncell_ovxlib.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_rnncell_ovxlib.h @@ -27,6 +27,10 @@ #include "vsi_nn_types.h" #include "vsi_nn_op_rnn.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _vsi_nn_rnncell_ovxlib_lcl_data_t { vsi_bool multi_batch; @@ -40,5 +44,8 @@ typedef struct _vsi_nn_rnncell_ovxlib_param vsi_nn_dtype_t* internal_dtype; } vsi_nn_rnncell_ovxlib_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_roi_align.h b/src/tim/vx/internal/include/ops/vsi_nn_op_roi_align.h index e61a33b..e24f043 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_roi_align.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_roi_align.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _vsi_nn_roi_align_param { int32_t output_height; @@ -36,5 +40,8 @@ typedef struct _vsi_nn_roi_align_param int32_t width_sample_num; } vsi_nn_roi_align_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_sin.h b/src/tim/vx/internal/include/ops/vsi_nn_op_sin.h index 719e520..3fa6d91 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_sin.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_sin.h @@ -26,6 +26,9 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif #define _VSI_NN_SIN_LOCAL_TENSOR_NUM 2 @@ -42,5 +45,8 @@ typedef struct _vsi_nn_sin_param vsi_nn_sin_lcl_data local; } vsi_nn_sin_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_softmax_internal.h b/src/tim/vx/internal/include/ops/vsi_nn_op_softmax_internal.h index 6e12636..1accf6d 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_softmax_internal.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_softmax_internal.h @@ -28,6 +28,10 @@ #include "vsi_nn_platform.h" #include "utils/vsi_nn_link_list.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _vsi_nn_softmax_internal_lcl_data { vsi_nn_link_list_t link_list; @@ -40,7 +44,11 @@ typedef struct _vsi_nn_softmax_internal_param { vsi_nn_softmax_internal_lcl_data *data; float beta; + int32_t axis; } vsi_nn_softmax_internal_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_squeeze.h b/src/tim/vx/internal/include/ops/vsi_nn_op_squeeze.h index 249ce2a..f28bfb4 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_squeeze.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_squeeze.h @@ -27,6 +27,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _vsi_nn_squeeze_param { // Add parameters here @@ -34,5 +38,8 @@ typedef struct _vsi_nn_squeeze_param vx_uint32 axis_num; } vsi_nn_squeeze_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_stack.h b/src/tim/vx/internal/include/ops/vsi_nn_op_stack.h index c75702a..cfd6d2b 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_stack.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_stack.h @@ -25,6 +25,11 @@ #define _VSI_NN_OP_STACK_H #include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + #define VSI_NN_STACK_MAX_INPUTS (16) typedef struct _vsi_nn_stack_lcl_data @@ -63,5 +68,8 @@ typedef struct _vsi_nn_stack_param uint32_t axis; } vsi_nn_stack_param; +#ifdef __cplusplus +} 
#endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_tensor_add_mean_stddev_norm.h b/src/tim/vx/internal/include/ops/vsi_nn_op_tensor_add_mean_stddev_norm.h index 3ce2d49..b60faff 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_tensor_add_mean_stddev_norm.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_tensor_add_mean_stddev_norm.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + #define _VSI_NN_TENSORADD_MEANSTDNORM_LOCAL_TENSOR_NUM 3 typedef struct _vsi_nn_tensoradd_meanstdnorm_lcl_data @@ -39,5 +43,8 @@ typedef struct _vsi_nn_tensor_add_mean_stddev_norm_param float eps; } vsi_nn_tensor_add_mean_stddev_norm_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_tile.h b/src/tim/vx/internal/include/ops/vsi_nn_op_tile.h index 258d696..c885d3c 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_tile.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_tile.h @@ -26,6 +26,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + #define _VSI_NN_TILE_LOCAL_TENSOR_NUM 2 typedef struct _vsi_nn_tile_lcl_data_t @@ -43,5 +47,8 @@ typedef struct _vsi_nn_tile_param uint32_t multiples_num; } vsi_nn_tile_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_topk.h b/src/tim/vx/internal/include/ops/vsi_nn_op_topk.h index 11fc2c4..7ab6ff2 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_topk.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_topk.h @@ -26,10 +26,17 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _vsi_nn_topk_param { uint32_t k; } vsi_nn_topk_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_unidirectional_sequence_rnn.h b/src/tim/vx/internal/include/ops/vsi_nn_op_unidirectional_sequence_rnn.h index a7281f9..985fe22 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_unidirectional_sequence_rnn.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_unidirectional_sequence_rnn.h @@ -27,6 +27,10 @@ #include "vsi_nn_types.h" #include "vsi_nn_op_rnn.h" +#ifdef __cplusplus +extern "C" { +#endif + /* enum for inputs/outputs */ enum { @@ -48,5 +52,8 @@ typedef struct _vsi_nn_unidirectional_sequence_rnn_param vsi_nn_dtype_t internal_dtype[RNNCELL_QUANTIZE_PARAM_COUNT]; } vsi_nn_unidirectional_sequence_rnn_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_unstack.h b/src/tim/vx/internal/include/ops/vsi_nn_op_unstack.h index 1ee8220..14360a6 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_unstack.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_unstack.h @@ -25,6 +25,11 @@ #define _VSI_NN_OP_UNSTACK_H #include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + #define VSI_NN_UNSTACK_MAX_OUTPUTS (16) typedef struct _vsi_nn_unstack_lcl_data @@ -39,5 +44,8 @@ typedef struct _vsi_nn_unstack_param uint32_t axis; } vsi_nn_unstack_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_upsamplescale.h b/src/tim/vx/internal/include/ops/vsi_nn_op_upsamplescale.h index f790da2..70f6eae 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_upsamplescale.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_upsamplescale.h @@ -27,6 +27,10 @@ #include "vsi_nn_types.h" +#ifdef __cplusplus +extern "C" { +#endif + typedef struct _vsi_nn_upsamplescale_param { struct _upsamplescale_local_data_t* local; @@ -35,5 +39,8 @@ typedef struct 
_vsi_nn_upsamplescale_param float scale; } vsi_nn_upsamplescale_param; +#ifdef __cplusplus +} #endif +#endif diff --git a/src/tim/vx/internal/include/utils/vsi_nn_constraint_check.h b/src/tim/vx/internal/include/utils/vsi_nn_constraint_check.h index a491adc..d7e5983 100644 --- a/src/tim/vx/internal/include/utils/vsi_nn_constraint_check.h +++ b/src/tim/vx/internal/include/utils/vsi_nn_constraint_check.h @@ -50,7 +50,9 @@ enum { D_F32 = VSI_NN_TYPE_FLOAT32, D_F64 = VSI_NN_TYPE_FLOAT64, D_BF16 = VSI_NN_TYPE_BFLOAT16, - D_BOOL8 = VSI_NN_TYPE_BOOL8 + D_BOOL8 = VSI_NN_TYPE_BOOL8, + D_I4 = VSI_NN_TYPE_INT4, + D_U4 = VSI_NN_TYPE_UINT4 }; /* short alias for qtype */ diff --git a/src/tim/vx/internal/include/utils/vsi_nn_dtype_util.h b/src/tim/vx/internal/include/utils/vsi_nn_dtype_util.h index 973f2ac..f575892 100644 --- a/src/tim/vx/internal/include/utils/vsi_nn_dtype_util.h +++ b/src/tim/vx/internal/include/utils/vsi_nn_dtype_util.h @@ -72,6 +72,16 @@ OVXLIB_API uint32_t vsi_nn_TypeGetBytes const vsi_nn_type_e type ); +OVXLIB_API uint32_t vsi_nn_TypeGetBytesExt + ( + const vsi_nn_type_e type + ); + +OVXLIB_API uint32_t vsi_nn_TypeGetBits + ( + const vsi_nn_type_e type + ); + OVXLIB_API uint16_t vsi_nn_Fp32ToFp16 ( float in diff --git a/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h b/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h index f017fe3..4586fa8 100644 --- a/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h +++ b/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h @@ -41,10 +41,12 @@ static inline vsi_bool type_is_integer ret = FALSE; switch( type ) { + case VSI_NN_TYPE_INT4: case VSI_NN_TYPE_INT8: case VSI_NN_TYPE_INT16: case VSI_NN_TYPE_INT32: case VSI_NN_TYPE_INT64: + case VSI_NN_TYPE_UINT4: case VSI_NN_TYPE_UINT8: case VSI_NN_TYPE_UINT16: case VSI_NN_TYPE_UINT32: @@ -67,6 +69,7 @@ static inline vsi_bool type_is_signed ret = FALSE; switch( type ) { + case VSI_NN_TYPE_INT4: case VSI_NN_TYPE_INT8: case VSI_NN_TYPE_INT16: case VSI_NN_TYPE_INT32: @@ -112,6 +115,38 @@ static inline uint32_t type_get_bytes } } /* type_get_bytes() */ +static inline uint32_t type_get_bits + ( + const vsi_nn_type_e type + ) +{ + switch( type ) + { + case VSI_NN_TYPE_INT4: + case VSI_NN_TYPE_UINT4: + return 4; + case VSI_NN_TYPE_INT8: + case VSI_NN_TYPE_UINT8: + case VSI_NN_TYPE_BOOL8: + return 8; + case VSI_NN_TYPE_INT16: + case VSI_NN_TYPE_UINT16: + case VSI_NN_TYPE_FLOAT16: + case VSI_NN_TYPE_BFLOAT16: + return 16; + case VSI_NN_TYPE_INT32: + case VSI_NN_TYPE_UINT32: + case VSI_NN_TYPE_FLOAT32: + return 32; + case VSI_NN_TYPE_INT64: + case VSI_NN_TYPE_UINT64: + case VSI_NN_TYPE_FLOAT64: + return 64; + default: + return 0; + } +} /* type_get_bits() */ + static inline void type_get_range ( vsi_nn_type_e type, @@ -123,8 +158,8 @@ static inline void type_get_range double from, to; from = 0.0; to = 0.0; - bits = type_get_bytes( type ) * 8; - if( type_is_integer( type ) ) + bits = type_get_bits( type ); + if( type_is_integer( type ) || bits > 0) { if( type_is_signed( type ) ) { @@ -240,6 +275,14 @@ static inline vsi_status integer_convert uint32_t src_sz = type_get_bytes( src_type ); uint32_t dest_sz = type_get_bytes( dest_type ); uint8_t* buffer = all_zeros; + if( src_sz == 0 ) + { + src_sz = 1; + } + if( dest_sz == 0) + { + dest_sz = 1; + } if( type_is_signed( src_type ) && (((int8_t *)src)[src_sz - 1] & 0x80) ) { buffer = all_ones; @@ -384,6 +427,8 @@ static inline vsi_status dtype_to_float32 case VSI_NN_TYPE_BFLOAT16: *dst = bfp16_to_fp32( *(int16_t *)src ); break; + case 
VSI_NN_TYPE_INT4: + case VSI_NN_TYPE_UINT4: case VSI_NN_TYPE_INT8: case VSI_NN_TYPE_BOOL8: case VSI_NN_TYPE_UINT8: @@ -397,6 +442,7 @@ static inline vsi_status dtype_to_float32 case VSI_NN_QNT_TYPE_DFP: *dst = dfp_to_fp32( src_value, src_dtype->fl, src_dtype->vx_type ); break; + case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: *dst = affine_to_fp32( src_value, src_dtype->scale, src_dtype->zero_point, src_dtype->vx_type ); @@ -433,6 +479,8 @@ static inline vsi_status float32_to_dtype case VSI_NN_TYPE_BFLOAT16: *(int16_t *)dst = fp32_to_bfp16_rtne( src ); break; + case VSI_NN_TYPE_INT4: + case VSI_NN_TYPE_UINT4: case VSI_NN_TYPE_INT8: case VSI_NN_TYPE_BOOL8: case VSI_NN_TYPE_UINT8: @@ -446,6 +494,7 @@ static inline vsi_status float32_to_dtype case VSI_NN_QNT_TYPE_DFP: dst_value = fp32_to_dfp( src, dst_dtype->fl, dst_dtype->vx_type ); break; + case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: dst_value = fp32_to_affine( src, dst_dtype->scale, dst_dtype->zero_point, dst_dtype->vx_type ); diff --git a/src/tim/vx/internal/include/utils/vsi_nn_util.h b/src/tim/vx/internal/include/utils/vsi_nn_util.h index 1c8e36d..7aa984e 100644 --- a/src/tim/vx/internal/include/utils/vsi_nn_util.h +++ b/src/tim/vx/internal/include/utils/vsi_nn_util.h @@ -50,10 +50,13 @@ extern "C" { #define vsi_safe_release_tensor(_t) if(_t){vsi_nn_ReleaseTensor(&(_t)); _t = NULL;} -#define END_OF_VARIADIC_ARGUMENTS 0xbadcaffe +#define END_OF_VARIADIC_ARGUMENTS ((size_t)0xbadcaffebadcaffe) + #define FOREACH_ARGS(_args, _next, _arg_type) \ while(((_arg_type)((size_t)END_OF_VARIADIC_ARGUMENTS)) != (_next = va_arg(_args, _arg_type))) +#define BITS_PER_BYTE 8 + /*------------------------------------------- Functions -------------------------------------------*/ @@ -242,6 +245,21 @@ OVXLIB_API const char* vsi_nn_DescribeStatus vsi_status status ); +OVXLIB_API vsi_status vsi_nn_Pack4bitData + ( + vsi_nn_tensor_t * tensor, + uint8_t * src, + uint8_t * dest + ); + +OVXLIB_API vsi_status vsi_nn_Unpack4bitData + ( + vsi_nn_tensor_t * tensor, + uint8_t * src, + uint8_t * dest, + vsi_nn_type_e type + ); + vsi_size_t vsi_nn_compute_filter_shape ( vsi_nn_pad_e padding_type, @@ -261,6 +279,16 @@ void vsi_nn_compute_padding vsi_size_t * out_pad ); +void vsi_nn_compute_padding_3d + ( + const vsi_size_t in_shape[3], + const vsi_size_t ksize[3], + const uint32_t stride[3], + const uint32_t dilation[3], + const vsi_nn_pad_e pad_type, + vsi_size_t out_pad[6] + ); + void vsi_nn_compute_padding_conv1d ( vsi_size_t * in_shape, @@ -345,6 +373,31 @@ vsi_bool vsi_nn_is_same_type vsi_nn_tensor_t * src, vsi_nn_tensor_t * dst ); + +vsi_bool vsi_nn_is_broadcast_operaton + ( + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t * output + ); + +float vsi_nn_get_tensor_scale + ( + vsi_nn_tensor_t * tensor + ); + +int32_t vsi_nn_get_tensor_zero_point + ( + vsi_nn_tensor_t * tensor + ); + +void vsi_nn_get_tensor_clamp_min_max + ( + vsi_nn_tensor_t * input, + float *clampMin, + float *clampMax + ); + #ifdef __cplusplus } #endif diff --git a/src/tim/vx/internal/include/vsi_nn_context.h b/src/tim/vx/internal/include/vsi_nn_context.h index cfeb25b..4374441 100644 --- a/src/tim/vx/internal/include/vsi_nn_context.h +++ b/src/tim/vx/internal/include/vsi_nn_context.h @@ -64,6 +64,8 @@ typedef struct _vsi_nn_hw_config_t uint32_t subGroupSize; #endif uint32_t use_40bits_va; + uint32_t support_stream_processor; + uint32_t sp_exec_count; } vsi_nn_hw_config_t; typedef struct _vsi_nn_runtime_option_t diff 
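For the 4-bit type handling added above (the INT4/UINT4 cases, type_get_bits(), and the vsi_nn_Pack4bitData()/vsi_nn_Unpack4bitData() declarations), only prototypes are visible in this patch, so the sketch below shows generic two-elements-per-byte packing with the earlier element in the low nibble; the nibble order is an assumption. Assumes <stdint.h> and <stddef.h>.

/* Generic 4-bit packing sketch, two source elements per destination byte. */
static void pack4bit_sketch(const uint8_t *src, uint8_t *dst, size_t count)
{
    size_t i;
    for (i = 0; i < count; i += 2)
    {
        uint8_t lo = (uint8_t)(src[i] & 0x0F);
        uint8_t hi = (uint8_t)(((i + 1 < count) ? src[i + 1] : 0) & 0x0F);
        dst[i / 2] = (uint8_t)(lo | (hi << 4));
    }
}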
--git a/src/tim/vx/internal/include/vsi_nn_feature_config.h b/src/tim/vx/internal/include/vsi_nn_feature_config.h index 8906a96..db38ecc 100644 --- a/src/tim/vx/internal/include/vsi_nn_feature_config.h +++ b/src/tim/vx/internal/include/vsi_nn_feature_config.h @@ -1,3 +1,26 @@ +/**************************************************************************** +* +* Copyright (c) 2019 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the Software), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED AS IS, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ /*****Auto generated header file, Please DO NOT modify manually!*****/ #ifndef _VSI_NN_FEATURE_CONFIG_H #define _VSI_NN_FEATURE_CONFIG_H diff --git a/src/tim/vx/internal/include/vsi_nn_log.h b/src/tim/vx/internal/include/vsi_nn_log.h index cf9c04c..fd7d37a 100644 --- a/src/tim/vx/internal/include/vsi_nn_log.h +++ b/src/tim/vx/internal/include/vsi_nn_log.h @@ -46,7 +46,7 @@ typedef enum _vsi_nn_log_level_e #define VSI_NN_MAX_DEBUG_BUFFER_LEN 1024 #define VSILOGE( fmt, ... ) \ - vsi_nn_LogMsg(VSI_NN_LOG_ERROR, "E [%s:%d]" fmt, __FUNCTION__, __LINE__, ##__VA_ARGS__) + vsi_nn_LogMsg(VSI_NN_LOG_ERROR, "E [%s:%s:%d]" fmt, __FILE__, __FUNCTION__, __LINE__, ##__VA_ARGS__) #define VSILOGW( fmt, ... ) \ vsi_nn_LogMsg(VSI_NN_LOG_WARN, "W [%s:%d]" fmt, __FUNCTION__, __LINE__, ##__VA_ARGS__) #define VSILOGI( fmt, ... 
) \ diff --git a/src/tim/vx/internal/include/vsi_nn_node_type.h b/src/tim/vx/internal/include/vsi_nn_node_type.h index a6830f6..0278c4b 100644 --- a/src/tim/vx/internal/include/vsi_nn_node_type.h +++ b/src/tim/vx/internal/include/vsi_nn_node_type.h @@ -177,6 +177,11 @@ #include "ops/vsi_nn_op_gru.h" #include "ops/vsi_nn_op_grucell.h" #include "ops/vsi_nn_op_grucell_activation.h" +#include "ops/vsi_nn_op_reshape2.h" +#include "ops/vsi_nn_op_hard_sigmoid.h" +#include "ops/vsi_nn_op_conv3d.h" +#include "ops/vsi_nn_op_grucell_h_times_activation_r.h" +#include "ops/vsi_nn_op_grucell_activation_z_h.h" /* custom node head define define */ #include "custom/vsi_nn_custom_node_type.h" @@ -340,6 +345,11 @@ typedef union _vsi_nn_nn_param vsi_nn_gru_param gru; vsi_nn_grucell_param grucell; vsi_nn_grucell_activation_param grucell_activation; + vsi_nn_reshape2_param reshape2; + vsi_nn_hard_sigmoid_param hard_sigmoid; + vsi_nn_conv3d_param conv3d; + vsi_nn_grucell_h_times_activation_r_param grucell_h_times_activation_r; + vsi_nn_grucell_activation_z_h_param grucell_activation_z_h; uint8_t client_param[128]; /* custom node data struct define */ diff --git a/src/tim/vx/internal/include/vsi_nn_platform.h b/src/tim/vx/internal/include/vsi_nn_platform.h index 6c00bd9..fc41e9f 100644 --- a/src/tim/vx/internal/include/vsi_nn_platform.h +++ b/src/tim/vx/internal/include/vsi_nn_platform.h @@ -24,6 +24,15 @@ #ifndef _VSI_NN_PLATFORM_H #define _VSI_NN_PLATFORM_H +#include "vsi_nn_feature_config.h" + +#ifdef VSI_40BIT_VA_SUPPORT +#ifdef VX_VA40_EXT_SUPPORT +#undef VX_VA40_EXT_SUPPORT +#endif +#define VX_VA40_EXT_SUPPORT 1 +#endif + #include #include #include diff --git a/src/tim/vx/internal/include/vsi_nn_post.h b/src/tim/vx/internal/include/vsi_nn_post.h new file mode 100644 index 0000000..61fe75f --- /dev/null +++ b/src/tim/vx/internal/include/vsi_nn_post.h @@ -0,0 +1,30 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_POST_H +#define _VSI_NN_POST_H + +#include "post/vsi_nn_post_fasterrcnn.h" +#include "post/vsi_nn_post_cmupose.h" + +#endif \ No newline at end of file diff --git a/src/tim/vx/internal/include/vsi_nn_tensor.h b/src/tim/vx/internal/include/vsi_nn_tensor.h index 846054f..7a33586 100644 --- a/src/tim/vx/internal/include/vsi_nn_tensor.h +++ b/src/tim/vx/internal/include/vsi_nn_tensor.h @@ -71,15 +71,17 @@ typedef enum typedef enum { /** none quantized */ - VSI_NN_QNT_TYPE_NONE = 0, + VSI_NN_QNT_TYPE_NONE = 0x0, /** dynamic fixed point */ - VSI_NN_QNT_TYPE_DFP = VX_QUANT_DYNAMIC_FIXED_POINT, + VSI_NN_QNT_TYPE_DFP = 0x1, /** affine asymmetric */ - VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC = VX_QUANT_AFFINE_SCALE, + VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC = 0x2, /** affine perchannel symmetric */ - VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC = 0x3,/*VX_QUANT_AFFINE_SCALE_PER_CHANNEL*/ + VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC = 0x3, /** affine symmetric */ - VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC = VX_QUANT_AFFINE_SCALE, + VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC = 0x4, + /** affine perchannel asymmetric */ + VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_ASYMMETRIC = 0x5, /** undefined type */ VSI_NN_QNT_TYPE_NA = 0xff, } vsi_nn_qnt_type_e; @@ -148,9 +150,11 @@ typedef struct vsi_nn_tensor_attr #ifdef VX_CREATE_TENSOR_SUPPORT_PHYSICAL vsi_memory_type_e vsi_memory_type; #endif +#if VX_STREAM_PROCESSOR_SUPPORT + vsi_bool is_dummy; +#endif } vsi_nn_tensor_attr_t; - /** * Tensor structure */ @@ -202,4 +206,3 @@ typedef struct _vsi_nn_tensor_rel #endif #endif - diff --git a/src/tim/vx/internal/include/vsi_nn_tensor_util.h b/src/tim/vx/internal/include/vsi_nn_tensor_util.h index a88864d..1083d21 100644 --- a/src/tim/vx/internal/include/vsi_nn_tensor_util.h +++ b/src/tim/vx/internal/include/vsi_nn_tensor_util.h @@ -399,6 +399,14 @@ OVXLIB_API void vsi_nn_TransposeTensor vsi_size_t * as_shape ); +vx_tensor vsi_nn_safe_reshape_tensor + ( + vx_tensor tensor, + void * num_of_dims, + vsi_size_t sizes, + vsi_size_t size_of_shape_element + ); + OVXLIB_API void vsi_nn_PermuteTensor ( vsi_nn_graph_t * graph, @@ -728,6 +736,13 @@ vsi_bool vsi_nn_ConvertTensor vsi_nn_tensor_t* output ); +vsi_nn_tensor_t * vsi_nn_dropout_tensor + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * input, + float rate + ); + #ifdef __cplusplus } #endif diff --git a/src/tim/vx/internal/include/vsi_nn_types.h b/src/tim/vx/internal/include/vsi_nn_types.h index 6e082f8..8aa3ca9 100644 --- a/src/tim/vx/internal/include/vsi_nn_types.h +++ b/src/tim/vx/internal/include/vsi_nn_types.h @@ -37,10 +37,6 @@ extern "C"{ #define inline __inline #endif -#if VX_VA40_EXT_SUPPORT -#define VSI_40BIT_VA_SUPPORT -#endif - #if (defined(_MSC_VER) || defined(__MINGW32)) #define SIZE_T_SPECIFIER "Iu" #define SSIZE_T_SPECIFIER "Id" @@ -167,12 +163,20 @@ typedef enum #else VSI_NN_TYPE_BOOL8 = 0x011, #endif +#ifdef VX_TENSOR_STRIDE_X_BITS_SUPPORT + VSI_NN_TYPE_INT4 = VX_TYPE_INT4, + VSI_NN_TYPE_UINT4 = VX_TYPE_UINT4, +#else + VSI_NN_TYPE_INT4 = 0x012, + VSI_NN_TYPE_UINT4 = 0x013, +#endif #ifdef VSI_BFLOAT16_SUPPORT VSI_NN_TYPE_BFLOAT16 = VX_TYPE_BFLOAT16, #else VSI_NN_TYPE_BFLOAT16 = 0x81A, #endif VSI_NN_TYPE_VDATA = VX_TYPE_USER_STRUCT_START + 0x1, + }vsi_nn_type_e; typedef int32_t vsi_nn_activation_e; enum diff --git a/src/tim/vx/internal/include/vsi_nn_version.h b/src/tim/vx/internal/include/vsi_nn_version.h index db3ba86..b0acac3 100644 --- a/src/tim/vx/internal/include/vsi_nn_version.h +++ 
b/src/tim/vx/internal/include/vsi_nn_version.h @@ -33,7 +33,7 @@ extern "C"{ #define VSI_NN_VERSION_MAJOR 1 #define VSI_NN_VERSION_MINOR 1 -#define VSI_NN_VERSION_PATCH 34 +#define VSI_NN_VERSION_PATCH 37 #define VSI_NN_VERSION \ (VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH) diff --git a/src/tim/vx/internal/src/kernel/cl/add_mean_std_norm_cl.c b/src/tim/vx/internal/src/kernel/cl/add_mean_std_norm_cl.c index 8036d0e..6a84a5e 100644 --- a/src/tim/vx/internal/src/kernel/cl/add_mean_std_norm_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/add_mean_std_norm_cl.c @@ -225,12 +225,12 @@ static vsi_nn_kernel_node_t _setup float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); float rsEps = (float)(1.0f / sqrtf(eps)); float dimRatio = (float)(1.0f / (inputs[0]->attr.size[0])); - float input0Scale = inputs[0]->attr.dtype.scale; - float input0Tail = (float)inputs[0]->attr.dtype.zero_point * input0Scale; - float input1Scale = inputs[1]->attr.dtype.scale; - float input1Tail = (float)inputs[1]->attr.dtype.zero_point * input1Scale; - float outputScale = outputs[0]->attr.dtype.scale == 0.0f ? 0.0f : 1.0f / outputs[0]->attr.dtype.scale; - float outputZP = (float)outputs[0]->attr.dtype.zero_point; + float input0Scale = vsi_nn_get_tensor_scale(inputs[0]); + float input0Tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]) * input0Scale; + float input1Scale = vsi_nn_get_tensor_scale(inputs[1]); + float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]) * input1Scale; + float outputScale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]); + float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]); int32_t width = (int32_t)inputs[0]->attr.size[0]; status = _query_kernel( kernel, inputs, outputs ); @@ -246,7 +246,7 @@ static vsi_nn_kernel_node_t _setup border.constant_value.U8 = 0; if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) { - border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; + border.constant_value.U8 = (uint8_t)vsi_nn_get_tensor_zero_point(inputs[0]); } status = vsi_nn_kernel_node_set_border( node, &border ); VSI_ASSERT( status == VSI_SUCCESS ); diff --git a/src/tim/vx/internal/src/kernel/cl/batchnorm_single_cl.c b/src/tim/vx/internal/src/kernel/cl/batchnorm_single_cl.c index 2223eb9..31a5223 100644 --- a/src/tim/vx/internal/src/kernel/cl/batchnorm_single_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/batchnorm_single_cl.c @@ -232,46 +232,12 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t node_params[_CL_PARAM_NUM] = {NULL}; vsi_bool image_2d = FALSE; vsi_nn_kernel_node_t node = NULL; - float input_scale = 1.0f; - float input_tail = 0; - float output_scale = 1.0f; - float output_zp = (float)outputs[0]->attr.dtype.zero_point + 0.5f; + float input_scale = vsi_nn_get_tensor_scale(inputs[0]); + float input_tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]) * input_scale; + float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]); + float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]) + 0.5f; float eps = vsi_nn_kernel_param_get_float32(params, "eps"); - if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC ) - { - input_scale = inputs[0]->attr.dtype.scale; - input_tail = (float)inputs[0]->attr.dtype.zero_point * inputs[0]->attr.dtype.scale; - } - else if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP ) - { - if (inputs[0]->attr.dtype.fl > 0) - { - input_scale = (1.0f / ((float) ((int64_t)1 << inputs[0]->attr.dtype.fl))); - } - else - { - input_scale = ((float) 
((int64_t)1 << -inputs[0]->attr.dtype.fl)); - } - } - - if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC ) - { - output_scale = 1.0f / outputs[0]->attr.dtype.scale; - output_zp = (float)outputs[0]->attr.dtype.zero_point + 0.5f; - } - else if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP ) - { - if (outputs[0]->attr.dtype.fl > 0) - { - output_scale = (float) ((int64_t)1 << outputs[0]->attr.dtype.fl); - } - else - { - output_scale = ((float) 1.0f / ((int64_t)1 << -outputs[0]->attr.dtype.fl)); - } - } - if ( (inputs[1]->attr.is_const && inputs[2]->attr.is_const) || ( inputs[1]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT16 && inputs[1]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT32 ) diff --git a/src/tim/vx/internal/src/kernel/cl/clip_cl.c b/src/tim/vx/internal/src/kernel/cl/clip_cl.c index c611991..cc62fab 100644 --- a/src/tim/vx/internal/src/kernel/cl/clip_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/clip_cl.c @@ -240,10 +240,10 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t node_params[_CLIP_PARAM_NUM] = {NULL}; vsi_nn_kernel_node_t node = NULL; vsi_bool image_2d = FALSE; - float outputScale = outputs[0]->attr.dtype.scale == 0.0f ? 1.0f : outputs[0]->attr.dtype.scale; - float outputTail = (float)outputs[0]->attr.dtype.zero_point; - float inputScale = inputs[0]->attr.dtype.scale == 0.0f ? 1.0f : inputs[0]->attr.dtype.scale; - float inputTail = (float)inputs[0]->attr.dtype.zero_point; + float outputScale = vsi_nn_get_tensor_scale(outputs[0]); + float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float inputScale = vsi_nn_get_tensor_scale(inputs[0]); + float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]); vsi_bool is_use_u8_kernel = FALSE; float min_value = vsi_nn_kernel_param_get_float32( params, "min_value" ); float max_value = vsi_nn_kernel_param_get_float32( params, "max_value" ); diff --git a/src/tim/vx/internal/src/kernel/cl/comparisons_cl.c b/src/tim/vx/internal/src/kernel/cl/comparisons_cl.c index 80f2f95..4be70d9 100644 --- a/src/tim/vx/internal/src/kernel/cl/comparisons_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/comparisons_cl.c @@ -337,10 +337,10 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t node = NULL; int32_t operation = 0; - float input0Scale = inputs[0]->attr.dtype.scale; - float input0Tail = (float)inputs[0]->attr.dtype.zero_point * input0Scale; - float input1Scale = inputs[1]->attr.dtype.scale; - float input1Tail = (float)inputs[1]->attr.dtype.zero_point * input1Scale; + float input0Scale = vsi_nn_get_tensor_scale(inputs[0]); + float input0Tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]) * input0Scale; + float input1Scale = vsi_nn_get_tensor_scale(inputs[1]); + float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]) * input1Scale; if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) diff --git a/src/tim/vx/internal/src/kernel/cl/detect_post_box_cl.c b/src/tim/vx/internal/src/kernel/cl/detect_post_box_cl.c index fdf9b40..f34393e 100644 --- a/src/tim/vx/internal/src/kernel/cl/detect_post_box_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/detect_post_box_cl.c @@ -239,29 +239,15 @@ static vsi_nn_kernel_node_t _setup float inv_scale_h = vsi_nn_kernel_param_get_float32( params, "inv_scale_h" ); float inv_scale_w = vsi_nn_kernel_param_get_float32( params, "inv_scale_w" ); vsi_bool is_use_u8_kernel = FALSE; - float input0Scale = 1.0f; - float input0Zp = 0.0f; - float input0Tail = 0.0f; - float input1Scale = 1.0f; - float input1Zp = 0.0f; - float 
input1Tail = 0.0f; + float input0Scale = vsi_nn_get_tensor_scale(inputs[0]); + float input0Zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float input0Tail = -input0Zp * input0Scale; + float input1Scale = vsi_nn_get_tensor_scale(inputs[1]); + float input1Zp = (float)vsi_nn_get_tensor_zero_point(inputs[1]); + float input1Tail = -input1Zp * input1Scale; status = _query_kernel( kernel, inputs, outputs, &is_use_u8_kernel ); - if ( inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC ) - { - input0Zp = (float)inputs[0]->attr.dtype.zero_point;; - input0Scale = inputs[0]->attr.dtype.scale; - input0Tail = -input0Zp * input0Scale; - } - - if ( inputs[1]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC ) - { - input1Zp = (float)inputs[1]->attr.dtype.zero_point;; - input1Scale = inputs[1]->attr.dtype.scale; - input1Tail = -input1Zp * input1Scale; - } - if ( VSI_SUCCESS == status ) { size_t node_params_num = _DETECT_POST_BOX_F32_PARAM_NUM; diff --git a/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c b/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c index 7dde9f8..5572007 100644 --- a/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c @@ -196,6 +196,7 @@ static vx_param_description_t kernel_param_def[] = {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, }; #define SCALAR_INPUT_SCALE (2) @@ -203,6 +204,7 @@ static vx_param_description_t kernel_param_def[] = #define SCALAR_OUTPUT_SCALE (4) #define SCALAR_OUTPUT_ZP (5) #define SCALAR_ALPHA (6) +#define SCALAR_BETA (7) #define _CL_PARAM_NUM _cnt_of_array(kernel_param_def) /* @@ -318,11 +320,12 @@ static vsi_nn_kernel_node_t _setup vsi_size_t new_rank = 0; vsi_bool ret; - float inputScale = inputs[0]->attr.dtype.scale; - float inputTail = (float)inputs[0]->attr.dtype.zero_point * inputScale; - float outputScale = outputs[0]->attr.dtype.scale; - float outputZP = (float)outputs[0]->attr.dtype.zero_point + 0.5f; + float inputScale = vsi_nn_get_tensor_scale(inputs[0]); + float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]) * inputScale; + float outputScale = vsi_nn_get_tensor_scale(outputs[0]); + float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]) + 0.5f; float alpha = vsi_nn_kernel_param_get_float32( params, "alpha" ); + float beta = vsi_nn_kernel_param_get_float32( params, "beta" ); ret = vsi_nn_kernel_optimize_element_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num, @@ -363,6 +366,8 @@ static vsi_nn_kernel_node_t _setup graph, F32, &outputZP ); node_params[SCALAR_ALPHA] = vsi_nn_kernel_scalar_create( graph, F32, &alpha ); + node_params[SCALAR_BETA] = vsi_nn_kernel_scalar_create( + graph, F32, &beta ); /* Pass parameters to node. 
*/ status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM ); @@ -406,6 +411,11 @@ OnError: vsi_nn_kernel_scalar_release( &node_params[SCALAR_ALPHA] ); } + if (node_params[SCALAR_BETA]) + { + vsi_nn_kernel_scalar_release( &node_params[SCALAR_BETA] ); + } + return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/cl/erf_cl.c b/src/tim/vx/internal/src/kernel/cl/erf_cl.c index 1cd573b..d6ef8d8 100644 --- a/src/tim/vx/internal/src/kernel/cl/erf_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/erf_cl.c @@ -238,10 +238,10 @@ static vsi_nn_kernel_node_t _setup vsi_bool ret = FALSE; vsi_bool image_2d = FALSE; - float inputScale = inputs[0]->attr.dtype.scale; - float inputTail = (float)inputs[0]->attr.dtype.zero_point * inputScale; - float outputScale = outputs[0]->attr.dtype.scale; - float outputZP = (float)outputs[0]->attr.dtype.zero_point + 0.5f; + float inputScale = vsi_nn_get_tensor_scale(inputs[0]); + float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]) * inputScale; + float outputScale = vsi_nn_get_tensor_scale(outputs[0]); + float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]) + 0.5f; ret = vsi_nn_kernel_optimize_element_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num, diff --git a/src/tim/vx/internal/src/kernel/cl/floordiv_cl.c b/src/tim/vx/internal/src/kernel/cl/floordiv_cl.c index 4ceb1c2..1f0ba44 100644 --- a/src/tim/vx/internal/src/kernel/cl/floordiv_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/floordiv_cl.c @@ -187,12 +187,20 @@ static vsi_status _query_kernel if (F16 == in0_dtype) { - in0_dtype = F32; + in0_dtype = F32; + } + else if (I16 == in0_dtype) + { + in0_dtype = I32; } if (F16 == in1_dtype) { - in1_dtype = F32; + in1_dtype = F32; + } + else if (I16 == in1_dtype) + { + in1_dtype = I32; } if (F16 == out_dtype) @@ -254,12 +262,12 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t node_params[_FLOORDIV_PARAM_NUM] = {NULL}; vsi_nn_kernel_node_t node = NULL; vsi_bool image_2d = FALSE; - float outputScale = outputs[0]->attr.dtype.scale == 0.0f ? 1.0f : outputs[0]->attr.dtype.scale; - float outputTail = (float)outputs[0]->attr.dtype.zero_point; - float input0Scale = inputs[0]->attr.dtype.scale == 0.0f ? 1.0f : inputs[0]->attr.dtype.scale; - float input0Tail = (float)inputs[0]->attr.dtype.zero_point; - float input1Scale = inputs[1]->attr.dtype.scale == 0.0f ? 
1.0f : inputs[1]->attr.dtype.scale; - float input1Tail = (float)inputs[1]->attr.dtype.zero_point; + float outputScale = vsi_nn_get_tensor_scale(outputs[0]); + float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float input0Scale = vsi_nn_get_tensor_scale(inputs[0]); + float input0Tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float input1Scale = vsi_nn_get_tensor_scale(inputs[1]); + float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]); vsi_bool is_use_u8_kernel = FALSE; outputScale = 1.0f / outputScale; diff --git a/src/tim/vx/internal/src/kernel/cl/gather_nd_cl.c b/src/tim/vx/internal/src/kernel/cl/gather_nd_cl.c index af79d59..4612e4f 100644 --- a/src/tim/vx/internal/src/kernel/cl/gather_nd_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/gather_nd_cl.c @@ -46,6 +46,7 @@ __BEGIN_DECLS typedef enum { + _error = -1, _1D = 0, _2D, _3D @@ -142,6 +143,10 @@ static vsi_status cal_gather_nd_tensor_reshape_size sizes[0] = block_size; sizes[1] = elementCnt / block_size; } + else if(coordDim == 4) + { + newDim[0] = 3; + } status = VSI_SUCCESS; } @@ -223,7 +228,7 @@ static vsi_status _query_kernel vsi_status status = VSI_FAILURE; vsi_nn_kernel_dtype_e input0_dtype = U8; vsi_nn_kernel_dtype_e output_dtype = U8; - vsi_nn_kernel_coord_type_e coord_type = _1D; + vsi_nn_kernel_coord_type_e coord_type = _error; uint32_t key = 0; int i = 0; @@ -237,7 +242,7 @@ static vsi_status _query_kernel { coord_type = _2D; } - else if(coord_dim == 3) + else if(coord_dim == 3 || coord_dim == 4) { coord_type = _3D; } diff --git a/src/tim/vx/internal/src/kernel/cl/group_normalization_cl.c b/src/tim/vx/internal/src/kernel/cl/group_normalization_cl.c index 52110c8..49ccd23 100644 --- a/src/tim/vx/internal/src/kernel/cl/group_normalization_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/group_normalization_cl.c @@ -522,12 +522,10 @@ static vsi_nn_kernel_node_t _setup vsi_size_t width = inputs[0]->attr.size[0]; vsi_size_t height = inputs[0]->attr.size[1]; int32_t group_stride = 1; - float input_zp = 0; - float input_scale = 1.0f; - int32_t input_fl = 0; - float output_zp = 0; - float output_scale = 1.0f; - int32_t output_fl = 0; + float input_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float input_scale = vsi_nn_get_tensor_scale(inputs[0]); + float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]); float rSpaceOrg = 1.0f / (width * height); float group_ratio = 1.0f / (inputs[0]->attr.size[0] * inputs[0]->attr.size[1] * group_size); @@ -549,44 +547,6 @@ static vsi_nn_kernel_node_t _setup height = is2D_flg > 0 ? 
1 : new_shape[1]; group_stride = (int32_t)(((width + 15) / 16) * 4); - if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) - { - input_zp = (float)inputs[0]->attr.dtype.zero_point; - input_scale = inputs[0]->attr.dtype.scale; - } - else if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) - { - input_fl = inputs[0]->attr.dtype.fl; - if (input_fl > 0) - { - input_scale = (1.0f / ((float) ((int64_t)1 << input_fl))); - } - else - { - input_scale = ((float) ((int64_t)1 << -input_fl)); - } - input_zp = 0.0f; - } - - if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) - { - output_zp = (float)outputs[0]->attr.dtype.zero_point; - output_scale = 1.0f / outputs[0]->attr.dtype.scale; - } - else if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) - { - output_fl = outputs[0]->attr.dtype.fl; - if (output_fl > 0) - { - output_scale = (float)((int64_t)1 << output_fl); - } - else - { - output_scale = (1.0f / (float)((int64_t)1 << -output_fl)); - } - output_zp = 0.0f; - } - for( i = 0; i < INTERNAL_KERNEL_SIZE; i ++ ) { ikernels[i] = vsi_nn_kernel_create( VSI_NN_KERNEL_TYPE_CL ); @@ -757,4 +717,3 @@ final: __END_DECLS REGISTER_BACKEND_CL( group_norm, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/grucell_activation_z_h_cl.c b/src/tim/vx/internal/src/kernel/cl/grucell_activation_z_h_cl.c new file mode 100644 index 0000000..e2b6964 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/grucell_activation_z_h_cl.c @@ -0,0 +1,281 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +typedef enum _grucell_nn_activation_type_e +{ + SIGMOID = VSI_NN_ACT_SIGMOID, + HARD_SIGMOID = VSI_NN_ACT_HARD_SIGMOID, +}grucell_nn_activation_type_e; + +#define _GRUCELL_ACTIVATION_Z_H_KERNEL_SOURCE "grucell_activation_z_h" + +// Add kernel hashtable here +#define GRUCELL_ACTIVATION_Z_H_HASH_KEY( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT ) \ + (( HSTATE_DTYPE ) | ( IN_FC_DTYPE << 6 ) | ( OUT_TYPE << 12 ) | ( REC_ACT << 18 )) + +#define PACK_KERNEL_MAP( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT ) \ + { GRUCELL_ACTIVATION_Z_H_HASH_KEY( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT ), \ + CVIVANTE_NAMESPACE("cl.grucell_activation_z_h_"#HSTATE_DTYPE"_"#IN_FC_DTYPE"to"#OUT_TYPE"_"#REC_ACT), \ + _GRUCELL_ACTIVATION_Z_H_KERNEL_SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _grucell_activation_z_h_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( U8, F32, U8, SIGMOID ), + PACK_KERNEL_MAP( I32, F32, I32, SIGMOID ), + PACK_KERNEL_MAP( F32, F32, F32, SIGMOID ), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _grucell_activation_z_h_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _GRUCELL_ACTIVATION_Z_H_PARAM_NUM _cnt_of_array( _grucell_activation_z_h_kernel_param_def ) +#define SCALAR_INPUT_SCALE (7) +#define SCALAR_INPUT_TAIL (8) +#define SCALAR_OUTPUT_SCALE (9) +#define SCALAR_OUTPUT_ZP (10) +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_grucell_activation_z_h_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 2, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input = NULL; + vsi_nn_kernel_tensor_attr_t* input_attr = NULL; + + input = (vsi_nn_kernel_tensor_t)param[GRUCELL_ACT_Z_H_HSTATE]; + + input_attr = vsi_nn_kernel_tensor_attr_create( input ); + CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_size[0] = gpu_align_p2((input_attr->shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (input_attr->shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final ); +final: + + if (input_attr) + { + vsi_nn_kernel_tensor_attr_release( &input_attr ); + } + + return status; +} /* _grucell_activation_z_h_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t recurrent_activation + ) +{ + 
vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e hstate_dtype; + vsi_nn_kernel_dtype_e fc_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _grucell_activation_z_h_kernel_map; + size_t kernel_map_size = _cnt_of_array( _grucell_activation_z_h_kernel_map ); + vx_param_description_t * param_def = _grucell_activation_z_h_kernel_param_def; + vx_kernel_initialize_f initializer = _grucell_activation_z_h_initializer; + + uint32_t key = 0; + uint32_t i = 0; + + hstate_dtype = vsi_nn_kernel_map_dtype( inputs[GRUCELL_ACT_H_STATE]->attr.dtype.vx_type ); + fc_dtype = vsi_nn_kernel_map_dtype( inputs[GRUCELL_ACT_I_FC_Z]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[GRUCELL_ACT_OUT_OUTPUT]->attr.dtype.vx_type ); + + if (F16 == hstate_dtype) + { + hstate_dtype = F32; + } + else if (I8 == hstate_dtype || I16 == hstate_dtype) + { + hstate_dtype = I32; + } + + if (F16 == fc_dtype) + { + fc_dtype = F32; + } + + if (F16 == out_dtype) + { + out_dtype = F32; + } + else if (I8 == out_dtype || I16 == out_dtype) + { + out_dtype = I32; + } + + key = GRUCELL_ACTIVATION_Z_H_HASH_KEY( hstate_dtype, fc_dtype, out_dtype, recurrent_activation ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _grucell_activation_z_h_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_GRUCELL_ACTIVATION_Z_H_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + int32_t activation = vsi_nn_kernel_param_get_int32( params, "activation" ); + int32_t recurrent_activation = vsi_nn_kernel_param_get_int32( params, "recurrent_activation" ); + float input_scale = vsi_nn_get_tensor_scale(inputs[GRUCELL_ACT_Z_H_HSTATE]); + float input_tail = -(float)vsi_nn_get_tensor_zero_point(inputs[GRUCELL_ACT_Z_H_HSTATE]) * input_scale; + float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[GRUCELL_ACT_Z_H_OUT_OUTPUT]); + float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[GRUCELL_ACT_Z_H_OUT_OUTPUT]); + + if( activation != VSI_NN_ACT_TANH ) + { + return NULL; + } + + status = _query_kernel( kernel, inputs, outputs, recurrent_activation ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _GRUCELL_ACTIVATION_Z_H_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. 
*/ + node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create( + graph, F32, &input_scale ); + node_params[SCALAR_INPUT_TAIL] = vsi_nn_kernel_scalar_create( + graph, F32, &input_tail ); + node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( + graph, F32, &output_scale ); + node_params[SCALAR_OUTPUT_ZP] = vsi_nn_kernel_scalar_create( + graph, F32, &output_zp ); + status = vsi_nn_kernel_node_pass_param( node, node_params, _GRUCELL_ACTIVATION_Z_H_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TAIL] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_ZP] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( grucell_activation_z_h, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cl/grucell_h_times_activation_r_cl.c b/src/tim/vx/internal/src/kernel/cl/grucell_h_times_activation_r_cl.c new file mode 100644 index 0000000..3912b95 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/grucell_h_times_activation_r_cl.c @@ -0,0 +1,259 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +typedef enum _grucell_nn_activation_type_e +{ + SIGMOID = VSI_NN_ACT_SIGMOID, + HARD_SIGMOID = VSI_NN_ACT_HARD_SIGMOID, +} grucell_nn_activation_type_e; + +#define _GRUCELL_H_TIMES_ACTIVATION_R_KERNEL_SOURCE "grucell_h_times_activation_r" +#define _GRUCELL_H_TIMES_ACTIVATION_R_KERNEL_NAME CVIVANTE_NAMESPACE("cl.grucell_h_times_activation_r") + +// Add kernel hashtable here +#define GRUCELL_H_TIMES_ACTIVATION_R_HASH_KEY( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT ) \ + (( HSTATE_DTYPE ) | ( IN_FC_DTYPE << 6 ) | ( OUT_TYPE << 12 ) | ( REC_ACT << 18 )) +#define PACK_KERNEL_MAP( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT ) \ + { GRUCELL_H_TIMES_ACTIVATION_R_HASH_KEY( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT ), \ + CVIVANTE_NAMESPACE("cl.grucell_h_times_activation_r_"#HSTATE_DTYPE"_"#IN_FC_DTYPE"to"#OUT_TYPE"_"#REC_ACT), \ + _GRUCELL_H_TIMES_ACTIVATION_R_KERNEL_SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _grucell_h_times_activation_r_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( U8, F32, F32, SIGMOID ), + PACK_KERNEL_MAP( I32, F32, F32, SIGMOID ), + PACK_KERNEL_MAP( F32, F32, F32, SIGMOID ), +}; + +/* + * Kernel params + */ +static vx_param_description_t _grucell_h_times_activation_r_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _GRUCELL_H_TIMES_ACTIVATION_R_PARAM_NUM _cnt_of_array( _grucell_h_times_activation_r_kernel_param_def ) +#define SCALAR_INPUT_SCALE (4) +#define SCALAR_INPUT_TAIL (5) +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_grucell_h_times_activation_r_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 2, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t output = NULL; + vsi_nn_kernel_tensor_attr_t* output_attr; + + output = (vsi_nn_kernel_tensor_t)param[3]; + + output_attr = vsi_nn_kernel_tensor_attr_create( output ); + CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_size[0] = gpu_align_p2((output_attr->shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (output_attr->shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final ); +final: + + if (output_attr) + { + vsi_nn_kernel_tensor_attr_release( &output_attr ); + } + + return status; +} /* _grucell_h_times_activation_r_initializer() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t recurrent_activation + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e hstate_dtype; + vsi_nn_kernel_dtype_e fc_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = 
_grucell_h_times_activation_r_kernel_map; + size_t kernel_map_size = _cnt_of_array( _grucell_h_times_activation_r_kernel_map ); + vx_param_description_t * param_def = _grucell_h_times_activation_r_kernel_param_def; + vx_kernel_initialize_f initializer = _grucell_h_times_activation_r_initializer; + + uint32_t key = 0; + uint32_t i = 0; + + hstate_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + fc_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (F16 == hstate_dtype) + { + hstate_dtype = F32; + } + else if (I8 == hstate_dtype || I16 == hstate_dtype) + { + hstate_dtype = I32; + } + + if (F16 == fc_dtype) + { + fc_dtype = F32; + } + + if (F16 == out_dtype) + { + out_dtype = F32; + } + else if (I8 == out_dtype || I16 == out_dtype) + { + out_dtype = I32; + } + + key = GRUCELL_H_TIMES_ACTIVATION_R_HASH_KEY( hstate_dtype, fc_dtype, out_dtype, recurrent_activation ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _grucell_h_times_activation_r_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_GRUCELL_H_TIMES_ACTIVATION_R_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + int32_t recurrent_activation = vsi_nn_kernel_param_get_int32( params, "recurrent_activation" ); + float input_scale = vsi_nn_get_tensor_scale(inputs[0]); + float input_tail = -(float)vsi_nn_get_tensor_zero_point(inputs[0]) * input_scale; + + status = _query_kernel( kernel, inputs, outputs, recurrent_activation ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _GRUCELL_H_TIMES_ACTIVATION_R_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. 
*/ + node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create( + graph, F32, &input_scale ); + node_params[SCALAR_INPUT_TAIL] = vsi_nn_kernel_scalar_create( + graph, F32, &input_tail ); + status = vsi_nn_kernel_node_pass_param( node, node_params, _GRUCELL_H_TIMES_ACTIVATION_R_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TAIL] ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( grucell_h_times_activation_r, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cl/grucell_reset_after_activation_cl.c b/src/tim/vx/internal/src/kernel/cl/grucell_reset_after_activation_cl.c new file mode 100644 index 0000000..a18b112 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/grucell_reset_after_activation_cl.c @@ -0,0 +1,282 @@ +/**************************************************************************** +* +* Copyright (c) 2021 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +typedef enum _grucell_nn_activation_type_e +{ + SIGMOID = VSI_NN_ACT_SIGMOID, + HARD_SIGMOID = VSI_NN_ACT_HARD_SIGMOID, +}grucell_nn_activation_type_e; + +#define _GRUCELL_RESET_AFTER_ACTIVATION_KERNEL_SOURCE "grucell_reset_after_activation" + +// Add kernel hashtable here +#define GRUCELL_RESET_AFTER_ACTIVATION_HASH_KEY( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT ) \ + (( HSTATE_DTYPE ) | ( IN_FC_DTYPE << 6 ) | ( OUT_TYPE << 12 ) | ( REC_ACT << 18 )) +#define PACK_KERNEL_MAP( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT ) \ + { GRUCELL_RESET_AFTER_ACTIVATION_HASH_KEY( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT ), \ + CVIVANTE_NAMESPACE("cl.grucell_reset_after_activation_"#HSTATE_DTYPE"_"#IN_FC_DTYPE"to"#OUT_TYPE"_"#REC_ACT), \ + _GRUCELL_RESET_AFTER_ACTIVATION_KERNEL_SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _grucell_reset_after_activation_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( U8, F32, U8, SIGMOID ), + PACK_KERNEL_MAP( I32, F32, I32, SIGMOID ), + PACK_KERNEL_MAP( F32, F32, F32, SIGMOID ), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _grucell_reset_after_activation_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + // Add kererl parameters here +}; +#define _GRUCELL_RESET_AFTER_ACTIVATION_PARAM_NUM _cnt_of_array( _grucell_reset_after_activation_kernel_param_def ) +#define SCALAR_INPUT_SCALE (9) +#define SCALAR_INPUT_TAIL (10) +#define SCALAR_OUTPUT_SCALE (11) +#define SCALAR_OUTPUT_ZP (12) +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_grucell_reset_after_activation_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 2, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input = NULL; + vsi_nn_kernel_tensor_attr_t* input_attr = NULL; + + input = (vsi_nn_kernel_tensor_t)param[GRUCELL_ACT_H_STATE]; + + input_attr = vsi_nn_kernel_tensor_attr_create( input ); + CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_size[0] = gpu_align_p2((input_attr->shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (input_attr->shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final ); +final: + + if (input_attr) + { + vsi_nn_kernel_tensor_attr_release( &input_attr ); + } + + return status; +} /* _grucell_reset_after_activation_initializer() */ + +/* + * Query 
kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t recurrent_activation + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e hstate_dtype; + vsi_nn_kernel_dtype_e fc_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _grucell_reset_after_activation_kernel_map; + size_t kernel_map_size = _cnt_of_array( _grucell_reset_after_activation_kernel_map ); + vx_param_description_t * param_def = _grucell_reset_after_activation_kernel_param_def; + vx_kernel_initialize_f initializer = _grucell_reset_after_activation_initializer; + + uint32_t key = 0; + uint32_t i = 0; + + hstate_dtype = vsi_nn_kernel_map_dtype( inputs[GRUCELL_ACT_H_STATE]->attr.dtype.vx_type ); + fc_dtype = vsi_nn_kernel_map_dtype( inputs[GRUCELL_ACT_I_FC_Z]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[GRUCELL_ACT_OUT_OUTPUT]->attr.dtype.vx_type ); + + if (F16 == hstate_dtype) + { + hstate_dtype = F32; + } + else if (I8 == hstate_dtype || I16 == hstate_dtype) + { + hstate_dtype = I32; + } + + if (F16 == fc_dtype) + { + fc_dtype = F32; + } + + if (F16 == out_dtype) + { + out_dtype = F32; + } + else if (I8 == out_dtype || I16 == out_dtype) + { + out_dtype = I32; + } + + key = GRUCELL_RESET_AFTER_ACTIVATION_HASH_KEY( hstate_dtype, fc_dtype, out_dtype, recurrent_activation ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _grucell_reset_after_activation_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 1, + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_GRUCELL_RESET_AFTER_ACTIVATION_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + int32_t activation = vsi_nn_kernel_param_get_int32( params, "activation" ); + int32_t recurrent_activation = vsi_nn_kernel_param_get_int32( params, "recurrent_activation" ); + float input_scale = vsi_nn_get_tensor_scale(inputs[GRUCELL_ACT_H_STATE]); + float input_tail = -(float)vsi_nn_get_tensor_zero_point(inputs[GRUCELL_ACT_H_STATE]) * input_scale; + float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[GRUCELL_ACT_OUT_OUTPUT]); + float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[GRUCELL_ACT_OUT_OUTPUT]); + + if( activation != VSI_NN_ACT_TANH ) + { + return NULL; + } + + status = _query_kernel( kernel, inputs, outputs, recurrent_activation ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _GRUCELL_RESET_AFTER_ACTIVATION_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. 
*/ + node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create( + graph, F32, &input_scale ); + node_params[SCALAR_INPUT_TAIL] = vsi_nn_kernel_scalar_create( + graph, F32, &input_tail ); + node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( + graph, F32, &output_scale ); + node_params[SCALAR_OUTPUT_ZP] = vsi_nn_kernel_scalar_create( + graph, F32, &output_zp ); + status = vsi_nn_kernel_node_pass_param( node, node_params, _GRUCELL_RESET_AFTER_ACTIVATION_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TAIL] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_ZP] ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( grucell_reset_after_activation, _setup ) \ No newline at end of file diff --git a/src/tim/vx/internal/src/kernel/cl/instance_normalization_cl.c b/src/tim/vx/internal/src/kernel/cl/instance_normalization_cl.c index 8b78ced..929c812 100644 --- a/src/tim/vx/internal/src/kernel/cl/instance_normalization_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/instance_normalization_cl.c @@ -413,53 +413,13 @@ static vsi_nn_kernel_node_t _setup size_t width = inputs[0]->attr.size[0]; size_t height = inputs[0]->attr.size[1]; int32_t group_num = (int32_t)(width + 15) / 16; - int32_t input_zp = 0; - float input_scale = 1.0f; - int32_t input_fl = 0; - int32_t output_zp = 0; - float output_scale = 1.0f; - int32_t output_fl = 0; + int32_t input_zp = vsi_nn_get_tensor_zero_point(inputs[0]); + float input_scale = vsi_nn_get_tensor_scale(inputs[0]); + int32_t output_zp = vsi_nn_get_tensor_zero_point(outputs[0]); + float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]); float in_fl_scale = 1.0f, out_fl_scale = 1.0; float dim_ratio = (float)1.0 / (float)(width * height); - if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) - { - input_zp = inputs[0]->attr.dtype.zero_point; - input_scale = inputs[0]->attr.dtype.scale; - } - else if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) - { - input_fl = inputs[0]->attr.dtype.fl; - if (input_fl > 0) - { - in_fl_scale = (1.0f / ((float) ((int64_t)1 << input_fl))); - } - else - { - in_fl_scale = ((float) ((int64_t)1 << -input_fl)); - } - input_zp = 0; - } - - if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) - { - output_zp = outputs[0]->attr.dtype.zero_point; - output_scale = 1.0f / outputs[0]->attr.dtype.scale; - } - else if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) - { - output_fl = outputs[0]->attr.dtype.fl; - if (output_fl > 0) - { - out_fl_scale = (float)((int64_t)1 << output_fl); - } - else - { - out_fl_scale = (1.0f / (float)((int64_t)1 << -output_fl)); - } - output_zp = 0; - } - if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { @@ -674,4 +634,3 @@ final: __END_DECLS REGISTER_BACKEND_CL( instance_norm, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/l2normalizescale_cl.c b/src/tim/vx/internal/src/kernel/cl/l2normalizescale_cl.c index 2250a8d..e516df5 100644 --- a/src/tim/vx/internal/src/kernel/cl/l2normalizescale_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/l2normalizescale_cl.c @@ -259,10 +259,10 @@ static vsi_nn_kernel_node_t _setup vsi_bool image_2d = FALSE; int32_t axis = 0; vsi_size_t axis_size = 0; - float outputScale = outputs[0]->attr.dtype.scale == 0.0f ? 
1.0f : outputs[0]->attr.dtype.scale; - float outputTail = (float)outputs[0]->attr.dtype.zero_point; - float inputScale = inputs[0]->attr.dtype.scale == 0.0f ? 1.0f : inputs[0]->attr.dtype.scale; - float inputTail = (float)inputs[0]->attr.dtype.zero_point; + float outputScale = vsi_nn_get_tensor_scale(outputs[0]); + float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float inputScale = vsi_nn_get_tensor_scale(inputs[0]); + float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]); float epsilon = (float)10e-12; float rsEps = 1.0f / sqrtf(epsilon); vsi_bool is_use_u8_kernel = FALSE; diff --git a/src/tim/vx/internal/src/kernel/cl/layer_normalization_cl.c b/src/tim/vx/internal/src/kernel/cl/layer_normalization_cl.c index 7824a1e..20f3ab0 100644 --- a/src/tim/vx/internal/src/kernel/cl/layer_normalization_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/layer_normalization_cl.c @@ -233,55 +233,16 @@ static vsi_nn_kernel_node_t _setup vsi_size_t width = inputs[0]->attr.size[0]; vsi_size_t height = inputs[0]->attr.size[1]; - int32_t input_fl = 0; - float input_zp = 0.0f; - float input_scale = 1.0f; - int32_t output_fl = 0; - float output_zp = 0.0f; - float output_scale = 1.0f; + float input_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float input_scale = vsi_nn_get_tensor_scale(inputs[0]); + float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]); float e2InScale = 1.0f, scale_inOut = 1.0f; float dim_ratio = (float)1.0 / (float)(width); float sumZpScale = 0.0f; float zp2ScaleE2 = 0.0f; float sumZpScaleE2 = 0.0f; - if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) - { - input_zp = (float)inputs[0]->attr.dtype.zero_point; - input_scale = inputs[0]->attr.dtype.scale; - } - else if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) - { - input_fl = inputs[0]->attr.dtype.fl; - if (input_fl > 0) - { - input_scale = (1.0f / ((float) ((int64_t)1 << input_fl))); - } - else - { - input_scale = ((float) ((int64_t)1 << -input_fl)); - } - input_zp = 0.0f; - } - - if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) - { - output_zp = (float)outputs[0]->attr.dtype.zero_point; - output_scale = 1.0f / outputs[0]->attr.dtype.scale; - } - else if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) - { - output_fl = outputs[0]->attr.dtype.fl; - if (output_fl > 0) - { - output_scale = (float)((int64_t)1 << output_fl); - } - else - { - output_scale = (1.0f / (float)((int64_t)1 << -output_fl)); - } - output_zp = 0.0f; - } scale_inOut = input_scale * output_scale; e2InScale = input_scale * input_scale; sumZpScale = width * input_zp * input_scale; @@ -392,4 +353,3 @@ final: __END_DECLS REGISTER_BACKEND_CL( layer_norm, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/log_softmax_cl.c b/src/tim/vx/internal/src/kernel/cl/log_softmax_cl.c index 81b0d1b..311de97 100644 --- a/src/tim/vx/internal/src/kernel/cl/log_softmax_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/log_softmax_cl.c @@ -239,10 +239,9 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t node = NULL; int32_t axis = 0; float beta = 0; - float inputScale = - inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC ? 
inputs[0]->attr.dtype.scale : 1.0f; - float outputScale = 1.0f / outputs[0]->attr.dtype.scale; - float outputZP = (float)outputs[0]->attr.dtype.zero_point + 0.5f; + float inputScale = vsi_nn_get_tensor_scale(inputs[0]); + float outputScale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]); + float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]) + 0.5f; float scaleValue = (vx_float32)(log10(exp(1.0f)) / log10(2.0f)); axis = vsi_nn_kernel_param_get_int32(params, "axis"); diff --git a/src/tim/vx/internal/src/kernel/cl/lstmunit_activation_cl.c b/src/tim/vx/internal/src/kernel/cl/lstmunit_activation_cl.c index 667cca5..a7bdb2c 100644 --- a/src/tim/vx/internal/src/kernel/cl/lstmunit_activation_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/lstmunit_activation_cl.c @@ -1444,7 +1444,6 @@ static vsi_status _query_kernel } return status; - } /* _query_kernel() */ @@ -1511,65 +1510,57 @@ static vsi_nn_kernel_node_t _setup if (inputs[LSTMUNIT_ACT_INPUT_FC_I] && VSI_NN_TYPE_UINT8 == inputs[LSTMUNIT_ACT_INPUT_FC_I]->attr.dtype.vx_type) { - scale_val[0] = inputs[LSTMUNIT_ACT_INPUT_FC_I]->attr.dtype.scale; - tail_val[0] = \ - -inputs[LSTMUNIT_ACT_INPUT_FC_I]->attr.dtype.scale * inputs[LSTMUNIT_ACT_INPUT_FC_I]->attr.dtype.zero_point; + scale_val[0] = vsi_nn_get_tensor_scale(inputs[LSTMUNIT_ACT_INPUT_FC_I]); + tail_val[0] = -(float)vsi_nn_get_tensor_zero_point(inputs[LSTMUNIT_ACT_INPUT_FC_I]) * scale_val[0]; } if (inputs[LSTMUNIT_ACT_INPUT_FC_F] && VSI_NN_TYPE_UINT8 == inputs[LSTMUNIT_ACT_INPUT_FC_F]->attr.dtype.vx_type) { - scale_val[1] = inputs[LSTMUNIT_ACT_INPUT_FC_F]->attr.dtype.scale; - tail_val[1] = \ - -inputs[LSTMUNIT_ACT_INPUT_FC_F]->attr.dtype.scale * inputs[LSTMUNIT_ACT_INPUT_FC_F]->attr.dtype.zero_point; + scale_val[1] = vsi_nn_get_tensor_scale(inputs[LSTMUNIT_ACT_INPUT_FC_F]); + tail_val[1] = -(float)vsi_nn_get_tensor_zero_point(inputs[LSTMUNIT_ACT_INPUT_FC_F]) * scale_val[1]; } if (inputs[LSTMUNIT_ACT_INPUT_FC_C] && VSI_NN_TYPE_UINT8 == inputs[LSTMUNIT_ACT_INPUT_FC_C]->attr.dtype.vx_type) { - scale_val[2] = inputs[LSTMUNIT_ACT_INPUT_FC_C]->attr.dtype.scale; - tail_val[2] = \ - -inputs[LSTMUNIT_ACT_INPUT_FC_C]->attr.dtype.scale * inputs[LSTMUNIT_ACT_INPUT_FC_C]->attr.dtype.zero_point; + scale_val[2] = vsi_nn_get_tensor_scale(inputs[LSTMUNIT_ACT_INPUT_FC_C]); + tail_val[2] = -(float)vsi_nn_get_tensor_zero_point(inputs[LSTMUNIT_ACT_INPUT_FC_C]) * scale_val[2]; } if (inputs[LSTMUNIT_ACT_INPUT_FC_O] && VSI_NN_TYPE_UINT8 == inputs[LSTMUNIT_ACT_INPUT_FC_O]->attr.dtype.vx_type) { - scale_val[3] = inputs[LSTMUNIT_ACT_INPUT_FC_O]->attr.dtype.scale; - tail_val[3] = \ - -inputs[LSTMUNIT_ACT_INPUT_FC_O]->attr.dtype.scale * inputs[LSTMUNIT_ACT_INPUT_FC_O]->attr.dtype.zero_point; + scale_val[3] = vsi_nn_get_tensor_scale(inputs[LSTMUNIT_ACT_INPUT_FC_O]); + tail_val[3] = -(float)vsi_nn_get_tensor_zero_point(inputs[LSTMUNIT_ACT_INPUT_FC_O]) * scale_val[3]; } if (inputs[LSTMUNIT_ACT_HSTATE_FC_I] && VSI_NN_TYPE_UINT8 == inputs[LSTMUNIT_ACT_HSTATE_FC_I]->attr.dtype.vx_type) { - scale_val[4] = inputs[LSTMUNIT_ACT_HSTATE_FC_I]->attr.dtype.scale; - tail_val[4] = \ - -inputs[LSTMUNIT_ACT_HSTATE_FC_I]->attr.dtype.scale * inputs[LSTMUNIT_ACT_HSTATE_FC_I]->attr.dtype.zero_point; + scale_val[4] = vsi_nn_get_tensor_scale(inputs[LSTMUNIT_ACT_HSTATE_FC_I]); + tail_val[4] = -(float)vsi_nn_get_tensor_zero_point(inputs[LSTMUNIT_ACT_HSTATE_FC_I]) * scale_val[4]; } if (inputs[LSTMUNIT_ACT_HSTATE_FC_F] && VSI_NN_TYPE_UINT8 == inputs[LSTMUNIT_ACT_HSTATE_FC_F]->attr.dtype.vx_type) { - scale_val[5] = 
inputs[LSTMUNIT_ACT_HSTATE_FC_F]->attr.dtype.scale; - tail_val[5] = \ - -inputs[LSTMUNIT_ACT_HSTATE_FC_F]->attr.dtype.scale * inputs[LSTMUNIT_ACT_HSTATE_FC_F]->attr.dtype.zero_point; + scale_val[5] = vsi_nn_get_tensor_scale(inputs[LSTMUNIT_ACT_HSTATE_FC_F]); + tail_val[5] = -(float)vsi_nn_get_tensor_zero_point(inputs[LSTMUNIT_ACT_HSTATE_FC_F]) * scale_val[5]; } if (inputs[LSTMUNIT_ACT_HSTATE_FC_C] && VSI_NN_TYPE_UINT8 == inputs[LSTMUNIT_ACT_HSTATE_FC_C]->attr.dtype.vx_type) { - scale_val[6] = inputs[LSTMUNIT_ACT_HSTATE_FC_C]->attr.dtype.scale; - tail_val[6] = \ - -inputs[LSTMUNIT_ACT_HSTATE_FC_C]->attr.dtype.scale * inputs[LSTMUNIT_ACT_HSTATE_FC_C]->attr.dtype.zero_point; + scale_val[6] = vsi_nn_get_tensor_scale(inputs[LSTMUNIT_ACT_HSTATE_FC_C]); + tail_val[6] = -(float)vsi_nn_get_tensor_zero_point(inputs[LSTMUNIT_ACT_HSTATE_FC_C]) * scale_val[6]; } if (inputs[LSTMUNIT_ACT_HSTATE_FC_O] && VSI_NN_TYPE_UINT8 == inputs[LSTMUNIT_ACT_HSTATE_FC_O]->attr.dtype.vx_type) { - scale_val[7] = inputs[LSTMUNIT_ACT_HSTATE_FC_O]->attr.dtype.scale; - tail_val[7] = \ - -inputs[LSTMUNIT_ACT_HSTATE_FC_O]->attr.dtype.scale * inputs[LSTMUNIT_ACT_HSTATE_FC_O]->attr.dtype.zero_point; + scale_val[7] = vsi_nn_get_tensor_scale(inputs[LSTMUNIT_ACT_HSTATE_FC_O]); + tail_val[7] = -(float)vsi_nn_get_tensor_zero_point(inputs[LSTMUNIT_ACT_HSTATE_FC_O]) * scale_val[7]; } if (outputs[LSTMUNIT_ACT_OUTPUT] && VSI_NN_TYPE_UINT8 == outputs[LSTMUNIT_ACT_OUTPUT]->attr.dtype.vx_type) { - scale_val[8] = 1.0f / outputs[LSTMUNIT_ACT_OUTPUT]->attr.dtype.scale; - tail_val[8] = (float)(outputs[LSTMUNIT_ACT_OUTPUT]->attr.dtype.zero_point); + scale_val[8] = 1.0f / vsi_nn_get_tensor_scale(inputs[LSTMUNIT_ACT_OUTPUT]); + tail_val[8] = (float)vsi_nn_get_tensor_zero_point(inputs[LSTMUNIT_ACT_OUTPUT]); } if( VSI_SUCCESS == status) @@ -1645,4 +1636,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( lstmunit_activation, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c b/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c index e28a548..35eb757 100644 --- a/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c @@ -253,12 +253,12 @@ static vsi_nn_kernel_node_t _setup vsi_size_t depth = outputs[0]->attr.dim_num > 2 ? 
outputs[0]->attr.size[2] : 1; uint32_t ac2zero = 0; uint32_t bc2zero = 0; - float scale_a = 1.0f; - float zp_a = 0; - float scale_b = 1.0f; - float zp_b = 0; - float scale_out = 1.0f; - float zp_out = 0; + float scale_a = vsi_nn_get_tensor_scale(inputs[0]); + float zp_a = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float scale_b = vsi_nn_get_tensor_scale(inputs[1]); + float zp_b = (float)vsi_nn_get_tensor_zero_point(inputs[1]); + float scale_out = vsi_nn_get_tensor_scale(outputs[0]); + float zp_out = (float)vsi_nn_get_tensor_zero_point(outputs[0]); if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) @@ -272,60 +272,6 @@ static vsi_nn_kernel_node_t _setup transFlg = 2; } - if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) - { - if (inputs[0]->attr.dtype.fl > 0) - { - scale_a = (1.0f / ((float) ((int64_t)1 << inputs[0]->attr.dtype.fl))); - } - else - { - scale_a = ((float) ((int64_t)1 << -inputs[0]->attr.dtype.fl)); - } - zp_a = 0; - } - else if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) - { - zp_a = (float)inputs[0]->attr.dtype.zero_point; - scale_a = inputs[0]->attr.dtype.scale; - } - - if (inputs[1]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) - { - if (inputs[1]->attr.dtype.fl > 0) - { - scale_b = (1.0f / ((float) ((int64_t)1 << inputs[1]->attr.dtype.fl))); - } - else - { - scale_b = ((float) ((int64_t)1 << -inputs[1]->attr.dtype.fl)); - } - zp_b = 0; - } - else if (inputs[1]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) - { - zp_b = (float)inputs[1]->attr.dtype.zero_point; - scale_b = inputs[1]->attr.dtype.scale; - } - - if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) - { - if (outputs[0]->attr.dtype.fl > 0) - { - scale_out = (float)((int64_t)1 << outputs[0]->attr.dtype.fl); - } - else - { - scale_out = (1.0f / (float)((int64_t)1 << -outputs[0]->attr.dtype.fl)); - } - zp_out = 0; - } - else if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) - { - zp_out = (float)outputs[0]->attr.dtype.zero_point; - scale_out = outputs[0]->attr.dtype.scale; - } - if (transposeA) { K = inputs[0]->attr.size[1]; @@ -389,4 +335,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( matrixmul, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/maximum_cl.c b/src/tim/vx/internal/src/kernel/cl/maximum_cl.c index 98a175f..322bd22 100644 --- a/src/tim/vx/internal/src/kernel/cl/maximum_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/maximum_cl.c @@ -239,12 +239,12 @@ static vsi_nn_kernel_node_t _setup vsi_bool image_2d = FALSE; vsi_nn_kernel_node_t node = NULL; - float input0Scale = inputs[0]->attr.dtype.scale; - float input0Tail = (float)inputs[0]->attr.dtype.zero_point * input0Scale; - float input1Scale = inputs[1]->attr.dtype.scale; - float input1Tail = (float)inputs[1]->attr.dtype.zero_point * input1Scale; - float outputScale = outputs[0]->attr.dtype.scale; - float outputZP = (float)outputs[0]->attr.dtype.zero_point + 0.5f; + float input0Scale = vsi_nn_get_tensor_scale(inputs[0]); + float input0Tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]) * input0Scale; + float input1Scale = vsi_nn_get_tensor_scale(inputs[1]); + float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]) * input1Scale; + float outputScale = vsi_nn_get_tensor_scale(outputs[0]); + float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]) + 0.5f; outputScale = vsi_abs(outputScale) < 1e-5 ? 
0.0f : 1.0f / outputScale; @@ -294,4 +294,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( maximum, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/minimum_cl.c b/src/tim/vx/internal/src/kernel/cl/minimum_cl.c index a730f0b..40b9977 100644 --- a/src/tim/vx/internal/src/kernel/cl/minimum_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/minimum_cl.c @@ -238,12 +238,12 @@ static vsi_nn_kernel_node_t _setup vsi_bool image_2d = FALSE; vsi_nn_kernel_node_t node = NULL; - float input0Scale = inputs[0]->attr.dtype.scale; - float input0Tail = (float)inputs[0]->attr.dtype.zero_point * input0Scale; - float input1Scale = inputs[1]->attr.dtype.scale; - float input1Tail = (float)inputs[1]->attr.dtype.zero_point * input1Scale; - float outputScale = outputs[0]->attr.dtype.scale; - float outputZP = (float)outputs[0]->attr.dtype.zero_point + 0.5f; + float input0Scale = vsi_nn_get_tensor_scale(inputs[0]); + float input0Tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]) * input0Scale; + float input1Scale = vsi_nn_get_tensor_scale(inputs[1]); + float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]) * input1Scale; + float outputScale = vsi_nn_get_tensor_scale(outputs[0]); + float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]) + 0.5f; outputScale = vsi_abs(outputScale) < 1e-5 ? 0.0f : 1.0f / outputScale; @@ -293,4 +293,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( minimum, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/moments_cl.c b/src/tim/vx/internal/src/kernel/cl/moments_cl.c index 0a04c13..ed420ad 100644 --- a/src/tim/vx/internal/src/kernel/cl/moments_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/moments_cl.c @@ -372,25 +372,12 @@ static vsi_nn_kernel_node_t _setup vsi_size_t width = inputs[0]->attr.size[0]; vsi_size_t height = inputs[0]->attr.size[1]; vsi_size_t chn = inputs[0]->attr.size[2]; - int32_t input_zp = inputs[0]->attr.dtype.zero_point; - float input_scale = inputs[0]->attr.dtype.scale; + int32_t input_zp = vsi_nn_get_tensor_zero_point(inputs[0]); + float input_scale = vsi_nn_get_tensor_scale(inputs[0]); float dim_ratio = (float)1.0 / (float)(width * height); axis_num = (int32_t)axis_num_temp; - if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) - { - if (inputs[0]->attr.dtype.fl > 0) - { - input_scale = (1.0f / ((float) ((int64_t)1 << inputs[0]->attr.dtype.fl))); - } - else - { - input_scale = ((float) ((int64_t)1 << -inputs[0]->attr.dtype.fl)); - } - input_zp = 0; - } - if (axis_num == 1 && axis[0] == 0) { dim_ratio = (float)1.0 / (float)(width); @@ -453,7 +440,7 @@ static vsi_nn_kernel_node_t _setup if ( node ) { uint32_t index = 0; - int32_t constant_value = 0; + int32_t constant_value = vsi_nn_get_tensor_zero_point(inputs[0]); /* Pass parameters to node. 
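   The input_zp / input_scale used above now come from vsi_nn_get_tensor_zero_point()
   and vsi_nn_get_tensor_scale(). Judging only from the open-coded branches this patch
   removes (a sketch of the expected behaviour, not the actual helper implementation):
       AFFINE_ASYMMETRIC : scale = dtype.scale    zp = dtype.zero_point
       DFP               : scale = 2^-fl          zp = 0
       NONE / float      : scale = 1.0f           zp = 0
   The constant_value handed to set_constant_border() below is the input tensor's
   zero point, presumably so that out-of-image border reads dequantize to 0.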
*/ if (reshape_tensors[0]) { @@ -494,10 +481,6 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_tensor_release( &node_params[2] ); } - if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) - { - constant_value = inputs[0]->attr.dtype.zero_point; - } status = set_constant_border(node, constant_value); CHECK_STATUS(status); } diff --git a/src/tim/vx/internal/src/kernel/cl/one_hot_cl.c b/src/tim/vx/internal/src/kernel/cl/one_hot_cl.c index 3aa26fd..33b575f 100644 --- a/src/tim/vx/internal/src/kernel/cl/one_hot_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/one_hot_cl.c @@ -239,8 +239,8 @@ static vsi_nn_kernel_node_t _setup float on_value = vsi_nn_kernel_param_get_float32( params, "on_value" ); float off_value = vsi_nn_kernel_param_get_float32( params, "off_value" ); int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); - float inputScale = inputs[0]->attr.dtype.scale; - float inputTail = (float)inputs[0]->attr.dtype.zero_point * inputScale; + float inputScale = vsi_nn_get_tensor_scale(inputs[0]); + float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]) * inputScale; out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); diff --git a/src/tim/vx/internal/src/kernel/cl/poolwithargmax_cl.c b/src/tim/vx/internal/src/kernel/cl/poolwithargmax_cl.c index 73b264e..558a1e0 100644 --- a/src/tim/vx/internal/src/kernel/cl/poolwithargmax_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/poolwithargmax_cl.c @@ -242,10 +242,10 @@ static vsi_nn_kernel_node_t _setup int32_t pad_x = 0; int32_t pad_y = 0; vsi_bool image_2d = FALSE; - float outputScale = outputs[0]->attr.dtype.scale == 0.0f ? 1.0f : outputs[0]->attr.dtype.scale; - float outputTail = (float)outputs[0]->attr.dtype.zero_point; - float inputScale = inputs[0]->attr.dtype.scale == 0.0f ? 
1.0f : inputs[0]->attr.dtype.scale; - float inputTail = (float)inputs[0]->attr.dtype.zero_point; + float outputScale = vsi_nn_get_tensor_scale(outputs[0]); + float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float inputScale = vsi_nn_get_tensor_scale(inputs[0]); + float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]); float scale_value = 1.0f; float tail_value = 0.0f; vsi_bool is_use_u8_kernel = FALSE; @@ -303,7 +303,6 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &node_params[SCALAR_SCALE] ); vsi_nn_kernel_scalar_release( &node_params[SCALAR_TAIL] ); } - } } return node; @@ -312,4 +311,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( poolwithargmax, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/prelu_cl.c b/src/tim/vx/internal/src/kernel/cl/prelu_cl.c index 7bbfbec..609c90e 100644 --- a/src/tim/vx/internal/src/kernel/cl/prelu_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/prelu_cl.c @@ -240,12 +240,12 @@ static vsi_nn_kernel_node_t _setup vsi_size_t new_rank = 0; vsi_bool ret; - float input0Scale = inputs[0]->attr.dtype.scale; - float input0Tail = (float)inputs[0]->attr.dtype.zero_point * input0Scale; - float input1Scale = inputs[1]->attr.dtype.scale; - float input1Tail = (float)inputs[1]->attr.dtype.zero_point * input1Scale; - float outputScale = outputs[0]->attr.dtype.scale; - float outputZP = (float)outputs[0]->attr.dtype.zero_point + 0.5f; + float input0Scale = vsi_nn_get_tensor_scale(inputs[0]); + float input0Tail = (float)vsi_nn_get_tensor_zero_point(inputs[0]) * input0Scale; + float input1Scale = vsi_nn_get_tensor_scale(inputs[1]); + float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]) * input1Scale; + float outputScale = vsi_nn_get_tensor_scale(outputs[0]); + float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]); int32_t is_per_channel_alpha = 0; is_per_channel_alpha = vsi_nn_kernel_param_get_int32(params, "is_per_channel_alpha"); @@ -257,6 +257,11 @@ static vsi_nn_kernel_node_t _setup outputScale = vsi_abs(outputScale) < 1e-5 ? 0.0f : 1.0f / outputScale; + if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) + { + outputZP += 0.5f; + } + ret = vsi_nn_kernel_optimize_eltwise_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num, inputs[1]->attr.size, inputs[1]->attr.dim_num, @@ -329,4 +334,3 @@ final: __END_DECLS REGISTER_BACKEND_CL( prelu, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/reducemax_internal_cl.c b/src/tim/vx/internal/src/kernel/cl/reducemax_internal_cl.c index 3e3d4bd..05a8674 100644 --- a/src/tim/vx/internal/src/kernel/cl/reducemax_internal_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/reducemax_internal_cl.c @@ -81,7 +81,6 @@ static const _kernel_map_type _reducemax_internal_kernel_map[] = HASH_REDUCEMAX_KERNELS_2D( 1, F32, F32 ) HASH_REDUCEMAX_KERNELS_2D( 1, I32, I32 ) HASH_REDUCEMAX_KERNELS_2D( 1, U8, U8 ) - }; @@ -236,10 +235,10 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t node = NULL; vsi_bool image_2d = FALSE; int32_t axis = 0; - float outputScale = outputs[0]->attr.dtype.scale == 0.0f ? 1.0f : outputs[0]->attr.dtype.scale; - float outputZP = (float)outputs[0]->attr.dtype.zero_point; - float inputScale = inputs[0]->attr.dtype.scale == 0.0f ? 
1.0f : inputs[0]->attr.dtype.scale; - float inputTail = (float)inputs[0]->attr.dtype.zero_point; + float outputScale = vsi_nn_get_tensor_scale(outputs[0]); + float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float inputScale = vsi_nn_get_tensor_scale(inputs[0]); + float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]); inputScale = inputScale / outputScale; inputTail = outputZP - inputTail * inputScale; @@ -281,4 +280,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( reducemax_internal, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/reducemin_internal_cl.c b/src/tim/vx/internal/src/kernel/cl/reducemin_internal_cl.c index 1658fa4..50a5025 100644 --- a/src/tim/vx/internal/src/kernel/cl/reducemin_internal_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/reducemin_internal_cl.c @@ -225,10 +225,10 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t node = NULL; vsi_bool image_2d = FALSE; int32_t axis = 0; - float outputScale = outputs[0]->attr.dtype.scale == 0.0f ? 1.0f : outputs[0]->attr.dtype.scale; - float outputZP = (float)outputs[0]->attr.dtype.zero_point; - float inputScale = inputs[0]->attr.dtype.scale == 0.0f ? 1.0f : inputs[0]->attr.dtype.scale; - float inputTail = (float)inputs[0]->attr.dtype.zero_point; + float outputScale = vsi_nn_get_tensor_scale(outputs[0]); + float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float inputScale = vsi_nn_get_tensor_scale(inputs[0]); + float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]); inputScale = inputScale / outputScale; inputTail = outputZP - inputTail * inputScale; @@ -270,4 +270,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( reducemin_internal, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/reduceprod_internal_cl.c b/src/tim/vx/internal/src/kernel/cl/reduceprod_internal_cl.c index b1feb05..8d1b7c0 100644 --- a/src/tim/vx/internal/src/kernel/cl/reduceprod_internal_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/reduceprod_internal_cl.c @@ -247,10 +247,10 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t node = NULL; vsi_bool image_2d = FALSE; int32_t axis = 0; - float outputScale = outputs[0]->attr.dtype.scale == 0.0f ? 1.0f : outputs[0]->attr.dtype.scale; - float outputTail = (float)outputs[0]->attr.dtype.zero_point; - float inputScale = inputs[0]->attr.dtype.scale == 0.0f ? 
1.0f : inputs[0]->attr.dtype.scale; - float inputTail = (float)inputs[0]->attr.dtype.zero_point; + float outputScale = vsi_nn_get_tensor_scale(outputs[0]); + float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float inputScale = vsi_nn_get_tensor_scale(inputs[0]); + float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]); vsi_bool is_use_u8_kernel = FALSE; outputScale = 1.0f / outputScale; @@ -304,4 +304,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( reduceprod_internal, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/relu_keras_cl.c b/src/tim/vx/internal/src/kernel/cl/relu_keras_cl.c index d08676c..8cfd331 100644 --- a/src/tim/vx/internal/src/kernel/cl/relu_keras_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/relu_keras_cl.c @@ -225,7 +225,6 @@ static vsi_status _query_kernel } return status; - } /* _query_kernel() */ @@ -244,10 +243,10 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t node_params[_RELU_KERAS_QUANT_PARAM_NUM] = {NULL}; vsi_nn_kernel_node_t node = NULL; vsi_bool image_2d = FALSE; - float outputScale = 1.0f; - float outputTail = 0.0f; - float inputScale = 1.0f; - float inputTail = 0.0f; + float outputScale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]); + float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float inputScale = vsi_nn_get_tensor_scale(inputs[0]); + float inputTail = -1 * (float)vsi_nn_get_tensor_zero_point(inputs[0]) * inputScale; vsi_bool is_use_u8_kernel = FALSE; float alpha = vsi_nn_kernel_param_get_float32( params, "alpha" ); float max_value = vsi_nn_kernel_param_get_float32( params, "max_value" ); @@ -260,19 +259,6 @@ static vsi_nn_kernel_node_t _setup return NULL; } - if (VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC == inputs[0]->attr.dtype.qnt_type) - { - inputScale = inputs[0]->attr.dtype.scale == 0.0f ? 1.0f : inputs[0]->attr.dtype.scale; - inputTail = -((float)inputs[0]->attr.dtype.zero_point * inputScale); - } - - if (VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC == outputs[0]->attr.dtype.qnt_type) - { - outputScale = outputs[0]->attr.dtype.scale == 0.0f ? 1.0f : outputs[0]->attr.dtype.scale; - outputScale = 1.0f / outputScale; - outputTail = (float)outputs[0]->attr.dtype.zero_point; - } - image_2d = (inputs[0]->attr.dim_num == 2 || inputs[0]->attr.size[2] == 1); status = _query_kernel( kernel, inputs, outputs, image_2d, &is_use_u8_kernel ); @@ -316,10 +302,8 @@ static vsi_nn_kernel_node_t _setup } return node; - } /* _setup() */ __END_DECLS REGISTER_BACKEND_CL( relu_keras, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/resize_1d_bilinear_cl.c b/src/tim/vx/internal/src/kernel/cl/resize_1d_bilinear_cl.c index 3a189f4..fda7acd 100644 --- a/src/tim/vx/internal/src/kernel/cl/resize_1d_bilinear_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/resize_1d_bilinear_cl.c @@ -235,11 +235,11 @@ static vsi_nn_kernel_node_t _setup int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" ); vsi_size_t in_width = inputs[0]->attr.size[0]; vsi_size_t out_width = outputs[0]->attr.size[0]; - float input_zp = (float)inputs[0]->attr.dtype.zero_point; - float input_scale = inputs[0]->attr.dtype.scale; + float input_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float input_scale = vsi_nn_get_tensor_scale(inputs[0]); float input_tail = -(input_zp * input_scale); - float output_zp = (float)outputs[0]->attr.dtype.zero_point; - float output_scale = (0 == outputs[0]->attr.dtype.scale) ? 
1.0f : 1.0f / outputs[0]->attr.dtype.scale; + float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]); float half_pixel_value = 0.0f; float scale_factor_x = 0.0f; vsi_bool is_use_u8_kernel = FALSE; @@ -302,4 +302,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( resize_1d_bilinear, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/resize_1d_nearest_cl.c b/src/tim/vx/internal/src/kernel/cl/resize_1d_nearest_cl.c index e406397..eef5bec 100644 --- a/src/tim/vx/internal/src/kernel/cl/resize_1d_nearest_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/resize_1d_nearest_cl.c @@ -235,11 +235,10 @@ static vsi_nn_kernel_node_t _setup int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" ); vsi_size_t in_width = inputs[0]->attr.size[0]; vsi_size_t out_width = outputs[0]->attr.size[0]; - float input_zp = (float)inputs[0]->attr.dtype.zero_point; - float input_scale = inputs[0]->attr.dtype.scale; - float output_scale = (0 == outputs[0]->attr.dtype.scale) ? \ - input_scale : input_scale / outputs[0]->attr.dtype.scale; - float output_tail = (float)outputs[0]->attr.dtype.zero_point - input_zp * output_scale; + float input_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float input_scale = vsi_nn_get_tensor_scale(inputs[0]); + float output_scale = input_scale / vsi_nn_get_tensor_scale(outputs[0]); + float output_tail = (float)vsi_nn_get_tensor_zero_point(outputs[0]) - input_zp * output_scale; float half_pixel_value = 0.0f; float round_value = 0.0f; float scale_factor_x = 0.0f; @@ -309,4 +308,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( resize_1d_nearest, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/resize_bilinear_cl.c b/src/tim/vx/internal/src/kernel/cl/resize_bilinear_cl.c index 320a6d9..a9c0285 100644 --- a/src/tim/vx/internal/src/kernel/cl/resize_bilinear_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/resize_bilinear_cl.c @@ -213,7 +213,6 @@ static vsi_status _query_kernel } return status; - } /* _query_kernel() */ @@ -237,11 +236,11 @@ static vsi_nn_kernel_node_t _setup vsi_size_t in_height = inputs[0]->attr.size[1]; vsi_size_t out_width = outputs[0]->attr.size[0]; vsi_size_t out_height = outputs[0]->attr.size[1]; - float input_zp = (float)inputs[0]->attr.dtype.zero_point; - float input_scale = inputs[0]->attr.dtype.scale; + float input_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float input_scale = vsi_nn_get_tensor_scale(inputs[0]); float input_tail = -(input_zp * input_scale); - float output_zp = (float)outputs[0]->attr.dtype.zero_point; - float output_scale = (0 == outputs[0]->attr.dtype.scale) ? 
1.0f : 1.0f / outputs[0]->attr.dtype.scale; + float output_zp = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]); float half_pixel_value = 0.0f; float scale_factor_x = 0.0f; float scale_factor_y = 0.0f; @@ -313,10 +312,8 @@ static vsi_nn_kernel_node_t _setup } return node; - } /* _setup() */ __END_DECLS REGISTER_BACKEND_CL( resize_bilinear, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/resize_nearest_cl.c b/src/tim/vx/internal/src/kernel/cl/resize_nearest_cl.c index 588b527..d61abcf 100644 --- a/src/tim/vx/internal/src/kernel/cl/resize_nearest_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/resize_nearest_cl.c @@ -241,11 +241,10 @@ static vsi_nn_kernel_node_t _setup vsi_size_t in_height = inputs[0]->attr.size[1]; vsi_size_t out_width = outputs[0]->attr.size[0]; vsi_size_t out_height = outputs[0]->attr.size[1]; - float input_zp = (float)inputs[0]->attr.dtype.zero_point; - float input_scale = inputs[0]->attr.dtype.scale; - float output_scale = (0 == outputs[0]->attr.dtype.scale) ? \ - input_scale : input_scale / outputs[0]->attr.dtype.scale; - float output_tail = (float)outputs[0]->attr.dtype.zero_point - input_zp * output_scale; + float input_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float input_scale = vsi_nn_get_tensor_scale(inputs[0]); + float output_scale = input_scale / vsi_nn_get_tensor_scale(outputs[0]); + float output_tail = (float)vsi_nn_get_tensor_zero_point(outputs[0]) - input_zp * output_scale; float half_pixel_value = 0.0f; float round_value = 0.0f; float scale_factor_x = 0.0f; @@ -327,4 +326,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( resize_nearest, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/select_cl.c b/src/tim/vx/internal/src/kernel/cl/select_cl.c index 9c00e23..53b1fcd 100644 --- a/src/tim/vx/internal/src/kernel/cl/select_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/select_cl.c @@ -240,12 +240,12 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t node_params[_SELECT_PARAM_NUM] = {NULL}; vsi_bool image_2d = FALSE; vsi_nn_kernel_node_t node = NULL; - float outputScale = outputs[0]->attr.dtype.scale == 0.0f ? 1.0f : outputs[0]->attr.dtype.scale; - float outputZP = (float)outputs[0]->attr.dtype.zero_point; - float input0Scale = inputs[1]->attr.dtype.scale == 0.0f ? 1.0f : inputs[1]->attr.dtype.scale; - float input0Tail = (float)inputs[1]->attr.dtype.zero_point; - float input1Scale = inputs[2]->attr.dtype.scale == 0.0f ? 
1.0f : inputs[2]->attr.dtype.scale; - float input1Tail = (float)inputs[2]->attr.dtype.zero_point; + float outputScale = vsi_nn_get_tensor_scale(outputs[0]); + float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float input0Scale = vsi_nn_get_tensor_scale(inputs[1]); + float input0Tail = (float)vsi_nn_get_tensor_zero_point(inputs[1]); + float input1Scale = vsi_nn_get_tensor_scale(inputs[2]); + float input1Tail = (float)vsi_nn_get_tensor_zero_point(inputs[2]); input0Scale = input0Scale / outputScale; input1Scale = input1Scale / outputScale; @@ -289,4 +289,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( select, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/sequence_mask_cl.c b/src/tim/vx/internal/src/kernel/cl/sequence_mask_cl.c index 58fed76..d65200d 100644 --- a/src/tim/vx/internal/src/kernel/cl/sequence_mask_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/sequence_mask_cl.c @@ -246,14 +246,12 @@ static vsi_nn_kernel_node_t _setup int32_t max_len = vsi_nn_kernel_param_get_int32( params, "max_len" ); vsi_nn_kernel_node_t node = NULL; int32_t is2Dflg = 0; - float input_zp = 0; - float input_scale = 1.0f; - int32_t output_zp = 0; - float output_scale = 1.0f; + float input_zp = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float input_scale = vsi_nn_get_tensor_scale(inputs[0]); + int32_t output_zp = vsi_nn_get_tensor_zero_point(outputs[0]); + float output_scale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]); float input_zpScale = 0; float outputVal1 = 1.0f; - int32_t input_fl = 0; - int32_t output_fl = 0; if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) @@ -269,43 +267,6 @@ static vsi_nn_kernel_node_t _setup rs_input = vsi_nn_kernel_tensor_reshape(inputs[0]->t, new_shape[0], 2); rs_output = vsi_nn_kernel_tensor_reshape(outputs[0]->t, new_shape[1], 4); - if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) - { - input_zp = (float)inputs[0]->attr.dtype.zero_point; - input_scale = inputs[0]->attr.dtype.scale; - } - else if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) - { - input_fl = inputs[0]->attr.dtype.fl; - if (input_fl > 0) - { - input_scale = (1.0f / ((float) ((int64_t)1 << input_fl))); - } - else - { - input_scale = ((float) ((int64_t)1 << -input_fl)); - } - input_zp = 0.0f; - } - - if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) - { - output_zp = outputs[0]->attr.dtype.zero_point; - output_scale = 1.0f / outputs[0]->attr.dtype.scale; - } - else if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) - { - output_fl = outputs[0]->attr.dtype.fl; - if (output_fl > 0) - { - output_scale = (float)((int64_t)1 << output_fl); - } - else - { - output_scale = (1.0f / (float)((int64_t)1 << -output_fl)); - } - output_zp = 0; - } input_zpScale = input_scale * input_zp; outputVal1 = output_scale + (float)output_zp; @@ -351,4 +312,3 @@ final: __END_DECLS REGISTER_BACKEND_CL( sequence_mask, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/slice_cl.c b/src/tim/vx/internal/src/kernel/cl/slice_cl.c index ed83e5f..4900bb1 100644 --- a/src/tim/vx/internal/src/kernel/cl/slice_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/slice_cl.c @@ -240,10 +240,10 @@ static vsi_nn_kernel_node_t _setup int32_t i = 0; vsi_size_t input_batch = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1; vsi_size_t output_batch = outputs[0]->attr.dim_num > 3 ? 
outputs[0]->attr.size[3] : 1; - float inputScale = inputs[0]->attr.dtype.scale; - float inputTail = (float)inputs[0]->attr.dtype.zero_point * inputScale; - float outputScale = outputs[0]->attr.dtype.scale; - float outputZP = (float)outputs[0]->attr.dtype.zero_point + 0.5f; + float inputScale = vsi_nn_get_tensor_scale(inputs[0]); + float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]) * inputScale; + float outputScale = vsi_nn_get_tensor_scale(outputs[0]); + float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]) + 0.5f; outputScale = vsi_abs(outputScale) < 1e-5 ? 0.0f : 1.0f / outputScale; diff --git a/src/tim/vx/internal/src/kernel/cl/space2depth_internal_cl.c b/src/tim/vx/internal/src/kernel/cl/space2depth_internal_cl.c index 65b09ee..7c7a59a 100644 --- a/src/tim/vx/internal/src/kernel/cl/space2depth_internal_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/space2depth_internal_cl.c @@ -208,50 +208,13 @@ static vsi_nn_kernel_node_t _setup int32_t block_size_y = vsi_nn_kernel_param_get_int32( params, "block_size_y" ); int32_t opt_flg = (block_size_x == 2 && block_size_y == 1) ? 1 : 0; - float inputScale = inputs[0]->attr.dtype.scale; - int32_t inputZp = inputs[0]->attr.dtype.zero_point; - float outputScale = outputs[0]->attr.dtype.scale; - int32_t outputZp = outputs[0]->attr.dtype.zero_point; + float inputScale = vsi_nn_get_tensor_scale(inputs[0]); + int32_t inputZp = vsi_nn_get_tensor_zero_point(inputs[0]); + float outputScale = vsi_nn_get_tensor_scale(outputs[0]); + int32_t outputZp = vsi_nn_get_tensor_zero_point(outputs[0]); float scaleInOut = 1.0f; float zpInOut = 0.0f; - if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) - { - int32_t input_fl = inputs[0]->attr.dtype.fl; - if (input_fl > 0) - { - inputScale = (1.0f / ((float) ((int64_t)1 << input_fl))); - } - else - { - inputScale = ((float) ((int64_t)1 << -input_fl)); - } - inputZp = 0; - } - else if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_NONE) - { - inputScale = 1.0f; - inputZp = 0; - } - - if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP) - { - int32_t output_fl = outputs[0]->attr.dtype.fl; - if (output_fl > 0) - { - outputScale = (1.0f / ((float) ((int64_t)1 << output_fl))); - } - else - { - outputScale = ((float) ((int64_t)1 << -output_fl)); - } - outputZp = 0; - } - else if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_NONE) - { - outputScale = 1.0f; - outputZp = 0; - } scaleInOut = inputScale / outputScale; zpInOut = outputZp - inputZp * scaleInOut; @@ -295,4 +258,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( space2depth_internal, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/swish_cl.c b/src/tim/vx/internal/src/kernel/cl/swish_cl.c index 4a4283e..4c3f206 100644 --- a/src/tim/vx/internal/src/kernel/cl/swish_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/swish_cl.c @@ -279,10 +279,10 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t node = NULL; int32_t swish_type = vsi_nn_kernel_param_get_int32( params, "type" ); float beta = 1.0f; - float inputScale = inputs[0]->attr.dtype.scale; - float inputTail = (float)inputs[0]->attr.dtype.zero_point * inputScale; - float outputScale = outputs[0]->attr.dtype.scale == 0.0f ? 
0.0f : 1.0f / outputs[0]->attr.dtype.scale; - float outputZP = (float)outputs[0]->attr.dtype.zero_point + 0.5f; + float inputScale = vsi_nn_get_tensor_scale(inputs[0]); + float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]) * inputScale; + float outputScale = 1.0f / vsi_nn_get_tensor_scale(outputs[0]); + float outputZP = (float)vsi_nn_get_tensor_zero_point(outputs[0]); vx_float32 logE = (vx_float32)(log10(exp(1.0f)) / log10(2.0f)); vsi_bool ret = FALSE; @@ -353,7 +353,6 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_release( &node_params[SCALAR_BETA] ); vsi_nn_kernel_scalar_release( &node_params[SCALAR_LOGE] ); } - } } @@ -372,4 +371,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( swish, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/upsample_cl.c b/src/tim/vx/internal/src/kernel/cl/upsample_cl.c index f2e990c..6f46988 100644 --- a/src/tim/vx/internal/src/kernel/cl/upsample_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/upsample_cl.c @@ -232,7 +232,6 @@ static vsi_status _query_kernel } return status; - } /* _query_kernel() */ @@ -254,11 +253,11 @@ static vsi_nn_kernel_node_t _setup int32_t scale_y = 0; vsi_bool image_2d = FALSE; vsi_bool is_use_u8_kernel = FALSE; - float outputScale = outputs[0]->attr.dtype.scale == 0.0f ? 1.0f : outputs[0]->attr.dtype.scale; - float outputTail = (float)outputs[0]->attr.dtype.zero_point; - float inputScale = inputs[0]->attr.dtype.scale == 0.0f ? 1.0f : inputs[0]->attr.dtype.scale; - float inputTail = (float)inputs[0]->attr.dtype.zero_point; - int32_t outputZp = outputs[0]->attr.dtype.zero_point; + float outputScale = vsi_nn_get_tensor_scale(outputs[0]); + float outputTail = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float inputScale = vsi_nn_get_tensor_scale(inputs[0]); + float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + int32_t outputZp = vsi_nn_get_tensor_zero_point(outputs[0]); float scale_value = 1.0f; float tail_value = 0.0f; @@ -314,10 +313,8 @@ static vsi_nn_kernel_node_t _setup } return node; - } /* _setup() */ __END_DECLS REGISTER_BACKEND_CL( upsample, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cpu/cpu_backend_conv2d_cpu.c b/src/tim/vx/internal/src/kernel/cpu/cpu_backend_conv2d_cpu.c new file mode 100644 index 0000000..d273df6 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/cpu_backend_conv2d_cpu.c @@ -0,0 +1,259 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "cpu_backend/npuref_interface.h" + +__BEGIN_DECLS + +typedef enum +{ + PARAM_INPUT = 0, + PARAM_KERNEL, + PARAM_BIAS, + PARAM_OUTPUT, + PARAM_STRIDE_0, + PARAM_STRIDE_1, + PARAM_PAD_0, + PARAM_PAD_1, + PARAM_PAD_2, + PARAM_PAD_3, + PARAM_DILATION_0, + PARAM_DILATION_1, + PARAM_MULTIPLIER, + PARAM_NUM +} param_index_e; +/* + * Define kernel meta. + */ +#define _INPUT_NUM (3) +#define _OUTPUT_NUM (1) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.cpu_backend_conv2d") +#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) +/* + * Kernel params + */ +static vx_param_description_t _cpu_backend_conv2d_kernel_param_def[] = +{ + { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL }, + { VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, +}; +#define _CPU_BACKEND_CONV2D_PARAM_NUM _cnt_of_array( _cpu_backend_conv2d_kernel_param_def ) + + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_attr_t * attr[_IO_NUM] = { NULL }; + int32_t strides[2]; + int32_t pad[4]; + int32_t dilation[2]; + void * buffer[_IO_NUM] = { NULL }; + int32_t i = 0; + vsi_nn_kernel_tensor_t tensors[_IO_NUM] = { NULL }; + size_t out_elements = 0; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[PARAM_INPUT]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[PARAM_KERNEL]; + tensors[2] = (vsi_nn_kernel_tensor_t)param[PARAM_BIAS]; + tensors[3] = (vsi_nn_kernel_tensor_t)param[PARAM_OUTPUT]; + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + if ( param[PARAM_BIAS] ) + { + attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); + } + attr[3] = vsi_nn_kernel_tensor_attr_create( tensors[3] ); + CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", final ); + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[3] ); + + status = vsi_nn_kernel_scalar_read_int32( 
param[PARAM_STRIDE_0], &strides[0] ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32( param[PARAM_STRIDE_0], &strides[1] ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32( param[PARAM_PAD_0], &pad[0] ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32( param[PARAM_PAD_1], &pad[1] ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32( param[PARAM_PAD_2], &pad[2] ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32( param[PARAM_PAD_2], &pad[3] ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32( param[PARAM_DILATION_0], &dilation[0] ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32( param[PARAM_DILATION_1], &dilation[1] ); + CHECK_STATUS_FAIL_GOTO(status, final ); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], FALSE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final ); + + buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], FALSE ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create kernel buffer fail.", final ); + if ( param[PARAM_BIAS] ) + { + buffer[2] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[2], attr[2], FALSE ); + CHECK_PTR_FAIL_GOTO( buffer[2], "Create bias buffer fail.", final ); + } + buffer[3] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[3], attr[3], FALSE ); + CHECK_PTR_FAIL_GOTO( buffer[3], "Create output buffer fail.", final ); + + npuref_interface_quant_conv2d(buffer[0], attr[0], + buffer[1], attr[1], buffer[2], + pad, strides, dilation, attr[3], buffer[3]); + + status = vsi_nn_kernel_tensor_write( tensors[3], attr[3], + buffer[3], out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + for ( i = 0; i < _IO_NUM; i ++ ) + { + if ( attr[i] ) + { + vsi_nn_kernel_tensor_attr_release( &attr[i] ); + } + if ( buffer[i] ) + { + free( buffer[i] ); + } + } + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _cpu_backend_conv2d_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _cpu_backend_conv2d_kernel_param_def ); + status = VSI_SUCCESS; + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_CPU_BACKEND_CONV2D_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + size_t size = 0; + int32_t* stride = (int32_t *) vsi_nn_kernel_param_get_buffer( params, "stride", &size); + int32_t* pad = (int32_t *) vsi_nn_kernel_param_get_buffer( params, "pad", &size); + int32_t* dilation = (int32_t *) vsi_nn_kernel_param_get_buffer( params, "dilation", &size); + int32_t multiplier = vsi_nn_kernel_param_get_int32(params, "multiplier"); + + status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); + if ( VSI_SUCCESS == status) + { + node = 
vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _CPU_BACKEND_CONV2D_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[4] = vsi_nn_kernel_scalar_create( graph, I32, &stride[0] ); + node_params[5] = vsi_nn_kernel_scalar_create( graph, I32, &stride[1] ); + node_params[6] = vsi_nn_kernel_scalar_create( graph, I32, &pad[0] ); + node_params[7] = vsi_nn_kernel_scalar_create( graph, I32, &pad[1] ); + node_params[8] = vsi_nn_kernel_scalar_create( graph, I32, &pad[2] ); + node_params[9] = vsi_nn_kernel_scalar_create( graph, I32, &pad[3] ); + node_params[10] = vsi_nn_kernel_scalar_create( graph, I32, &dilation[0] ); + node_params[11] = vsi_nn_kernel_scalar_create( graph, I32, &dilation[1] ); + node_params[12] = vsi_nn_kernel_scalar_create( graph, I32, &multiplier ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _CPU_BACKEND_CONV2D_PARAM_NUM ); + + vsi_nn_kernel_scalar_release( &node_params[4] ); + vsi_nn_kernel_scalar_release( &node_params[5] ); + vsi_nn_kernel_scalar_release( &node_params[6] ); + vsi_nn_kernel_scalar_release( &node_params[7] ); + vsi_nn_kernel_scalar_release( &node_params[8] ); + vsi_nn_kernel_scalar_release( &node_params[9] ); + vsi_nn_kernel_scalar_release( &node_params[10] ); + vsi_nn_kernel_scalar_release( &node_params[11] ); + vsi_nn_kernel_scalar_release( &node_params[12] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( cpu_backend_conv2d, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/cpu_backend_deconv2d_cpu.c b/src/tim/vx/internal/src/kernel/cpu/cpu_backend_deconv2d_cpu.c new file mode 100644 index 0000000..b1502a5 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/cpu_backend_deconv2d_cpu.c @@ -0,0 +1,245 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "cpu_backend/npuref_interface.h" + +__BEGIN_DECLS + +typedef enum +{ + PARAM_INPUT = 0, + PARAM_KERNEL, + PARAM_BIAS, + PARAM_OUTPUT, + PARAM_STRIDE_0, + PARAM_STRIDE_1, + PARAM_PAD_0, + PARAM_PAD_1, + PARAM_PAD_2, + PARAM_PAD_3, + + PARAM_NUM +} param_index_e; +/* + * Define kernel meta. + */ +#define _INPUT_NUM (3) +#define _OUTPUT_NUM (1) +#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.cpu_backend_deconv2d") + + +/* + * Kernel params + */ +static vx_param_description_t _cpu_backend_deconv2d_kernel_param_def[] = +{ + { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL }, + { VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, + { VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, +}; +#define _CPU_BACKEND_DECONV2D_PARAM_NUM _cnt_of_array( _cpu_backend_deconv2d_kernel_param_def ) + + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_attr_t * attr[_IO_NUM] = { NULL }; + int32_t strides[2]; + int dilation[2] = {1, 1}; + int32_t pad[4]; + void * buffer[_IO_NUM] = { NULL }; + int32_t i = 0; + vsi_nn_kernel_tensor_t tensors[_IO_NUM] = { NULL }; + size_t out_elements = 0; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[PARAM_INPUT]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[PARAM_KERNEL]; + tensors[2] = (vsi_nn_kernel_tensor_t)param[PARAM_BIAS]; + tensors[3] = (vsi_nn_kernel_tensor_t)param[PARAM_OUTPUT]; + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + if ( param[PARAM_BIAS] ) + { + attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); + } + attr[3] = vsi_nn_kernel_tensor_attr_create( tensors[3] ); + CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", final ); + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[3] ); + + status = vsi_nn_kernel_scalar_read_int32( param[PARAM_STRIDE_0], &strides[0] ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32( param[PARAM_STRIDE_0], &strides[1] ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32( param[PARAM_PAD_0], &pad[0] ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32( param[PARAM_PAD_1], &pad[1] ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32( 
param[PARAM_PAD_2], &pad[2] ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32( param[PARAM_PAD_2], &pad[3] ); + CHECK_STATUS_FAIL_GOTO(status, final ); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], FALSE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final ); + + buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], FALSE ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create kernel buffer fail.", final ); + if ( param[PARAM_BIAS] ) + { + buffer[2] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[2], attr[2], FALSE ); + CHECK_PTR_FAIL_GOTO( buffer[2], "Create bias buffer fail.", final ); + } + buffer[3] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[3], attr[3], FALSE ); + CHECK_PTR_FAIL_GOTO( buffer[3], "Create output buffer fail.", final ); + + npuref_interface_quant_deconv2d(buffer[0], attr[0], + buffer[1], attr[1], buffer[2], + pad, strides, dilation, attr[3], buffer[3]); + + status = vsi_nn_kernel_tensor_write( tensors[3], attr[3], + buffer[3], out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); + +final: + for ( i = 0; i < _IO_NUM; i ++ ) + { + if ( attr[i] ) + { + vsi_nn_kernel_tensor_attr_release( &attr[i] ); + } + if ( buffer[i] ) + { + free( buffer[i] ); + } + } + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _cpu_backend_deconv2d_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _cpu_backend_deconv2d_kernel_param_def ); + status = VSI_SUCCESS; + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_CPU_BACKEND_DECONV2D_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + size_t size = 0; + int32_t* stride = (int32_t *) vsi_nn_kernel_param_get_buffer( params, "stride", &size); + int32_t* pad = (int32_t *) vsi_nn_kernel_param_get_buffer( params, "pad", &size); + + status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _CPU_BACKEND_DECONV2D_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[4] = vsi_nn_kernel_scalar_create( graph, I32, &stride[0] ); + node_params[5] = vsi_nn_kernel_scalar_create( graph, I32, &stride[1] ); + node_params[6] = vsi_nn_kernel_scalar_create( graph, I32, &pad[0] ); + node_params[7] = vsi_nn_kernel_scalar_create( graph, I32, &pad[1] ); + node_params[8] = vsi_nn_kernel_scalar_create( graph, I32, &pad[2] ); + node_params[9] = vsi_nn_kernel_scalar_create( graph, I32, &pad[3] ); + + /* Pass parameters to node. 
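   node_params[0..3] carry the input/kernel/bias/output tensors packed by
   vsi_nn_kernel_node_pack_io() above; node_params[4..5] are the stride scalars and
   node_params[6..9] the four pad scalars. The temporary scalar handles are released
   right after vsi_nn_kernel_node_pass_param() returns.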
*/ + status = vsi_nn_kernel_node_pass_param( node, node_params, _CPU_BACKEND_DECONV2D_PARAM_NUM ); + + vsi_nn_kernel_scalar_release( &node_params[4] ); + vsi_nn_kernel_scalar_release( &node_params[5] ); + vsi_nn_kernel_scalar_release( &node_params[6] ); + vsi_nn_kernel_scalar_release( &node_params[7] ); + vsi_nn_kernel_scalar_release( &node_params[8] ); + vsi_nn_kernel_scalar_release( &node_params[9] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( cpu_backend_deconv2d, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/depthwise_conv1d_cpu.c b/src/tim/vx/internal/src/kernel/cpu/depthwise_conv1d_cpu.c new file mode 100644 index 0000000..46de624 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/depthwise_conv1d_cpu.c @@ -0,0 +1,275 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_error.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" +#include "cpu_backend/npuref_interface.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
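+ * param_index_e below fixes the node parameter order: the input, kernel, bias
+ * and output tensors first, then the stride, pad_front, pad_end, dilation and
+ * multiplier scalars.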
+ */ +typedef enum +{ + PARAM_INPUT = 0, + PARAM_KERNEL, + PARAM_BIAS, + PARAM_OUTPUT, + PARAM_STRIDE, + PARAM_PAD_FRONT, + PARAM_PAD_END, + PARAM_DILATION, + PARAM_MULTIPLIER, + PARAM_NUM +} param_index_e; + +#define _INPUT_NUM (PARAM_NUM) +#define _OUTPUT_NUM (1) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.depthwise_conv1d") +#define _IO_NUM (4) + +/* + * Kernel params + */ +static vx_param_description_t _depthwise_conv1d_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_OPTIONAL}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _DEPTHWISE_CONV1D_PARAM_NUM _cnt_of_array( _depthwise_conv1d_kernel_param_def ) + + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_attr_t * attr[_IO_NUM] = { NULL }; + int32_t stride; + int32_t pad_front; + int32_t pad_end; + int32_t dilation; + int32_t multiplier; + void * buffer[_IO_NUM] = { NULL }; + int32_t i = 0; + vsi_nn_kernel_tensor_t tensors[_IO_NUM] = { NULL }; + size_t out_elements = 0; + + tensors[0] = (vsi_nn_kernel_tensor_t)param[PARAM_INPUT]; + tensors[1] = (vsi_nn_kernel_tensor_t)param[PARAM_KERNEL]; + tensors[2] = (vsi_nn_kernel_tensor_t)param[PARAM_BIAS]; + tensors[3] = (vsi_nn_kernel_tensor_t)param[PARAM_OUTPUT]; + attr[0] = vsi_nn_kernel_tensor_attr_create( tensors[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + if( param[PARAM_BIAS] ) + { + attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); + CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); + } + attr[3] = vsi_nn_kernel_tensor_attr_create( tensors[3] ); + CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", final ); + out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[3] ); + + status = vsi_nn_kernel_scalar_read_int32( param[PARAM_STRIDE], &stride ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32( param[PARAM_PAD_FRONT], &pad_front ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32( param[PARAM_PAD_END], &pad_end ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32( param[PARAM_DILATION], &dilation ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32( param[PARAM_MULTIPLIER], &multiplier ); + CHECK_STATUS_FAIL_GOTO(status, final ); + + buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], FALSE ); + CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final ); + + buffer[1] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[1], attr[1], FALSE ); + CHECK_PTR_FAIL_GOTO( buffer[1], "Create kernel buffer fail.", final ); + if( param[PARAM_BIAS] ) + { + buffer[2] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[2], attr[2], FALSE ); + CHECK_PTR_FAIL_GOTO( buffer[2], "Create 
bias buffer fail.", final ); + } + buffer[3] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[3], attr[3], FALSE ); + CHECK_PTR_FAIL_GOTO( buffer[3], "Create output buffer fail.", final ); + + + { + // Use conv2d compute + int32_t input_shape_4d[4] = {1,0,0,0}; + int32_t kernel_shape_4d[4] = {1,0,0,0}; + int32_t output_shape_4d[4] = {1,0,0,0}; + memcpy( &input_shape_4d[1], attr[0]->shape->data, 3 * sizeof(int32_t) ); + memcpy( &kernel_shape_4d[1], attr[1]->shape->data, 3 * sizeof(int32_t) ); + memcpy( &output_shape_4d[1], attr[3]->shape->data, 3 * sizeof(int32_t) ); + npuref_interface_quant_depthwise_conv2d( + buffer[0], buffer[1], buffer[2], + input_shape_4d, 4, + kernel_shape_4d, 4, + output_shape_4d, 4, + attr[0]->asymm.scale, attr[0]->asymm.zero_point, + attr[1]->asymm.scale, attr[1]->asymm.zero_point, + attr[3]->asymm.scale, attr[3]->asymm.zero_point, + pad_front, pad_end, 0, 0, + stride, 1, dilation, 1, + buffer[3] + ); + status = vsi_nn_kernel_tensor_write( tensors[3], attr[3], + buffer[3], out_elements ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + for( i = 0; i < _IO_NUM; i ++ ) + { + if( attr[i] ) + { + vsi_nn_kernel_tensor_attr_release( &attr[i] ); + } + if( buffer[i] ) + { + free( buffer[i] ); + } + } + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _depthwise_conv1d_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _depthwise_conv1d_kernel_param_def ); + status = VSI_SUCCESS; + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_DEPTHWISE_CONV1D_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t stride = vsi_nn_kernel_param_get_int32( params, "stride" ); + int32_t pad_front = vsi_nn_kernel_param_get_int32( params, "pad_front" ); + int32_t pad_end = vsi_nn_kernel_param_get_int32( params, "pad_end" ); + int32_t dilation = vsi_nn_kernel_param_get_int32( params, "dilation" ); + int32_t multiplier = vsi_nn_kernel_param_get_int32( params, "multiplier" ); + + if(!( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8 + && inputs[1]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8 + && outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8)) + { + //TODO: Support other types + return NULL; + } + + if( !npuref_exists() ) + { + return NULL; + } + + status = _query_kernel( kernel, inputs, outputs ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _DEPTHWISE_CONV1D_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[PARAM_STRIDE] = vsi_nn_kernel_scalar_create( graph, I32, &stride ); + node_params[PARAM_PAD_FRONT] = vsi_nn_kernel_scalar_create( graph, I32, &pad_front ); + node_params[PARAM_PAD_END] = vsi_nn_kernel_scalar_create( graph, I32, &pad_end ); + node_params[PARAM_DILATION] = vsi_nn_kernel_scalar_create( graph, I32, &dilation ); + node_params[PARAM_MULTIPLIER] = 
vsi_nn_kernel_scalar_create( graph, I32, &multiplier ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, + _DEPTHWISE_CONV1D_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[PARAM_STRIDE] ); + vsi_nn_kernel_scalar_release( &node_params[PARAM_PAD_FRONT] ); + vsi_nn_kernel_scalar_release( &node_params[PARAM_PAD_END] ); + vsi_nn_kernel_scalar_release( &node_params[PARAM_DILATION] ); + vsi_nn_kernel_scalar_release( &node_params[PARAM_MULTIPLIER] ); + } + else + { + status = VSI_FAILURE; + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( depthwise_conv1d, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/eltwise_unary_cpu.c b/src/tim/vx/internal/src/kernel/cpu/eltwise_unary_cpu.c index a00cfcb..3aa63e2 100644 --- a/src/tim/vx/internal/src/kernel/cpu/eltwise_unary_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/eltwise_unary_cpu.c @@ -52,7 +52,7 @@ typedef enum } unary_type_e; -#define _CPU_ARG_NUM (2) +#define _CPU_ARG_NUM (3) #define _CPU_INPUT_NUM (1) #define _CPU_OUTPUT_NUM (1) #define _CPU_IO_NUM (_CPU_INPUT_NUM + _CPU_OUTPUT_NUM) @@ -84,9 +84,9 @@ static float neg_eval(float data) return data * -1.0f; } -static float hsigmoid_eval(float data) +static float hsigmoid_eval(float data, float alpha, float beta) { - data = (float)(0.2 * data + 0.5); + data = (float)(alpha * data + beta); data = vsi_nn_clamp(data, 0, 1); return data; @@ -177,6 +177,7 @@ DEF_KERNEL_EXECUTOR(_eltwise_unary_exec) vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; int32_t i; float alpha = 0; + float beta = 0; int32_t unary_type = 0; tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; @@ -191,6 +192,8 @@ DEF_KERNEL_EXECUTOR(_eltwise_unary_exec) CHECK_STATUS_FAIL_GOTO(status, final ); status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[3], &alpha); CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[4], &beta); + CHECK_STATUS_FAIL_GOTO(status, final ); buffer[0] = (float*)vsi_nn_kernel_tensor_create_buffer( tensors[0], attr[0], TRUE ); CHECK_PTR_FAIL_GOTO( buffer[0], "Create input buffer fail.", final ); @@ -222,7 +225,7 @@ DEF_KERNEL_EXECUTOR(_eltwise_unary_exec) data = neg_eval(data); break; case UNARY_HSIGMOID: - data = hsigmoid_eval(data); + data = hsigmoid_eval(data, alpha, beta); break; case UNARY_MISH: data = mish_eval(data); @@ -268,10 +271,12 @@ static vx_param_description_t kernel_param_def[] = {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, }; #define INPUT_FUNC_TYPE (2) #define INPUT_SCALAR_ALPHA (3) +#define INPUT_SCALAR_BETA (4) static const vx_kernel_description_t _kernel_info = { @@ -314,6 +319,7 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; vsi_nn_kernel_node_t node = NULL; float alpha = vsi_nn_kernel_param_get_float32( params, "alpha" ); + float beta = vsi_nn_kernel_param_get_float32( params, "beta" ); status = _query_kernel( inputs, outputs, kernel ); if( VSI_SUCCESS == status) @@ -328,11 +334,14 @@ static vsi_nn_kernel_node_t _setup graph, I32, &unary_type ); backend_params[INPUT_SCALAR_ALPHA] = vsi_nn_kernel_scalar_create( graph, F32, &alpha ); + backend_params[INPUT_SCALAR_BETA] = vsi_nn_kernel_scalar_create( + graph, F32, &beta ); /* Pass parameters to node. 
*/ status = vsi_nn_kernel_node_pass_param( node, backend_params, _CPU_PARAM_NUM ); vsi_nn_kernel_scalar_release( &backend_params[INPUT_FUNC_TYPE] ); vsi_nn_kernel_scalar_release( &backend_params[INPUT_SCALAR_ALPHA] ); + vsi_nn_kernel_scalar_release( &backend_params[INPUT_SCALAR_BETA] ); } else { diff --git a/src/tim/vx/internal/src/kernel/cpu/gather_nd_cpu.c b/src/tim/vx/internal/src/kernel/cpu/gather_nd_cpu.c index 13d10e7..33e8b33 100644 --- a/src/tim/vx/internal/src/kernel/cpu/gather_nd_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/gather_nd_cpu.c @@ -100,9 +100,9 @@ DEF_KERNEL_EXECUTOR(_gather_nd_exec) } indices_num /= coord_stride; - if(coord_stride <= 3) // reshape 3D + if(coord_stride <= 4) // reshape 3D { - vsi_ssize_t stride[3] = {block_size, 0, 0}; + vsi_ssize_t stride[4] = {block_size, 0, 0, 0}; for(i = 1; i < coord_stride; ++i) { stride[i] = stride[i - 1] * attr[0]->shape->data[i]; @@ -111,7 +111,7 @@ DEF_KERNEL_EXECUTOR(_gather_nd_exec) for(i = 0; i < indices_num; i++) { vsi_size_t out_index = i * block_size; - uint32_t coord[3] = {0}; + uint32_t coord[4] = {0}; vsi_size_t in_index = 0; int32_t j = 0; @@ -119,7 +119,7 @@ DEF_KERNEL_EXECUTOR(_gather_nd_exec) { coord[j] = buffer_idx[i * coord_stride + j]; } - in_index = coord[2] * stride[2] + coord[1] * stride[1] + coord[0] * stride[0]; + in_index = coord[3] * stride[3] + coord[2] * stride[2] + coord[1] * stride[1] + coord[0] * stride[0]; memcpy(&(buffer[1][out_index]), &(buffer[0][in_index]), block_size * sizeof(float)); } } diff --git a/src/tim/vx/internal/src/kernel/cpu/generate_proposals_cpu.c b/src/tim/vx/internal/src/kernel/cpu/generate_proposals_cpu.c new file mode 100644 index 0000000..f764c18 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/generate_proposals_cpu.c @@ -0,0 +1,507 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +#define _INPUT_NUM (4) +#define _OUTPUT_NUM (3) + #define _TENSOR_NUM (_INPUT_NUM + _OUTPUT_NUM) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.generate_proposals") + + +typedef struct vsi_nn_box_encoding_corner_t +{ + float x1, y1, x2, y2; +}vsi_nn_box_encoding_corner; + +typedef struct vsi_nn_box_encoding_center_t +{ + float w, h, x, y; +}vsi_nn_box_encoding_center; +/* + * Kernel params + */ +static vx_param_description_t _generate_proposals_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _GENERATE_PROPOSALS_PARAM_NUM _cnt_of_array( _generate_proposals_kernel_param_def ) + + +static void _to_box_encoding_corner + ( + vsi_nn_box_encoding_center* ctr, + vsi_nn_box_encoding_corner* cnr + ) +{ + cnr->x1 = ctr->x - ctr->w / 2; + cnr->y1 = ctr->y - ctr->h / 2; + cnr->x2 = ctr->x + ctr->w / 2; + cnr->y2 = ctr->y + ctr->h / 2; +} + +static void _to_box_encoding_center + ( + vsi_nn_box_encoding_corner* cnr, + vsi_nn_box_encoding_center* ctr + ) +{ + ctr->w = cnr->x2 - cnr->x1; + ctr->h = cnr->y2 - cnr->y1; + ctr->x = (cnr->x1 + cnr->x2) / 2; + ctr->y = (cnr->y1 + cnr->y2) / 2; +} + +static void _iota + ( + int32_t * data, + uint32_t len, + int32_t value + ) +{ + uint32_t i; + for (i = 0; i < len; i++) + { + data [i] = value; + value++; + } +} + +// swap_element is implemented in vsi_nn_kernel_box_with_nms_limit.c +void swap_element + ( + uint32_t* list, + uint32_t first, + uint32_t second + ); + +// max_element is implemented in vsi_nn_kernel_box_with_nms_limit.c +uint32_t max_element + ( + float* data, + uint32_t* index_list, + uint32_t len + ); + +// getIoUAxisAligned is implemented in vsi_nn_kernel_box_with_nms_limit.c +float getIoUAxisAligned + ( + const float* roi1, + const float* roi2 + ); + +// sort_element_by_score is implemented in vsi_nn_kernel_box_with_nms_limit.c +void sort_element_by_score + ( + float* data, + uint32_t* index_list, + uint32_t len + ); + +void _filter_boxes + ( + const float* roiBase, + const float* imageInfoBase, + float minSize, + uint32_t* select, + uint32_t* len + ) +{ + const uint32_t kRoiDim = 4; + uint32_t i = 0; + uint32_t j = 0; + + for (j = 0; j < *len; j++) + { + const float* roiInfo = roiBase + select[j] * kRoiDim; + float roiWidth, roiHeight, xRoiCenter, yRoiCenter; + roiWidth = roiInfo[2] - roiInfo[0]; + roiHeight = roiInfo[3] - roiInfo[1]; + xRoiCenter = roiInfo[0] + roiWidth / 2.0f; + yRoiCenter = roiInfo[1] + roiHeight / 2.0f; + if (roiWidth > minSize && roiHeight > minSize && xRoiCenter < imageInfoBase[1] + && yRoiCenter < imageInfoBase[0]) + { + select[i] = select[j]; + i++; + } + } + *len = i; +} + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + 
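    /* Overall flow of this reference kernel, as implemented below (summary only,
     * not part of the original source):
     *   1. expand the per-cell anchors into absolute ROIs over the score map
     *      (height x width grid, stepped by heightStride/widthStride),
     *   2. apply the bbox deltas in center form and clip the result to the
     *      image size taken from the imageInfo input,
     *   3. keep the preNmsTopN highest scores, then drop boxes whose width,
     *      height or center fail the minSize / image-bounds test,
     *   4. run hard NMS with iouThreshold and emit at most postNmsTopN
     *      (score, roi, batch index) triples per batch.
     * The corner/center helpers defined above are exact inverses; e.g. the
     * corner box (1, 1, 5, 3) maps to w = 4, h = 2, x = 3, y = 2. The NMS
     * helpers (swap_element, max_element, getIoUAxisAligned,
     * sort_element_by_score) are only declared here; their definitions live
     * in the box_with_nms_limit kernel source, as noted above. */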
vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; + vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_elements[_OUTPUT_NUM] = {0}; + vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; + uint32_t i; + float heightStride; + float widthStride; + int32_t preNmsTopN; + int32_t postNmsTopN; + float iouThreshold; + float minSize; + + /* prepare data */ + for (i = 0; i < _INPUT_NUM; i ++) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); + + } + for (i = 0; i < _OUTPUT_NUM; i ++) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + out_bytes[i] = out_elements[i] * sizeof(float); + f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_bytes[i] ); + } + + status = vsi_nn_kernel_scalar_read_float32( param[_TENSOR_NUM], &heightStride ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_float32( param[_TENSOR_NUM + 1], &widthStride ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32( param[_TENSOR_NUM + 2], &preNmsTopN ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32( param[_TENSOR_NUM + 3], &postNmsTopN ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_float32( param[_TENSOR_NUM + 4], &iouThreshold ); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_float32( param[_TENSOR_NUM + 5], &minSize ); + CHECK_STATUS_FAIL_GOTO(status, final ); + + { + uint32_t h, w, a, b, j; + const uint32_t kRoiDim = 4; + vsi_size_t numBatches = in_attr[0]->shape->data[3]; + vsi_size_t height = in_attr[0]->shape->data[2]; + vsi_size_t width = in_attr[0]->shape->data[1]; + vsi_size_t numAnchors = in_attr[0]->shape->data[0]; + vsi_size_t imageInfoLength = in_attr[3]->shape->data[0]; + + vsi_size_t batchSize = height * width * numAnchors; + vsi_size_t roiBufferSize = batchSize * kRoiDim; + + float * roiBuffer = (float*)malloc(roiBufferSize * sizeof(float)); + float * roiTransformedBuffer = (float*)malloc(roiBufferSize * sizeof(float)); + uint32_t* select = (uint32_t*)malloc(batchSize * sizeof(uint32_t)); + uint32_t index = 0; + vsi_size_t scores_index = 0; + vsi_size_t bboxDeltas_index = 0; + vsi_size_t imageInfo_index = 0; + uint32_t scores_out_index = 0; + uint32_t roi_out_index = 0; + + // Compute the roi region for each anchor. 
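        /* Each anchor is stored relative to a cell origin, so the absolute ROI
         * for grid cell (w, h) is the anchor shifted by
         * (w * widthStride, h * heightStride) on both corners. For example,
         * with widthStride = heightStride = 16 and anchor (-8, -8, 8, 8)
         * (assumed example values; the real strides come from the scalar
         * params), cell (w, h) = (2, 1) yields the ROI (24, 8, 40, 24). */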
+ for(h = 0; h < height; h++) + { + float hShift = h * heightStride; + for(w = 0; w < width; w++) + { + float wShift = w * widthStride; + uint32_t anchor_index = 0; + for(a = 0; a < numAnchors; a++) + { + roiBuffer[index] = f32_in_buffer[2][anchor_index] + wShift; + roiBuffer[index + 1] = f32_in_buffer[2][anchor_index + 1] + hShift; + roiBuffer[index + 2] = f32_in_buffer[2][anchor_index + 2] + wShift; + roiBuffer[index + 3] = f32_in_buffer[2][anchor_index + 3] + hShift; + + index += kRoiDim; + anchor_index += kRoiDim; + } + } + } + + for (b = 0; b < numBatches; b++) + { + const uint32_t roiLength = 4; + + vsi_size_t numRois = batchSize; + vsi_size_t roiIndex; + uint32_t select_len; + int32_t numDetections = 0; + for (roiIndex = 0; roiIndex < numRois; roiIndex++) + { + float imageHeight = f32_in_buffer[3][imageInfo_index]; + float imageWidth = f32_in_buffer[3][imageInfo_index + 1]; + vsi_nn_box_encoding_corner roi_cnr; + vsi_nn_box_encoding_center roiBefore; + roi_cnr.x1 = roiBuffer[roiIndex * roiLength]; + roi_cnr.y1 = roiBuffer[roiIndex * roiLength + 1]; + roi_cnr.x2 = roiBuffer[roiIndex * roiLength + 2]; + roi_cnr.y2 = roiBuffer[roiIndex * roiLength + 3]; + _to_box_encoding_center(&roi_cnr, &roiBefore); + { + vsi_nn_box_encoding_center roi_ctr; + vsi_nn_box_encoding_corner roiAfter; + vsi_nn_box_encoding_corner cliped; + vsi_size_t idx = bboxDeltas_index + roiIndex * roiLength; + roi_ctr.w = (float)(exp(f32_in_buffer[1][idx + 2]) * roiBefore.w); + roi_ctr.h = (float)(exp(f32_in_buffer[1][idx + 3]) * roiBefore.h); + roi_ctr.x = roiBefore.x + f32_in_buffer[1][idx] * roiBefore.w; + roi_ctr.y = roiBefore.y + f32_in_buffer[1][idx + 1] * roiBefore.h; + _to_box_encoding_corner(&roi_ctr, &roiAfter); + cliped.x1 = vsi_nn_min(vsi_nn_max(roiAfter.x1, 0.0f), imageWidth); + cliped.y1 = vsi_nn_min(vsi_nn_max(roiAfter.y1, 0.0f), imageHeight); + cliped.x2 = vsi_nn_min(vsi_nn_max(roiAfter.x2, 0.0f), imageWidth); + cliped.y2 = vsi_nn_min(vsi_nn_max(roiAfter.y2, 0.0f), imageHeight); + roiTransformedBuffer[idx] = cliped.x1; + roiTransformedBuffer[idx + 1] = cliped.y1; + roiTransformedBuffer[idx + 2] = cliped.x2; + roiTransformedBuffer[idx + 3] = cliped.y2; + } + } + + // Find the top preNmsTopN scores. + _iota((int32_t*)select, (uint32_t)batchSize, 0); + select_len = (uint32_t)batchSize; + if(preNmsTopN > 0 && preNmsTopN < (int32_t)batchSize) + { + sort_element_by_score(&(f32_in_buffer[0][scores_index]), + select, (uint32_t)batchSize); + select_len = preNmsTopN; + } + + // Filter boxes, disgard regions with height or width < minSize. + _filter_boxes(roiTransformedBuffer, &(f32_in_buffer[3][0]), + minSize, select, &select_len); + + // Apply hard NMS. + if (postNmsTopN < 0) + { + postNmsTopN = select_len; + } + + for (j = 0; (j < select_len && numDetections < postNmsTopN); j++) + { + // find max score and swap to the front. + int32_t max_index = max_element(&(f32_in_buffer[0][scores_index]), + &(select[j]), select_len - j) + j; + swap_element(select, max_index, j); + + // Calculate IoU of the rest, swap to the end (disgard) ifneeded. 
+ for (i = j + 1; i < select_len; i++) + { + int32_t roiBase0 = select[i] * kRoiDim; + int32_t roiBase1 = select[j] * kRoiDim; + float iou = getIoUAxisAligned(&(roiTransformedBuffer[roiBase0]), + &(roiTransformedBuffer[roiBase1])); + + if (iou >= iouThreshold) + { + swap_element(select, i, select_len - 1); + i--; + select_len--; + } + } + numDetections++; + } + + for (i = 0; i < select_len; i++) + { + memcpy(&(f32_out_buffer[1][roi_out_index]), + &(roiTransformedBuffer[select[i] * kRoiDim]), kRoiDim * sizeof(float)); + f32_out_buffer[0][scores_out_index] = + f32_in_buffer[0][scores_index + select[i]]; + f32_out_buffer[2][scores_out_index] = (float)b; + scores_out_index++; + roi_out_index += kRoiDim; + } + + scores_index += batchSize; + bboxDeltas_index += roiBufferSize; + imageInfo_index += imageInfoLength; + } + + vsi_nn_safe_free(roiBuffer); + vsi_nn_safe_free(roiTransformedBuffer); + vsi_nn_safe_free(select); + } + + /* save data */ + for(i = 0; i < _OUTPUT_NUM; i++) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + for (i = 0; i < _INPUT_NUM; i++) + { + if (f32_in_buffer[i]) + { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + if (in_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + for(i = 0; i < _OUTPUT_NUM; i++) + { + if (f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _generate_proposals_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _generate_proposals_kernel_param_def ); + status = VSI_SUCCESS; + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_GENERATE_PROPOSALS_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + float height_stride = vsi_nn_kernel_param_get_float32( params, "height_stride"); + float width_stride = vsi_nn_kernel_param_get_float32( params, "width_stride"); + int32_t pre_nms_top_n = vsi_nn_kernel_param_get_int32( params, "pre_nms_top_n"); + int32_t post_nms_top_n = vsi_nn_kernel_param_get_int32( params, "post_nms_top_n"); + float iou_threshold = vsi_nn_kernel_param_get_float32(params, "iou_threshold"); + float min_size = vsi_nn_kernel_param_get_float32(params, "min_size"); + + status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _GENERATE_PROPOSALS_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[_TENSOR_NUM ] = vsi_nn_kernel_scalar_create( graph, F32, &height_stride ); + node_params[_TENSOR_NUM + 1] = vsi_nn_kernel_scalar_create( graph, F32, 
&width_stride ); + node_params[_TENSOR_NUM + 2] = vsi_nn_kernel_scalar_create( graph, I32, &pre_nms_top_n ); + node_params[_TENSOR_NUM + 3] = vsi_nn_kernel_scalar_create( graph, I32, &post_nms_top_n ); + node_params[_TENSOR_NUM + 4] = vsi_nn_kernel_scalar_create( graph, F32, &iou_threshold ); + node_params[_TENSOR_NUM + 5] = vsi_nn_kernel_scalar_create( graph, F32, &min_size ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _GENERATE_PROPOSALS_PARAM_NUM ); + + vsi_nn_kernel_scalar_release( &node_params[_TENSOR_NUM ] ); + vsi_nn_kernel_scalar_release( &node_params[_TENSOR_NUM + 1] ); + vsi_nn_kernel_scalar_release( &node_params[_TENSOR_NUM + 2] ); + vsi_nn_kernel_scalar_release( &node_params[_TENSOR_NUM + 3] ); + vsi_nn_kernel_scalar_release( &node_params[_TENSOR_NUM + 4] ); + vsi_nn_kernel_scalar_release( &node_params[_TENSOR_NUM + 5] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( generate_proposals, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cpu/grucell_activation_z_h_cpu.c b/src/tim/vx/internal/src/kernel/cpu/grucell_activation_z_h_cpu.c new file mode 100644 index 0000000..a5bd220 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/grucell_activation_z_h_cpu.c @@ -0,0 +1,261 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +#define _INPUT_NUM (GRUCELL_ACT_Z_H_IN_CNT) +#define _OUTPUT_NUM (GRUCELL_ACT_Z_H_OUT_CNT) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.grucell_activation_z_h") + + +/* + * Kernel params + */ +static vx_param_description_t _grucell_activation_z_h_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, /*activation*/ + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, /*recurrent_activation*/ +}; +#define _GRUCELL_ACTIVATION_Z_H_PARAM_NUM _cnt_of_array( _grucell_activation_z_h_kernel_param_def ) +#define SCALAR_ACTIVATION (7) +#define SCALAR_R_ACTIVATION (8) + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; + vsi_size_t in_stride_size[_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_elements[_OUTPUT_NUM] = {0}; + vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; + vsi_size_t i, b; + int32_t activation = 0; + int32_t recurrent_activation = 0; + vsi_size_t n_batch = 0; + vsi_size_t n_cell = 0; + + /* prepare data */ + for ( i = 0; i < _INPUT_NUM; i++ ) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + if (input[i]) + { + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + vsi_nn_kernel_tensor_attr_get_stride( in_attr[i], in_stride_size[i] ); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); + } + } + + for ( i = 0; i < _OUTPUT_NUM; i++ ) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + if (output[i]) + { + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + out_bytes[i] = out_elements[i] * sizeof(float); + f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_bytes[i] ); + } + } + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_ACTIVATION], &activation ); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_R_ACTIVATION], + &recurrent_activation ); + CHECK_STATUS_FAIL_GOTO(status, final ); + n_cell = in_attr[GRUCELL_ACT_Z_H_HSTATE]->shape->data[0]; + n_batch = in_attr[GRUCELL_ACT_Z_H_HSTATE]->shape->data[1]; + + for (b = 0; b < n_batch; b ++) + { + for (i = 0; i < n_cell; i++) + { + vsi_size_t index = i + n_cell * b; + float data_z_t = 0; + float data_h_t = 0; + float hstate_in = f32_in_buffer[GRUCELL_ACT_Z_H_HSTATE][index]; + 
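            /* Per-element GRU update computed below:
             *   z_t  = recurrent_activation(i_fc_z + h_fc_z)   (update gate)
             *   h~_t = activation(i_fc_h + h_fc_h)             (candidate state)
             *   h_t  = (1 - z_t) * h~_t + z_t * h_{t-1}
             * and h_t is written to both the output and the new hidden state.
             * The two activation selectors come from the scalar params; sigmoid
             * for the gate and tanh for the candidate are the usual choices, but
             * any value understood by vsi_nn_activation() is accepted. */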
float dst = 0; + + data_z_t = f32_in_buffer[GRUCELL_ACT_Z_H_I_FC_Z][index]; + data_z_t += f32_in_buffer[GRUCELL_ACT_Z_H_H_FC_Z][index]; + data_z_t = vsi_nn_activation(data_z_t, recurrent_activation); + + data_h_t = f32_in_buffer[GRUCELL_ACT_Z_H_I_FC_H][index]; + data_h_t += f32_in_buffer[GRUCELL_ACT_Z_H_H_FC_H][index]; + data_h_t = vsi_nn_activation(data_h_t, activation); + + dst = (1 - data_z_t ) * data_h_t + data_z_t * hstate_in; + + f32_out_buffer[GRUCELL_ACT_Z_H_OUT_OUTPUT][index] = dst; + f32_out_buffer[GRUCELL_ACT_Z_H_OUT_HSTATE][index] = dst; + } + } + + /* save data */ + for (i = 0; i < _OUTPUT_NUM; i++) + { + if (output[i]) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + } + +final: + for (i = 0; i < _INPUT_NUM; i++) + { + if (f32_in_buffer[i]) + { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + + if (in_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + for (i = 0; i < _OUTPUT_NUM; i++) + { + if (f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_SUCCESS; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _grucell_activation_z_h_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _grucell_activation_z_h_kernel_param_def ); + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_GRUCELL_ACTIVATION_Z_H_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + int32_t activation = vsi_nn_kernel_param_get_int32( params, "activation" ); + int32_t recurrent_activation = vsi_nn_kernel_param_get_int32( params, "recurrent_activation" ); + + status = _query_kernel( kernel, inputs, outputs ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _GRUCELL_ACTIVATION_Z_H_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. 
*/ + node_params[SCALAR_ACTIVATION] = vsi_nn_kernel_scalar_create( + graph, I32, &activation ); + node_params[SCALAR_R_ACTIVATION] = vsi_nn_kernel_scalar_create( + graph, I32, &recurrent_activation ); + status = vsi_nn_kernel_node_pass_param( node, node_params, _GRUCELL_ACTIVATION_Z_H_PARAM_NUM ); + + vsi_nn_kernel_scalar_release( &node_params[SCALAR_ACTIVATION] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_R_ACTIVATION] ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( grucell_activation_z_h, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/grucell_h_times_activation_r_cpu.c b/src/tim/vx/internal/src/kernel/cpu/grucell_h_times_activation_r_cpu.c new file mode 100644 index 0000000..b61f92e --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/grucell_h_times_activation_r_cpu.c @@ -0,0 +1,245 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +#define _INPUT_NUM (3) +#define _OUTPUT_NUM (1) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.grucell_h_times_activation_r") + + +/* + * Kernel params + */ +static vx_param_description_t _grucell_h_times_activation_r_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, /*recurrent_activation*/ + // Add kererl parameters here +}; +#define _GRUCELL_H_TIMES_ACTIVATION_R_PARAM_NUM _cnt_of_array( _grucell_h_times_activation_r_kernel_param_def ) +#define SCALAR_R_ACTIVATION (4) + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; + vsi_size_t in_stride_size[_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_elements[_OUTPUT_NUM] = {0}; + vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; + vsi_size_t i, b; + int32_t recurrent_activation = 0; + vsi_size_t n_batch = 0; + vsi_size_t n_cell = 0; + + /* prepare data */ + for( i = 0; i < _INPUT_NUM; i++ ) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + if (input[i]) + { + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + vsi_nn_kernel_tensor_attr_get_stride( in_attr[i], in_stride_size[i] ); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); + } + } + + for( i = 0; i < _OUTPUT_NUM; i++ ) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + if (output[i]) + { + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + out_bytes[i] = out_elements[i] * sizeof(float); + f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_bytes[i] ); + } + } + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_R_ACTIVATION], + &recurrent_activation ); + CHECK_STATUS_FAIL_GOTO(status, final ); + n_cell = in_attr[0]->shape->data[0]; + n_batch = in_attr[0]->shape->data[1]; + + for (b = 0; b < n_batch; b ++) + { + for (i = 0; i < n_cell; i++) + { + vsi_size_t index = i + n_cell * b; + float data_r_t = 0; + float r_times_h = 0; + float hstate_in = f32_in_buffer[0][index]; + + data_r_t = f32_in_buffer[1][index]; + data_r_t += f32_in_buffer[2][index]; + + data_r_t = vsi_nn_activation(data_r_t, recurrent_activation); + + r_times_h = hstate_in * data_r_t; + + f32_out_buffer[0][index] = r_times_h; + } + } + + /* save data */ + for(i = 0; i < _OUTPUT_NUM; i++) + { + if (output[i]) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } 
+ } + +final: + for (i = 0; i < _INPUT_NUM; i++) + { + if (f32_in_buffer[i]) + { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + + if (in_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + for(i = 0; i < _OUTPUT_NUM; i++) + { + if (f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_SUCCESS; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _grucell_h_times_activation_r_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _grucell_h_times_activation_r_kernel_param_def ); + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_GRUCELL_H_TIMES_ACTIVATION_R_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + int32_t recurrent_activation = vsi_nn_kernel_param_get_int32( params, "recurrent_activation" ); + + status = _query_kernel( kernel, inputs, outputs /* Add extra params */ ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _GRUCELL_H_TIMES_ACTIVATION_R_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_R_ACTIVATION] = vsi_nn_kernel_scalar_create( + graph, I32, &recurrent_activation ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _GRUCELL_H_TIMES_ACTIVATION_R_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_R_ACTIVATION] ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( grucell_h_times_activation_r, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/grucell_reset_after_activation_cpu.c b/src/tim/vx/internal/src/kernel/cpu/grucell_reset_after_activation_cpu.c new file mode 100644 index 0000000..cfd0eb1 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/grucell_reset_after_activation_cpu.c @@ -0,0 +1,271 @@ +/**************************************************************************** +* +* Copyright (c) 2021 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +#define _INPUT_NUM (GRUCELL_ACT_IN_CNT) +#define _OUTPUT_NUM (GRUCELL_ACT_OUT_CNT) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.grucell_reset_after_activation") + + +/* + * Kernel params + */ +static vx_param_description_t _grucell_reset_after_activation_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, /*activation*/ + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED }, /*recurrent_activation*/ + // Add kererl parameters here +}; +#define _GRUCELL_RESET_AFTER_ACTIVATION_PARAM_NUM _cnt_of_array( _grucell_reset_after_activation_kernel_param_def ) +#define SCALAR_ACTIVATION (9) +#define SCALAR_R_ACTIVATION (10) +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; + vsi_size_t in_stride_size[_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_elements[_OUTPUT_NUM] = {0}; + vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; + vsi_size_t i, b; + int32_t activation = 0; + int32_t recurrent_activation = 0; + vsi_size_t n_batch = 0; + vsi_size_t n_cell = 0; + + /* prepare data */ + for ( i = 0; i < _INPUT_NUM; i++ ) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + if (input[i]) + { + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + vsi_nn_kernel_tensor_attr_get_stride( in_attr[i], in_stride_size[i] ); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); + } + } + + for ( i = 0; i < _OUTPUT_NUM; i++ ) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + if (output[i]) + { + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] 
); + out_bytes[i] = out_elements[i] * sizeof(float); + f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_bytes[i] ); + } + } + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_ACTIVATION], &activation ); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_R_ACTIVATION], + &recurrent_activation ); + CHECK_STATUS_FAIL_GOTO(status, final ); + n_cell = in_attr[GRUCELL_ACT_H_STATE]->shape->data[0]; + n_batch = in_attr[GRUCELL_ACT_H_STATE]->shape->data[1]; + + for (b = 0; b < n_batch; b ++) + { + for (i = 0; i < n_cell; i++) + { + vsi_size_t index = i + n_cell * b; + float data_z_t = 0; + float data_r_t = 0; + float data_h_t = 0; + float r_times_h = 0; + float hstate_in = f32_in_buffer[GRUCELL_ACT_H_STATE][index]; + float dst = 0; + + data_z_t = f32_in_buffer[GRUCELL_ACT_I_FC_Z][index]; + data_r_t = f32_in_buffer[GRUCELL_ACT_I_FC_R][index]; + data_h_t = f32_in_buffer[GRUCELL_ACT_I_FC_H][index]; + data_z_t += f32_in_buffer[GRUCELL_ACT_H_FC_Z][index]; + data_r_t += f32_in_buffer[GRUCELL_ACT_H_FC_R][index]; + + data_z_t = vsi_nn_activation(data_z_t, recurrent_activation); + data_r_t = vsi_nn_activation(data_r_t, recurrent_activation); + + r_times_h = f32_in_buffer[GRUCELL_ACT_H_FC_H][index] * data_r_t; + data_h_t += r_times_h; + + data_h_t = vsi_nn_activation(data_h_t, activation); + + dst = (1 - data_z_t ) * data_h_t + data_z_t * hstate_in; + + f32_out_buffer[GRUCELL_ACT_OUT_OUTPUT][index] = dst; + f32_out_buffer[GRUCELL_ACT_OUT_H_STATE][index] = dst; + } + } + + /* save data */ + for (i = 0; i < _OUTPUT_NUM; i++) + { + if (output[i]) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + } + +final: + for (i = 0; i < _INPUT_NUM; i++) + { + if (f32_in_buffer[i]) + { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + + if (in_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + for (i = 0; i < _OUTPUT_NUM; i++) + { + if (f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + /* Add extra params */ + ) +{ + vsi_status status = VSI_SUCCESS; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + kernel->info.parameters = _grucell_reset_after_activation_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _grucell_reset_after_activation_kernel_param_def ); + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_GRUCELL_RESET_AFTER_ACTIVATION_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + int32_t activation = vsi_nn_kernel_param_get_int32( params, "activation" ); + int32_t recurrent_activation = vsi_nn_kernel_param_get_int32( params, "recurrent_activation" ); + + status = _query_kernel( kernel, 
inputs, outputs ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _GRUCELL_RESET_AFTER_ACTIVATION_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_ACTIVATION] = vsi_nn_kernel_scalar_create( + graph, I32, &activation ); + node_params[SCALAR_R_ACTIVATION] = vsi_nn_kernel_scalar_create( + graph, I32, &recurrent_activation ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _GRUCELL_RESET_AFTER_ACTIVATION_PARAM_NUM ); + + vsi_nn_kernel_scalar_release( &node_params[SCALAR_ACTIVATION] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_R_ACTIVATION] ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( grucell_reset_after_activation, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/prelu_cpu.c b/src/tim/vx/internal/src/kernel/cpu/prelu_cpu.c index 94b64d6..eff26ed 100644 --- a/src/tim/vx/internal/src/kernel/cpu/prelu_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/prelu_cpu.c @@ -87,8 +87,8 @@ DEF_KERNEL_EXECUTOR(_prelu_exec) attr[1] = vsi_nn_kernel_tensor_attr_create( tensors[1] ); attr[2] = vsi_nn_kernel_tensor_attr_create( tensors[2] ); - vsi_nn_kernel_tensor_attr_get_stride( attr[0], stride_size[0] ); - vsi_nn_kernel_tensor_attr_get_stride( attr[1], stride_size[1] ); + vsi_nn_shape_get_stride( attr[0]->shape->data, (vsi_size_t)attr[0]->shape->size, stride_size[0] ); + vsi_nn_shape_get_stride( attr[1]->shape->data, (vsi_size_t)attr[1]->shape->size, stride_size[1] ); out_elements = vsi_nn_kernel_tensor_attr_get_size( attr[2] ); diff --git a/src/tim/vx/internal/src/kernel/cpu/resize_bilinear_nhwc_cpu.c b/src/tim/vx/internal/src/kernel/cpu/resize_bilinear_nhwc_cpu.c new file mode 100644 index 0000000..adb0620 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cpu/resize_bilinear_nhwc_cpu.c @@ -0,0 +1,307 @@ +/**************************************************************************** +* +* Copyright (c) 2021 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) +#define _KERNEL_NAME CVIVANTE_NAMESPACE("cpu.resize_bilinear_nhwc") + + +/* + * Kernel params + */ +static vx_param_description_t _resize_bilinear_nhwc_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _RESIZE_BILINEAR_NHWC_PARAM_NUM _cnt_of_array( _resize_bilinear_nhwc_kernel_param_def ) + +#define SCALAR_ALIGN_CORNERS (2) +#define SCALAR_HALF_PIXEL (3) + +/* + * Kernel function + */ +DEF_KERNEL_EXECUTOR(_compute) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t input[_INPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; + float *f32_in_buffer[_INPUT_NUM] = {NULL}; + float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; + vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; + vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; + vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_elements[_OUTPUT_NUM] = {0}; + vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; + uint32_t i; + int32_t align_corners; + int32_t half_pixel_centers; + float width_scale; + float height_scale; + vsi_size_t input_width, output_width, input_height, output_height; + vsi_size_t b = 0, d = 0, w = 0, h = 0; + vsi_size_t output_depth, input_depth; + vsi_size_t output_batch; + vsi_size_t output_dims; + float data00 = .0f, data01 = .0f, data10 = .0f, data11 = .0f, interpolation = .0f; + vsi_size_t input_width_orig; + vsi_size_t output_width_orig; + vsi_size_t index; + + /* prepare data */ + for(i = 0; i < _INPUT_NUM; i ++) + { + input[i] = (vsi_nn_kernel_tensor_t)param[i]; + in_attr[i] = vsi_nn_kernel_tensor_attr_create( input[i] ); + f32_in_buffer[i] = (float*)vsi_nn_kernel_tensor_create_buffer( input[i], in_attr[i], TRUE ); + CHECK_PTR_FAIL_GOTO( f32_in_buffer[i], "Create input0 buffer fail.", final ); + } + for(i = 0; i < _OUTPUT_NUM; i ++) + { + output[i] = (vsi_nn_kernel_tensor_t)param[i + _INPUT_NUM]; + out_attr[i] = vsi_nn_kernel_tensor_attr_create( output[i] ); + vsi_nn_kernel_tensor_attr_get_stride( out_attr[i], out_stride_size[i] ); + out_elements[i] = vsi_nn_kernel_tensor_attr_get_size( out_attr[i] ); + out_bytes[i] = out_elements[i] * sizeof(float); + f32_out_buffer[i] = (float *)malloc( out_bytes[i] ); + CHECK_PTR_FAIL_GOTO( f32_out_buffer[i], "Create output buffer fail.", final ); + memset( f32_out_buffer[i], 0, out_bytes[i] ); + } + + vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_ALIGN_CORNERS], &(align_corners)); + vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_HALF_PIXEL], &(half_pixel_centers)); + input_width = in_attr[0]->shape->data[1]; + input_height = in_attr[0]->shape->data[2]; + output_width = out_attr[0]->shape->data[1]; + output_height = out_attr[0]->shape->data[2]; + output_dims = 
(vsi_size_t)out_attr[0]->shape->size; + output_depth = out_attr[0]->shape->data[0]; + output_batch = output_dims > 3 ? out_attr[0]->shape->data[3] : 1; + input_depth = in_attr[0]->shape->data[0]; + input_width_orig = input_width; + output_width_orig = output_width; + + if (align_corners && output_width > 1) + { + width_scale = ((vx_float32)(input_width - 1) * 1.0f) / (vx_float32)(output_width - 1); + } + else + { + width_scale = ((vx_float32)input_width * 1.0f) / (vx_float32)output_width; + } + + if (align_corners && output_height > 1) + { + height_scale = ((vx_float32)(input_height - 1) * 1.0f) / (vx_float32)(output_height - 1); + } + else + { + height_scale = ((vx_float32)input_height * 1.0f) / (vx_float32)output_height; + } + + for (b = 0; b < output_batch; b ++) + { + vsi_ssize_t input_base = b * input_depth * input_width_orig * input_height; + vsi_ssize_t output_base = b * output_depth * output_width_orig * output_height; + + for (h = 0; h < output_height; h++) + { + vx_float32 input_h = h * height_scale; + vsi_size_t h0; + vsi_size_t h1; + + if (half_pixel_centers) + { + input_h = ((vx_float32)h + 0.5f) * height_scale - 0.5f; + } + else + { + input_h = h * height_scale; + } + h0 = (vsi_size_t)input_h; + h1 = input_h < 0 ? 0 : vsi_nn_min(h0 + 1, input_height - 1); + for (w = 0; w < output_width; w++) + { + vx_float32 input_w; + vsi_ssize_t w0; + vsi_ssize_t w1; + if (half_pixel_centers) + { + input_w = ((vx_float32)w + 0.5f) * width_scale - 0.5f; + } + else + { + input_w = w * width_scale; + } + w0 = (vsi_ssize_t)input_w; + w1 = input_w < 0 ? 0 : vsi_nn_min(w0 + 1, (vsi_ssize_t)(input_width - 1)); + + for (d = 0; d < output_depth; d++) + { + index = input_base + h0 * input_width_orig * input_depth + w0 * input_depth + d; + data00 = f32_in_buffer[0][index]; + index = input_base + h0 * input_width_orig * input_depth + w1 * input_depth + d; + data01 = f32_in_buffer[0][index]; + index = input_base + h1 * input_width_orig * input_depth + w0 * input_depth + d; + data10 = f32_in_buffer[0][index]; + index = input_base + h1 * input_width_orig * input_depth + w1 * input_depth + d; + data11 = f32_in_buffer[0][index]; + + interpolation = data00 * (1 - (input_h - h0)) * (1 - (input_w - w0)) + + data10 * (input_h - h0) * (1 - (input_w - w0)) + + data01 * (1 - (input_h - h0)) * (input_w - w0) + + data11 * (input_h - h0) * (input_w - w0); + index = output_base + h * output_width_orig * output_depth + w * output_depth + d; + f32_out_buffer[0][index] = interpolation; + } + } + } + } + + /* save data */ + for (i = 0; i < _OUTPUT_NUM; i++) + { + status = vsi_nn_kernel_tensor_write_from_float( output[i], out_attr[i], + f32_out_buffer[i], out_elements[i] ); + CHECK_STATUS_FAIL_GOTO( status, final ); + } + +final: + for (i = 0; i < _INPUT_NUM; i++) + { + if (f32_in_buffer[i]) + { + free(f32_in_buffer[i]); + f32_in_buffer[i] = NULL; + } + if (in_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &in_attr[i] ); + } + } + for (i = 0; i < _OUTPUT_NUM; i++) + { + if (f32_out_buffer[i]) + { + free(f32_out_buffer[i]); + f32_out_buffer[i] = NULL; + } + if (out_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &out_attr[i] ); + } + } + + return status; +} /* _compute() */ + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs + ) +{ + vsi_status status = VSI_FAILURE; + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", _KERNEL_NAME ); + kernel->info.function = _compute; + 
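+    /* Descriptive note (editorial): no initializer is registered for this CPU
+     * reference kernel. The executor above reads inputs as float32 via
+     * vsi_nn_kernel_tensor_create_buffer and writes results back with
+     * vsi_nn_kernel_tensor_write_from_float, so the kernel name, the executor
+     * and the fixed parameter table below are all the runtime needs here. */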
kernel->info.parameters = _resize_bilinear_nhwc_kernel_param_def; + kernel->info.numParams = _cnt_of_array( _resize_bilinear_nhwc_kernel_param_def ); + status = VSI_SUCCESS; + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_RESIZE_BILINEAR_NHWC_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" ); + int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" ); + + status = _query_kernel( kernel, inputs, outputs ); + if( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _RESIZE_BILINEAR_NHWC_PARAM_NUM, + inputs, input_num, outputs, output_num ); + node_params[SCALAR_ALIGN_CORNERS] = vsi_nn_kernel_scalar_create( graph, I32, &align_corners ); + node_params[SCALAR_HALF_PIXEL] = vsi_nn_kernel_scalar_create( graph, I32, &half_pixel_centers ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _RESIZE_BILINEAR_NHWC_PARAM_NUM ); + VSI_ASSERT( status == VSI_SUCCESS ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_ALIGN_CORNERS] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_HALF_PIXEL] ); + } + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CPU( resize_bilinear_nhwc, _setup ) diff --git a/src/tim/vx/internal/src/kernel/cpu/topk_cpu.c b/src/tim/vx/internal/src/kernel/cpu/topk_cpu.c index 78c7752..303b3fb 100644 --- a/src/tim/vx/internal/src/kernel/cpu/topk_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/topk_cpu.c @@ -60,19 +60,6 @@ static vx_param_description_t _topk_kernel_param_def[] = }; #define _TOPK_PARAM_NUM _cnt_of_array( _topk_kernel_param_def ) -static uint32_t _max_comp_func(void* data, int32_t left, int32_t right) -{ - float* fdata = (float*)data; - if (fdata[left] >= fdata[right]) - { - return TRUE; - } - else - { - return FALSE; - } -} - static void _find_top_k_1d ( float* input, @@ -81,37 +68,35 @@ static void _find_top_k_1d float* value, uint32_t* indices ) -{ - int32_t low = 0; - int32_t high = input_len - 1; - int32_t j; - - for (j = 0; j < (int32_t)input_len; j++) +{ // Insertion sort + float insert_elem; + uint32_t position,index=0; + uint32_t i, j; + for (i = 0; i < input_len; i++) { - indices[j] = j; - } - - j = vsi_nn_partition(input, low, high, _max_comp_func, FALSE, indices); - - //part_sort - while (j != (int32_t)k) - { - if ((int32_t)k > j) + insert_elem = input[i]; + // Record the position of the target element, + // and start traversing from this position forward + position = i; + index = position; + // Traverse forward from position to find the insertion position of the target element + while (position > 0 && input[position - 1] < insert_elem) { - low = j + 1; + // The element at position moves one position backward, index will also move with it + input[position] = input[position - 1]; + indices[position] = indices[position - 1]; + position--; } - else + // Insert and record the final position + if (position != i) { - high = j; + input[position] = insert_elem; } - j = vsi_nn_partition(input, low, high, _max_comp_func, FALSE, indices); + 
indices[position] = index; } - //all_sort - vsi_nn_partition(input, 0, k - 1, _max_comp_func, TRUE, indices); - - for (j = 0; j < (int32_t)k; j++) + for (j = 0; j < k; j++) { - value[j] = input[indices[j]]; + value[j] = input[j]; } } @@ -138,7 +123,7 @@ DEF_KERNEL_EXECUTOR(_compute) uint32_t i = 0; int32_t j = 0; int32_t top_k = 0; - uint32_t block_num = 0; + uint32_t block_num = 1; uint32_t block_size = 0; uint32_t * indices_ptr = NULL; @@ -166,7 +151,11 @@ DEF_KERNEL_EXECUTOR(_compute) status = vsi_nn_kernel_scalar_read_int32( param[3], &top_k ); CHECK_STATUS_FAIL_GOTO(status, final ); - block_num = (uint32_t)in_attr[0]->shape->data[1]; + for(i = (uint32_t)in_attr[0]->shape->size - 1; i > 0; i--) + { + block_num = block_num * (uint32_t)in_attr[0]->shape->data[i]; + } + block_size = (uint32_t)in_attr[0]->shape->data[0]; indices_ptr = (uint32_t*)malloc(block_size * sizeof(uint32_t)); CHECK_PTR_FAIL_GOTO( indices_ptr, "Create indices buffer fail.", final ); diff --git a/src/tim/vx/internal/src/kernel/evis/add_mean_std_norm_evis.c b/src/tim/vx/internal/src/kernel/evis/add_mean_std_norm_evis.c index 74dfc35..679a07d 100644 --- a/src/tim/vx/internal/src/kernel/evis/add_mean_std_norm_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/add_mean_std_norm_evis.c @@ -375,7 +375,6 @@ final: vsi_nn_kernel_tensor_attr_release(&output_attr); } return status; - } /* _add_mean_std_norm_initializer() */ @@ -433,7 +432,6 @@ static vsi_status _query_kernel } return status; - } /* _query_kernel() */ @@ -467,7 +465,7 @@ static vsi_nn_kernel_node_t _setup border.constant_value.U8 = 0; if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) { - border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; + border.constant_value.U8 = (uint8_t)vsi_nn_get_tensor_zero_point(inputs[0]); } status = vsi_nn_kernel_node_set_border( node, &border ); VSI_ASSERT( status == VSI_SUCCESS ); @@ -484,10 +482,8 @@ static vsi_nn_kernel_node_t _setup } return node; - } /* _setup() */ __END_DECLS REGISTER_BACKEND_EVIS( add_mean_std_norm, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/evis/conv1d_ovxlib_evis.c b/src/tim/vx/internal/src/kernel/evis/conv1d_ovxlib_evis.c index a52e76a..d74b7be 100644 --- a/src/tim/vx/internal/src/kernel/evis/conv1d_ovxlib_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/conv1d_ovxlib_evis.c @@ -699,4 +699,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_EVIS( conv1d_ovxlib, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c b/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c index 8888e15..feab3a0 100644 --- a/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c @@ -783,7 +783,7 @@ static vsi_nn_kernel_node_t _setup if (VSI_NN_TYPE_UINT8 == inputs[0]->attr.dtype.vx_type && VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC == inputs[0]->attr.dtype.qnt_type) { - border.constant_value.U8 = (uint8_t)inputs[0]->attr.dtype.zero_point; + border.constant_value.U8 = (uint8_t)vsi_nn_get_tensor_zero_point(inputs[0]); } else { @@ -835,4 +835,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_EVIS( depthwise_conv1d, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c b/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c index e78d9a9..1b99cb1 100644 --- a/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c @@ -336,10 +336,12 @@ static vx_param_description_t 
kernel_param_def[] = {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, }; #define INPUT_FUNC_TYPE (2) #define INPUT_SCALAR_ALPHA (3) +#define INPUT_SCALAR_BETA (4) #define _CL_PARAM_NUM _cnt_of_array(kernel_param_def) /* @@ -368,6 +370,7 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer) float outputScale = 1.0f; float outputZP = 0; float alpha = 0; + float beta = 0; uint32_t pack_key; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); @@ -379,6 +382,8 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer) CHECK_STATUS_FAIL_GOTO(status, final ); status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[INPUT_SCALAR_ALPHA], &alpha); CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[INPUT_SCALAR_BETA], &beta); + CHECK_STATUS_FAIL_GOTO(status, final ); out_shape = attr[1]->shape; @@ -487,6 +492,8 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer) "uniExtractOddData_2x8", &uniExtractOddData_2x8 ); status |= vsi_nn_kernel_gpu_add_param( node, "alpha", &alpha ); + status |= vsi_nn_kernel_gpu_add_param( node, + "beta", &beta ); CHECK_STATUS_FAIL_GOTO(status, final ); } break; @@ -547,6 +554,8 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer) "outputZP", &outputZP ); status |= vsi_nn_kernel_gpu_add_param( node, "alpha", &alpha ); + status |= vsi_nn_kernel_gpu_add_param( node, + "beta", &beta ); if (attr[1]->dtype == F16) { @@ -638,6 +647,7 @@ static vsi_nn_kernel_node_t _setup vsi_size_t new_rank = 0; vsi_bool ret = FALSE; float alpha = vsi_nn_kernel_param_get_float32( params, "alpha" ); + float beta = vsi_nn_kernel_param_get_float32( params, "beta" ); ret = vsi_nn_kernel_optimize_element_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num, @@ -670,6 +680,8 @@ static vsi_nn_kernel_node_t _setup graph, I32, &unary_type ); node_params[INPUT_SCALAR_ALPHA] = vsi_nn_kernel_scalar_create( graph, F32, &alpha ); + node_params[INPUT_SCALAR_BETA] = vsi_nn_kernel_scalar_create( + graph, F32, &beta ); /* Pass parameters to node. 
*/ status = vsi_nn_kernel_node_pass_param( node, node_params, _CL_PARAM_NUM ); @@ -698,6 +710,11 @@ OnError: vsi_nn_kernel_scalar_release( &node_params[INPUT_SCALAR_ALPHA] ); } + if (node_params[INPUT_SCALAR_BETA]) + { + vsi_nn_kernel_scalar_release( &node_params[INPUT_SCALAR_BETA] ); + } + return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/evis/gather_evis.c b/src/tim/vx/internal/src/kernel/evis/gather_evis.c index d49d92d..e5b12f7 100644 --- a/src/tim/vx/internal/src/kernel/evis/gather_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/gather_evis.c @@ -289,7 +289,8 @@ DEF_KERNEL_INITIALIZER(_gather_initializer) } shaderParam.global_scale[0] = 16; - if (attr[0]->dtype == I16 || attr[0]->dtype == F16) + if (attr[0]->dtype == I16 || attr[0]->dtype == F16 || + attr[0]->dtype == BF16 || attr[0]->dtype == U16) { shaderParam.global_scale[0] = 8; } diff --git a/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c b/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c index c206930..78e9efe 100644 --- a/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c @@ -51,6 +51,7 @@ __BEGIN_DECLS typedef enum { + _error = -1, _1D = 0, _2D, _3D @@ -168,6 +169,10 @@ static vsi_status get_gather_nd_tensor_reshape_size sizes[0] = block_size; sizes[1] = elementCnt / block_size; } + else if(coordDim == 4) + { + newDim[0] = 3; + } status = VSI_SUCCESS; } @@ -381,7 +386,7 @@ static vsi_status _query_kernel vsi_status status = VSI_FAILURE; vsi_nn_kernel_dtype_e input0_dtype = U8; vsi_nn_kernel_dtype_e output_dtype = U8; - vsi_nn_kernel_coord_type_e coord_type = _1D; + vsi_nn_kernel_coord_type_e coord_type = _error; uint32_t key = 0; int i = 0; @@ -404,7 +409,7 @@ static vsi_status _query_kernel { coord_type = _2D; } - else if(coord_dim == 3) + else if(coord_dim == 3 || coord_dim == 4) { coord_type = _3D; } diff --git a/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c b/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c index f70df19..2894f11 100644 --- a/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c @@ -56,6 +56,9 @@ typedef enum #define KERNEL_SOURCE_3 "group_normalization_i16" #define KERNEL_SOURCE_4 "group_normalization_f16" #define KERNEL_SOURCE_5 "group_normalization_u8_f16" +#define KERNEL_SOURCE_6 "group_normalization_i8_scale" +#define KERNEL_SOURCE_7 "group_normalization_i16_scale" +#define KERNEL_SOURCE_8 "group_normalization_f16_scale" #define HASH_GROUPNORM_SUM_SQR_SH_KERNEL_NAME(SRC0_TYPE) \ CVIVANTE_NAMESPACE("evis.group_norm_sumsqr_"#SRC0_TYPE) @@ -72,6 +75,12 @@ typedef enum #define HASH_GROUPNORM_SH_KERNEL_2D_NAME(SRC0_TYPE, DST_TYPE) \ CVIVANTE_NAMESPACE("evis.group_norm_"#SRC0_TYPE"to"#DST_TYPE"_2D") +#define HASH_GROUPNORM_SCALE_SH_KERNEL_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.group_norm_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE) + +#define HASH_GROUPNORM_SCALE_SH_KERNEL_2D_NAME(SRC0_TYPE, SRC1_TYPE, DST_TYPE) \ + CVIVANTE_NAMESPACE("evis.group_norm_"#SRC0_TYPE#SRC1_TYPE"to"#DST_TYPE"_2D") + // Add kernel hashtable here // Sum Sqr #define HASH_GROUPNORM_SUM_SQR_KEY(_input0_type, _output_type, _reshape_flag) \ @@ -96,19 +105,29 @@ typedef enum SOURCE }, // normalization -#define HASH_GROUPNORM_KEY(_input0_type, _output_type, _reshape_flag) \ - ((_input0_type << 24) | (_output_type << 16) | (_reshape_flag << 8)) +#define HASH_GROUPNORM_KEY(_input0_type, _input1_type, _output_type, 
_reshape_flag) \ + ((_input0_type << 24) | (_input1_type << 16) | (_output_type << 8) | (_reshape_flag << 4)) #define TENSOR_GROUPNORM_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_GROUPNORM_KEY(IN0_TYPE, OUT_TYPE, 0), \ + { HASH_GROUPNORM_KEY(IN0_TYPE, F16, OUT_TYPE, 0), \ HASH_GROUPNORM_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \ SOURCE }, #define TENSOR_GROUPNORM_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_GROUPNORM_KEY(IN0_TYPE, OUT_TYPE, 1), \ + { HASH_GROUPNORM_KEY(IN0_TYPE, F16, OUT_TYPE, 1), \ HASH_GROUPNORM_SH_KERNEL_2D_NAME(IN0_TYPE, OUT_TYPE), \ SOURCE }, +#define TENSOR_GROUPNORM_SCALE_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \ + { HASH_GROUPNORM_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 0), \ + HASH_GROUPNORM_SCALE_SH_KERNEL_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE), \ + SOURCE }, + +#define TENSOR_GROUPNORM_SCALE_KERNELS_2D(IN0_TYPE, IN1_TYPE, OUT_TYPE, SOURCE) \ + { HASH_GROUPNORM_KEY(IN0_TYPE, IN1_TYPE, OUT_TYPE, 1), \ + HASH_GROUPNORM_SCALE_SH_KERNEL_2D_NAME(IN0_TYPE, IN1_TYPE, OUT_TYPE), \ + SOURCE }, + typedef struct { uint32_t key; @@ -157,6 +176,26 @@ static const _kernel_map_type _groupnorm_kernel_map[] = TENSOR_GROUPNORM_KERNELS_2D( F16, F16, KERNEL_SOURCE_4 ) TENSOR_GROUPNORM_KERNELS( F16, U8, KERNEL_SOURCE_4 ) TENSOR_GROUPNORM_KERNELS_2D( F16, U8, KERNEL_SOURCE_4 ) + + TENSOR_GROUPNORM_SCALE_KERNELS( U8, F32, U8, KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_SCALE_KERNELS_2D( U8, F32, U8, KERNEL_SOURCE_2 ) + TENSOR_GROUPNORM_SCALE_KERNELS( U8, F32, F16, KERNEL_SOURCE_5 ) + TENSOR_GROUPNORM_SCALE_KERNELS_2D( U8, F32, F16, KERNEL_SOURCE_5 ) + + TENSOR_GROUPNORM_SCALE_KERNELS( I8, F32, I8, KERNEL_SOURCE_6 ) + TENSOR_GROUPNORM_SCALE_KERNELS_2D( I8, F32, I8, KERNEL_SOURCE_6 ) + TENSOR_GROUPNORM_SCALE_KERNELS( I8, F32, F16, KERNEL_SOURCE_6 ) + TENSOR_GROUPNORM_SCALE_KERNELS_2D( I8, F32, F16, KERNEL_SOURCE_6 ) + + TENSOR_GROUPNORM_SCALE_KERNELS( I16, F32, I16, KERNEL_SOURCE_7 ) + TENSOR_GROUPNORM_SCALE_KERNELS_2D( I16, F32, I16, KERNEL_SOURCE_7 ) + TENSOR_GROUPNORM_SCALE_KERNELS( I16, F32, F16, KERNEL_SOURCE_7 ) + TENSOR_GROUPNORM_SCALE_KERNELS_2D( I16, F32, F16, KERNEL_SOURCE_7 ) + + TENSOR_GROUPNORM_SCALE_KERNELS( F16, F32, U8, KERNEL_SOURCE_8 ) + TENSOR_GROUPNORM_SCALE_KERNELS_2D( F16, F32, U8, KERNEL_SOURCE_8 ) + TENSOR_GROUPNORM_SCALE_KERNELS( F16, F32, F16, KERNEL_SOURCE_8 ) + TENSOR_GROUPNORM_SCALE_KERNELS_2D( F16, F32, F16, KERNEL_SOURCE_8 ) }; /* @@ -483,7 +522,7 @@ DEF_KERNEL_INITIALIZER(_groupnorm_initializer) {0, 0, 0}, // localWorkSize: local group size in thread {0, 0, 0}}; // globalWorkSize: image size in thread - vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL}; + vsi_nn_kernel_tensor_attr_t* attr[4] = {NULL, NULL, NULL, NULL}; vsi_size_array_t * input_shape = NULL; float scaleIn = 1.0f; float scaleOut = 1.0f; @@ -501,6 +540,8 @@ DEF_KERNEL_INITIALIZER(_groupnorm_initializer) CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); attr[2] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[4] ); CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", OnError ); + attr[3] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); + CHECK_PTR_FAIL_GOTO( attr[3], "Create tensor attr buffer fail.", OnError ); status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &is2D); CHECK_STATUS_FAIL_GOTO(status, OnError ); @@ -735,8 +776,14 @@ DEF_KERNEL_INITIALIZER(_groupnorm_initializer) pack_key = _PACK_SELECT_KEY( attr[0]->dtype, attr[2]->dtype ); - status = vsi_nn_kernel_gpu_add_param(node, "height", 
&height); - status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", &UniFP16toFP32Lo4_dp4x4); + if (attr[3]->dtype != F32) + { + status = vsi_nn_kernel_gpu_add_param(node, "height", &height); + } + if (!(attr[3]->dtype == F32 && (attr[0]->dtype == I16 || attr[0]->dtype == I8))) + { + status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", &UniFP16toFP32Lo4_dp4x4); + } CHECK_STATUS_FAIL_GOTO(status, OnError ); switch( pack_key ) @@ -865,6 +912,11 @@ OnError: vsi_nn_kernel_tensor_attr_release( &attr[2] ); attr[2] = NULL; } + if (attr[3]) + { + vsi_nn_kernel_tensor_attr_release( &attr[3] ); + attr[3] = NULL; + } return status; } @@ -1001,6 +1053,7 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t tmp_node = NULL, tmp_node1 = NULL; vsi_nn_kernel_node_t node = NULL; vsi_nn_kernel_dtype_e in0_dtype = U8; + vsi_nn_kernel_dtype_e in2_dtype = F16; vsi_nn_kernel_dtype_e out_dtype = U8; vsi_nn_tensor_attr_t attr; vsi_nn_kernel_t * ikernels[INTERNAL_KERNEL_SIZE] = { NULL }; @@ -1040,11 +1093,12 @@ static vsi_nn_kernel_node_t _setup } in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + in2_dtype = vsi_nn_kernel_map_dtype( inputs[2]->attr.dtype.vx_type ); out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); hashkeys[SUM_SQR_INDEX]= HASH_GROUPNORM_SUM_SQR_KEY( in0_dtype, F32, is2D_flg ); hashkeys[MEAN_VARI_INDEX]= HASH_GROUPNORM_MEAN_VARI_KEY( F32, F32 ); - hashkey = HASH_GROUPNORM_KEY( in0_dtype, out_dtype, is2D_flg ); + hashkey = HASH_GROUPNORM_KEY( in0_dtype, in2_dtype, out_dtype, is2D_flg ); status = _query_kernel( ikernels[SUM_SQR_INDEX], hashkeys[SUM_SQR_INDEX], INTERNAL_KERNEL_SUM_SQR ); if ( VSI_SUCCESS != status ) @@ -1104,7 +1158,7 @@ static vsi_nn_kernel_node_t _setup border.constant_value.U16 = 0; if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) { - border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; + border.constant_value.U8 = (uint8_t)vsi_nn_get_tensor_zero_point(inputs[0]); } status = vxSetNodeAttribute( (vx_node)tmp_node, VX_NODE_BORDER, &border, sizeof(border) ); CHECK_STATUS(status); @@ -1134,7 +1188,7 @@ static vsi_nn_kernel_node_t _setup border.constant_value.U16 = 0; if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) { - border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; + border.constant_value.U8 = (uint8_t)vsi_nn_get_tensor_zero_point(inputs[0]); } status = vxSetNodeAttribute( (vx_node)tmp_node1, VX_NODE_BORDER, &border, sizeof(border) ); CHECK_STATUS(status); @@ -1177,7 +1231,7 @@ static vsi_nn_kernel_node_t _setup border.constant_value.U16 = 0; if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) { - border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; + border.constant_value.U8 = (uint8_t)vsi_nn_get_tensor_zero_point(inputs[0]); } status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); CHECK_STATUS(status); @@ -1216,4 +1270,3 @@ final: __END_DECLS REGISTER_BACKEND_EVIS( group_norm, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/evis/grucell_activation_z_h_evis.c b/src/tim/vx/internal/src/kernel/evis/grucell_activation_z_h_evis.c new file mode 100644 index 0000000..69057be --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/grucell_activation_z_h_evis.c @@ -0,0 +1,382 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this 
software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +typedef enum _grucell_nn_activation_type_e +{ + SIGMOID = VSI_NN_ACT_SIGMOID, + HARD_SIGMOID = VSI_NN_ACT_HARD_SIGMOID, +}grucell_nn_activation_type_e; + +#define _GRUCELL_ACTIVATION_Z_H_KERNEL_SOURCE "grucell_activation_z_h" + +// Add kernel hashtable here +#define GRUCELL_ACTIVATION_Z_H_HASH_KEY( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT ) \ + (( HSTATE_DTYPE ) | ( IN_FC_DTYPE << 6 ) | ( OUT_TYPE << 12 ) | ( REC_ACT << 18 )) + +#define PACK_KERNEL_MAP( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT ) \ + { GRUCELL_ACTIVATION_Z_H_HASH_KEY( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT ), \ + CVIVANTE_NAMESPACE("evis.grucell_activation_z_h_"#HSTATE_DTYPE"_"#IN_FC_DTYPE"to"#OUT_TYPE"_"#REC_ACT), \ + _GRUCELL_ACTIVATION_Z_H_KERNEL_SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _grucell_activation_z_h_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( U8, F16, U8, SIGMOID ), + PACK_KERNEL_MAP( I8, F16, I8, SIGMOID ), + PACK_KERNEL_MAP( I16, F16, I16, SIGMOID ), + PACK_KERNEL_MAP( F16, F16, F16, SIGMOID ), +}; + +/* + * Kernel params + */ +static vx_param_description_t _grucell_activation_z_h_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _GRUCELL_ACTIVATION_Z_H_PARAM_NUM _cnt_of_array( _grucell_activation_z_h_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_grucell_activation_z_h_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 2, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_status status = VSI_FAILURE; + 
vsi_nn_kernel_tensor_t hstate_out = NULL; + vsi_nn_kernel_tensor_t output = NULL; + float hstate_in_scale = 1.0f; + float hstate_in_tail = 0; + float output_scale = 1.0f; + float output_zp = 0; + uint32_t i = 0; + uint32_t pack_key = 0; + vsi_nn_kernel_tensor_attr_t* input_attr[GRUCELL_ACT_Z_H_IN_CNT] = {NULL}; + vsi_nn_kernel_tensor_attr_t* output_attr[2] = {NULL}; +#define _PACK_SELECT_KEY( hstate_type, fc_type, output_type ) \ + (hstate_type | (fc_type << 8) | (output_type << 16)) + + output = (vsi_nn_kernel_tensor_t)param[GRUCELL_ACT_Z_H_IN_CNT + GRUCELL_ACT_Z_H_OUT_OUTPUT]; + hstate_out = (vsi_nn_kernel_tensor_t)param[GRUCELL_ACT_Z_H_IN_CNT + GRUCELL_ACT_Z_H_OUT_HSTATE]; + + for (i = 0; i < GRUCELL_ACT_Z_H_IN_CNT; i++) + { + input_attr[i] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[i] ); + CHECK_PTR_FAIL_GOTO( input_attr[i], "Create tensor attr buffer fail.", final ); + } + + output_attr[0] = vsi_nn_kernel_tensor_attr_create( output ); + CHECK_PTR_FAIL_GOTO( output_attr[0], "Create tensor attr buffer fail.", final ); + output_attr[1] = vsi_nn_kernel_tensor_attr_create( hstate_out ); + CHECK_PTR_FAIL_GOTO( output_attr[1], "Create tensor attr buffer fail.", final ); + + if ( VSI_NN_KERNEL_QUANT_DFP == input_attr[0]->quant ) + { + int8_t srcFixPointPos = (int8_t)input_attr[0]->dfp.fl; + if (srcFixPointPos >= 0) + hstate_in_scale *= 1.0f / (vx_float32) ((int64_t)1 << srcFixPointPos); + else if (srcFixPointPos < 0) + hstate_in_scale *= (vx_float32)((int64_t)1 << -srcFixPointPos); + } + else if ( VSI_NN_KERNEL_QUANT_ASYMM == input_attr[0]->quant ) + { + hstate_in_scale = input_attr[0]->asymm.scale; + hstate_in_tail = -(float)input_attr[0]->asymm.zero_point * hstate_in_scale; + } + + if ( VSI_NN_KERNEL_QUANT_DFP == output_attr[0]->quant ) + { + int8_t srcFixPointPos = (int8_t)input_attr[0]->dfp.fl; + if (srcFixPointPos >= 0) + output_scale *= (vx_float32)((int64_t)1 << srcFixPointPos); + else if (srcFixPointPos < 0) + output_scale *= 1.0f / (vx_float32) ((int64_t)1 << -srcFixPointPos); + } + else if ( VSI_NN_KERNEL_QUANT_ASYMM == output_attr[0]->quant ) + { + output_scale = 1.0f / output_attr[0]->asymm.scale; + output_zp = (float)output_attr[0]->asymm.zero_point; + } + + pack_key = _PACK_SELECT_KEY( input_attr[0]->dtype, input_attr[1]->dtype, output_attr[0]->dtype); + + gpu_param.global_scale[0] = 4; + gpu_param.global_scale[1] = 1; + gpu_param.global_size[0] = gpu_align_p2((output_attr[1]->shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (output_attr[1]->shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + + switch (pack_key) + { + case _PACK_SELECT_KEY(F16, F16, F16): + { + gpu_dp_inst_t uniExtractH4_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniF16PlusF16_0_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00110000, 0x00330022, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000, + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvertF16_0_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 
0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractH4_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniF16PlusF16_0_4x4", &uniF16PlusF16_0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertF16_0_4x4", &uniConvertF16_0_4x4); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + case _PACK_SELECT_KEY(U8, F16, U8): + case _PACK_SELECT_KEY(I8, F16, I8): + case _PACK_SELECT_KEY(I16, F16, I16): + { + gpu_dp_inst_t uniExtractInteger_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniF16PlusF16_0_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00110000, 0x00330022, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000, + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvertF16_0_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractInteger_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniF16PlusF16_0_4x4", &uniF16PlusF16_0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertF16_0_4x4", &uniConvertF16_0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "hstate_in_scale", &hstate_in_scale); + status |= vsi_nn_kernel_gpu_add_param(node, "hstate_in_tail", &hstate_in_tail); + status |= vsi_nn_kernel_gpu_add_param(node, "output_scale", &output_scale); + status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + default: + break; + } + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final ); +final: + for (i = 0; i < GRUCELL_ACT_Z_H_IN_CNT; i++) + { + if (input_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &input_attr[i] ); + } + } + if (output_attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &output_attr[0] ); + } + + if (output_attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &output_attr[1] ); + } + return status; +} /* _grucell_activation_z_h_initializer() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t recurrent_activation + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e hstate_dtype; + vsi_nn_kernel_dtype_e fc_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _grucell_activation_z_h_kernel_map; + size_t kernel_map_size = _cnt_of_array( _grucell_activation_z_h_kernel_map ); + vx_param_description_t * param_def = 
_grucell_activation_z_h_kernel_param_def; + vx_kernel_initialize_f initializer = _grucell_activation_z_h_initializer; + + uint32_t key; + uint32_t i; + + hstate_dtype = vsi_nn_kernel_map_dtype( inputs[GRUCELL_ACT_Z_H_HSTATE]->attr.dtype.vx_type ); + fc_dtype = vsi_nn_kernel_map_dtype( inputs[GRUCELL_ACT_Z_H_I_FC_Z]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[GRUCELL_ACT_Z_H_OUT_OUTPUT]->attr.dtype.vx_type ); + + key = GRUCELL_ACTIVATION_Z_H_HASH_KEY( hstate_dtype, fc_dtype, out_dtype, recurrent_activation ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _grucell_activation_z_h_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_GRUCELL_ACTIVATION_Z_H_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + int32_t activation = vsi_nn_kernel_param_get_int32( params, "activation" ); + int32_t recurrent_activation = vsi_nn_kernel_param_get_int32( params, "recurrent_activation" ); + + if( activation != VSI_NN_ACT_TANH ) + { + return NULL; + } + + status = _query_kernel( kernel, inputs, outputs, recurrent_activation ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _GRUCELL_ACTIVATION_Z_H_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _GRUCELL_ACTIVATION_Z_H_PARAM_NUM ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( grucell_activation_z_h, _setup ) diff --git a/src/tim/vx/internal/src/kernel/evis/grucell_h_times_activation_r_evis.c b/src/tim/vx/internal/src/kernel/evis/grucell_h_times_activation_r_evis.c new file mode 100644 index 0000000..5ba28e6 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/grucell_h_times_activation_r_evis.c @@ -0,0 +1,352 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. + */ +typedef enum _grucell_nn_activation_type_e +{ + SIGMOID = VSI_NN_ACT_SIGMOID, + HARD_SIGMOID = VSI_NN_ACT_HARD_SIGMOID, +}grucell_nn_activation_type_e; + +#define _GRUCELL_H_TIMES_ACTIVATION_R_KERNEL_SOURCE "grucell_h_times_activation_r" + +// Add kernel hashtable here +#define GRUCELL_H_TIMES_ACTIVATION_R_HASH_KEY( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT ) \ + (( HSTATE_DTYPE ) | ( IN_FC_DTYPE << 6 ) | ( OUT_TYPE << 12 ) | ( REC_ACT << 18 )) +#define PACK_KERNEL_MAP( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT ) \ + { GRUCELL_H_TIMES_ACTIVATION_R_HASH_KEY( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT ), \ +CVIVANTE_NAMESPACE("evis.grucell_h_times_activation_r_"#HSTATE_DTYPE"_"#IN_FC_DTYPE"to"#OUT_TYPE"_"#REC_ACT), \ +_GRUCELL_H_TIMES_ACTIVATION_R_KERNEL_SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _grucell_h_times_activation_r_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( U8, F16, F16, SIGMOID ), + PACK_KERNEL_MAP( I8, F16, F16, SIGMOID ), + PACK_KERNEL_MAP( I16, F16, F16, SIGMOID ), + PACK_KERNEL_MAP( F16, F16, F16, SIGMOID ), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _grucell_h_times_activation_r_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _GRUCELL_H_TIMES_ACTIVATION_R_PARAM_NUM _cnt_of_array( _grucell_h_times_activation_r_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_grucell_h_times_activation_r_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 2, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t output = NULL; + float hstate_in_scale = 1.0f; + float hstate_in_tail = 0; + uint32_t i = 0; + uint32_t pack_key = 0; + vsi_nn_kernel_tensor_attr_t* input_attr[2] = {NULL}; + vsi_nn_kernel_tensor_attr_t* output_attr[1] = {NULL}; +#define _PACK_SELECT_KEY( hstate_type, fc_type, output_type ) \ + (hstate_type | (fc_type << 8) | (output_type << 16)) + + output = (vsi_nn_kernel_tensor_t)param[3]; + + for (i = 0; i < 2; i++) + { + input_attr[i] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[i] ); + CHECK_PTR_FAIL_GOTO( input_attr[i], "Create tensor attr buffer fail.", final ); + } 
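+    /* Descriptive note (editorial): the quantization handling below mirrors the
+     * other grucell evis kernels. For dynamic fixed point the h_state dequantize
+     * scale is 2^-fl (a negative fl multiplies by 2^|fl| instead of dividing),
+     * and for asymmetric affine quantization real = scale * (q - zero_point),
+     * which is exactly what hstate_in_scale and hstate_in_tail encode. */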
+ + output_attr[0] = vsi_nn_kernel_tensor_attr_create( output ); + CHECK_PTR_FAIL_GOTO( output_attr[0], "Create tensor attr buffer fail.", final ); + + if ( VSI_NN_KERNEL_QUANT_DFP == input_attr[0]->quant ) + { + int8_t srcFixPointPos = (int8_t)input_attr[0]->dfp.fl; + if (srcFixPointPos >= 0) + hstate_in_scale *= 1.0f / (vx_float32) ((int64_t)1 << srcFixPointPos); + else if (srcFixPointPos < 0) + hstate_in_scale *= (vx_float32)((int64_t)1 << -srcFixPointPos); + } + else if ( VSI_NN_KERNEL_QUANT_ASYMM == input_attr[0]->quant ) + { + hstate_in_scale = input_attr[0]->asymm.scale; + hstate_in_tail = -(float)input_attr[0]->asymm.zero_point * hstate_in_scale; + } + + pack_key = _PACK_SELECT_KEY( input_attr[0]->dtype, input_attr[1]->dtype, output_attr[0]->dtype); + + gpu_param.global_scale[0] = 4; + gpu_param.global_scale[1] = 1; + gpu_param.global_size[0] = gpu_align_p2((input_attr[0]->shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (input_attr[0]->shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + + switch (pack_key) + { + case _PACK_SELECT_KEY(F16, F16, F16): + { + gpu_dp_inst_t uniExtractH4_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniF16PlusF16_0_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00110000, 0x00330022, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000, + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvertF16_0_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractH4_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniF16PlusF16_0_4x4", &uniF16PlusF16_0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertF16_0_4x4", &uniConvertF16_0_4x4); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + case _PACK_SELECT_KEY(U8, F16, F16): + case _PACK_SELECT_KEY(I8, F16, F16): + case _PACK_SELECT_KEY(I16, F16, F16): + { + gpu_dp_inst_t uniExtractH4_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniF16PlusF16_0_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00110000, 0x00330022, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000, + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvertF16_0_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 
0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractH4_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniF16PlusF16_0_4x4", &uniF16PlusF16_0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertF16_0_4x4", &uniConvertF16_0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "hstate_in_scale", &hstate_in_scale); + status |= vsi_nn_kernel_gpu_add_param(node, "hstate_in_tail", &hstate_in_tail); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + default: + break; + } + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final ); +final: + for (i = 0; i < 2; i++) + { + if (input_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &input_attr[i] ); + } + } + + if (output_attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &output_attr[0] ); + } + + return status; +} /* _grucell_h_times_activation_r_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t recurrent_activation + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e hstate_dtype; + vsi_nn_kernel_dtype_e fc_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _grucell_h_times_activation_r_kernel_map; + size_t kernel_map_size = _cnt_of_array( _grucell_h_times_activation_r_kernel_map ); + vx_param_description_t * param_def = _grucell_h_times_activation_r_kernel_param_def; + vx_kernel_initialize_f initializer = _grucell_h_times_activation_r_initializer; + + uint32_t key; + uint32_t i; + + hstate_dtype = vsi_nn_kernel_map_dtype( inputs[GRUCELL_ACT_H_STATE]->attr.dtype.vx_type ); + fc_dtype = vsi_nn_kernel_map_dtype( inputs[GRUCELL_ACT_I_FC_Z]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[GRUCELL_ACT_OUT_OUTPUT]->attr.dtype.vx_type ); + + key = GRUCELL_H_TIMES_ACTIVATION_R_HASH_KEY( hstate_dtype, fc_dtype, out_dtype, recurrent_activation ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _grucell_h_times_activation_r_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_GRUCELL_H_TIMES_ACTIVATION_R_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + int32_t recurrent_activation = vsi_nn_kernel_param_get_int32( params, "recurrent_activation" ); 
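+    /* Descriptive note (editorial): recurrent_activation is folded into the
+     * kernel hash key, and only SIGMOID variants are registered in the map
+     * above; any other recurrent activation leaves _query_kernel at
+     * VSI_FAILURE, so _setup returns NULL and no node is created. */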
+ + status = _query_kernel( kernel, inputs, outputs, recurrent_activation ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _GRUCELL_H_TIMES_ACTIVATION_R_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _GRUCELL_H_TIMES_ACTIVATION_R_PARAM_NUM ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( grucell_h_times_activation_r, _setup ) diff --git a/src/tim/vx/internal/src/kernel/evis/grucell_reset_after_activation_evis.c b/src/tim/vx/internal/src/kernel/evis/grucell_reset_after_activation_evis.c new file mode 100644 index 0000000..0c35aea --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/grucell_reset_after_activation_evis.c @@ -0,0 +1,389 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "libnnext/vx_lib_nnext.h" + +__BEGIN_DECLS + +/* + * Define kernel meta. 
+ */ +typedef enum _grucell_nn_activation_type_e +{ + SIGMOID = VSI_NN_ACT_SIGMOID, + HARD_SIGMOID = VSI_NN_ACT_HARD_SIGMOID, +}grucell_nn_activation_type_e; + +#define _GRUCELL_RESET_AFTER_ACTIVATION_KERNEL_SOURCE "grucell_reset_after_activation" + +// Add kernel hashtable here +#define GRUCELL_RESET_AFTER_ACTIVATION_HASH_KEY( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT ) \ + (( HSTATE_DTYPE ) | ( IN_FC_DTYPE << 6 ) | ( OUT_TYPE << 12 ) | ( REC_ACT << 18 )) +#define PACK_KERNEL_MAP( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT ) \ + { GRUCELL_RESET_AFTER_ACTIVATION_HASH_KEY( HSTATE_DTYPE, IN_FC_DTYPE, OUT_TYPE, REC_ACT ), \ +CVIVANTE_NAMESPACE("evis.grucell_reset_after_activation_"#HSTATE_DTYPE"_"#IN_FC_DTYPE"to"#OUT_TYPE"_"#REC_ACT), \ +_GRUCELL_RESET_AFTER_ACTIVATION_KERNEL_SOURCE } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _grucell_reset_after_activation_kernel_map[] = +{ + // Register kernel here + PACK_KERNEL_MAP( U8, F16, U8, SIGMOID ), + PACK_KERNEL_MAP( I8, F16, I8, SIGMOID ), + PACK_KERNEL_MAP( I16, F16, I16, SIGMOID ), + PACK_KERNEL_MAP( F16, F16, F16, SIGMOID ), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _grucell_reset_after_activation_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _GRUCELL_RESET_AFTER_ACTIVATION_PARAM_NUM _cnt_of_array( _grucell_reset_after_activation_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_grucell_reset_after_activation_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + gpu_param_t gpu_param = { + 2, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_tensor_t hstate_out = NULL; + vsi_nn_kernel_tensor_t output = NULL; + float hstate_in_scale = 1.0f; + float hstate_in_tail = 0; + float output_scale = 1.0f; + float output_zp = 0; + uint32_t i = 0; + uint32_t pack_key = 0; + vsi_nn_kernel_tensor_attr_t* input_attr[GRUCELL_ACT_IN_CNT] = {NULL}; + vsi_nn_kernel_tensor_attr_t* output_attr[2] = {NULL}; +#define _PACK_SELECT_KEY( hstate_type, fc_type, output_type ) \ + (hstate_type | (fc_type << 8) | (output_type << 16)) + + + output = (vsi_nn_kernel_tensor_t)param[GRUCELL_ACT_IN_CNT + GRUCELL_ACT_OUT_OUTPUT]; + hstate_out = (vsi_nn_kernel_tensor_t)param[GRUCELL_ACT_IN_CNT + GRUCELL_ACT_OUT_H_STATE]; + + for (i = 0; i < GRUCELL_ACT_IN_CNT; i++) + { + input_attr[i] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[i] ); + CHECK_PTR_FAIL_GOTO( input_attr[i], "Create tensor attr buffer fail.", final ); + } + + output_attr[0] = vsi_nn_kernel_tensor_attr_create( output ); + CHECK_PTR_FAIL_GOTO( output_attr[0], "Create tensor attr buffer fail.", final ); + output_attr[1] = vsi_nn_kernel_tensor_attr_create( hstate_out ); + CHECK_PTR_FAIL_GOTO( output_attr[1], "Create tensor attr buffer fail.", final ); + + if ( VSI_NN_KERNEL_QUANT_DFP == input_attr[0]->quant ) + { 
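+        /* Descriptive note (editorial): the h_state input is dynamic fixed
+         * point here, so it is dequantized with scale 2^-fl, where a negative
+         * fl multiplies by 2^|fl| instead of dividing. */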
+ int8_t srcFixPointPos = (int8_t)input_attr[0]->dfp.fl; + if (srcFixPointPos >= 0) + hstate_in_scale *= 1.0f / (vx_float32) ((int64_t)1 << srcFixPointPos); + else if (srcFixPointPos < 0) + hstate_in_scale *= (vx_float32)((int64_t)1 << -srcFixPointPos); + } + else if ( VSI_NN_KERNEL_QUANT_ASYMM == input_attr[0]->quant ) + { + hstate_in_scale = input_attr[0]->asymm.scale; + hstate_in_tail = -(float)input_attr[0]->asymm.zero_point * hstate_in_scale; + } + + if ( VSI_NN_KERNEL_QUANT_DFP == output_attr[0]->quant ) + { + int8_t srcFixPointPos = (int8_t)input_attr[0]->dfp.fl; + if (srcFixPointPos >= 0) + output_scale *= (vx_float32)((int64_t)1 << srcFixPointPos); + else if (srcFixPointPos < 0) + output_scale *= 1.0f / (vx_float32) ((int64_t)1 << -srcFixPointPos); + } + else if ( VSI_NN_KERNEL_QUANT_ASYMM == output_attr[0]->quant ) + { + output_scale = 1.0f / output_attr[0]->asymm.scale; + output_zp = (float)output_attr[0]->asymm.zero_point; + } + + pack_key = _PACK_SELECT_KEY( input_attr[0]->dtype, input_attr[1]->dtype, output_attr[0]->dtype); + + gpu_param.global_scale[0] = 4; + gpu_param.global_scale[1] = 1; + gpu_param.global_size[0] = gpu_align_p2((output_attr[1]->shape->data[0] + gpu_param.global_scale[0] - 1) + / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (output_attr[1]->shape->data[1] + gpu_param.global_scale[1] - 1) + / gpu_param.global_scale[1]; + + switch (pack_key) + { + case _PACK_SELECT_KEY(F16, F16, F16): + { + gpu_dp_inst_t uniExtractH4_2x8 = {{ + 0x11111111, // TCfg + 0x11110000, // ASelt + 0x06040200, 0x06040200, // ABin + 0x22222222, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00, + 0x00003c00, 0x00003c00, 0x00003c00, 0x00003c00 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniF16PlusF16_0_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00110000, 0x00330022, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000, + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvertF16_0_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractH4_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniF16PlusF16_0_4x4", &uniF16PlusF16_0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertF16_0_4x4", &uniConvertF16_0_4x4); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + case _PACK_SELECT_KEY(U8, F16, U8): + case _PACK_SELECT_KEY(I8, F16, I8): + case _PACK_SELECT_KEY(I16, F16, I16): + { + gpu_dp_inst_t uniExtractInteger_2x8 = {{ + 0x33333333, // TCfg + 0x11110000, // ASelt + 0x03020100, 0x03020100, // ABin + 0x00000000, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00002400, // AccumType, ConstantType, and PostShift + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniF16PlusF16_0_4x4 = {{ + 0x05050505, // TCfg + 0x04040404, // ASelt + 0x00110000, 0x00330022, // ABin + 0x0a0a0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 
0x00000100, // AccumType, ConstantType, and PostShift + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000, + 0x3c003c00, 0x00000000, 0x3c003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniConvertF16_0_4x4 = {{ + 0x01010101, // TCfg + 0x00000000, // ASelt + 0x00010000, 0x00030002, // ABin + 0x02020202, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000100, // AccumType, ConstantType, and PostShift + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000, + 0x00003c00, 0x00000000, 0x00003c00, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param(node, "uniExtract8Data_2x8", &uniExtractInteger_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniF16PlusF16_0_4x4", &uniF16PlusF16_0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertF16_0_4x4", &uniConvertF16_0_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "hstate_in_scale", &hstate_in_scale); + status |= vsi_nn_kernel_gpu_add_param(node, "hstate_in_tail", &hstate_in_tail); + status |= vsi_nn_kernel_gpu_add_param(node, "output_scale", &output_scale); + status |= vsi_nn_kernel_gpu_add_param(node, "output_zp", &output_zp); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + break; + default: + break; + } + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + CHECK_STATUS_FAIL_GOTO(status, final ); +final: + for (i = 0; i < GRUCELL_ACT_IN_CNT; i++) + { + if (input_attr[i]) + { + vsi_nn_kernel_tensor_attr_release( &input_attr[i] ); + } + } + if (output_attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &output_attr[0] ); + } + + if (output_attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &output_attr[1] ); + } + return status; +} /* _grucell_reset_after_activation_initializer() */ + + + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t recurrent_activation + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e hstate_dtype; + vsi_nn_kernel_dtype_e fc_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _grucell_reset_after_activation_kernel_map; + size_t kernel_map_size = _cnt_of_array( _grucell_reset_after_activation_kernel_map ); + vx_param_description_t * param_def = _grucell_reset_after_activation_kernel_param_def; + vx_kernel_initialize_f initializer = _grucell_reset_after_activation_initializer; + + uint32_t key; + uint32_t i; + + hstate_dtype = vsi_nn_kernel_map_dtype( inputs[GRUCELL_ACT_H_STATE]->attr.dtype.vx_type ); + fc_dtype = vsi_nn_kernel_map_dtype( inputs[GRUCELL_ACT_I_FC_Z]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[GRUCELL_ACT_OUT_OUTPUT]->attr.dtype.vx_type ); + + key = GRUCELL_RESET_AFTER_ACTIVATION_HASH_KEY( hstate_dtype, fc_dtype, out_dtype, recurrent_activation ); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _grucell_reset_after_activation_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = 
VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_GRUCELL_RESET_AFTER_ACTIVATION_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + int32_t activation = vsi_nn_kernel_param_get_int32( params, "activation" ); + int32_t recurrent_activation = vsi_nn_kernel_param_get_int32( params, "recurrent_activation" ); + + if( activation != VSI_NN_ACT_TANH ) + { + return NULL; + } + + status = _query_kernel( kernel, inputs, outputs, recurrent_activation ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _GRUCELL_RESET_AFTER_ACTIVATION_PARAM_NUM, + inputs, input_num, outputs, output_num ); + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _GRUCELL_RESET_AFTER_ACTIVATION_PARAM_NUM ); + } + } + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( grucell_reset_after_activation, _setup ) diff --git a/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c b/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c index a01c9f4..9ddc0bf 100644 --- a/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c @@ -1077,7 +1077,8 @@ static vsi_nn_kernel_node_t _setup attr.vtl = TRUE; attr.size[0] = ((shape[0] + 255) / 256) * 4; if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT16 - || inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16) + || inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT16 + || inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16) { attr.size[0] = ((shape[0] + 127) / 128) * 4; } @@ -1137,7 +1138,7 @@ static vsi_nn_kernel_node_t _setup border.constant_value.U16 = 0; if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) { - border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; + border.constant_value.U8 = (uint8_t)vsi_nn_get_tensor_zero_point(inputs[0]); } status = vxSetNodeAttribute( (vx_node)tmp_node, VX_NODE_BORDER, &border, sizeof(border) ); CHECK_STATUS(status); @@ -1200,7 +1201,7 @@ static vsi_nn_kernel_node_t _setup border.constant_value.U16 = 0; if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) { - border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; + border.constant_value.U8 = (uint8_t)vsi_nn_get_tensor_zero_point(inputs[0]); } status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); CHECK_STATUS(status); @@ -1244,4 +1245,3 @@ final: __END_DECLS REGISTER_BACKEND_EVIS( instance_norm, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/evis/l2normalizescale_evis.c b/src/tim/vx/internal/src/kernel/evis/l2normalizescale_evis.c index 2943617..c7326d4 100644 --- a/src/tim/vx/internal/src/kernel/evis/l2normalizescale_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/l2normalizescale_evis.c @@ -544,7 +544,7 @@ static vsi_nn_kernel_node_t _setup border.constant_value.U8 = 0; if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) { - border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; + border.constant_value.U8 = (uint8_t)vsi_nn_get_tensor_zero_point(inputs[0]); } status = 
vsi_nn_kernel_node_set_border( node, &border ); VSI_ASSERT( status == VSI_SUCCESS ); diff --git a/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c b/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c index 1de96db..e6ecaa5 100644 --- a/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c @@ -1305,7 +1305,7 @@ static vsi_nn_kernel_node_t _setup_wh border.constant_value.U16 = 0; if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) { - border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; + border.constant_value.U8 = (uint8_t)vsi_nn_get_tensor_zero_point(inputs[0]); } status = vxSetNodeAttribute( (vx_node)tmp_node, VX_NODE_BORDER, &border, sizeof(border) ); CHECK_STATUS(status); @@ -1337,7 +1337,7 @@ static vsi_nn_kernel_node_t _setup_wh border.constant_value.U16 = 0; if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) { - border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; + border.constant_value.U8 = (uint8_t)vsi_nn_get_tensor_zero_point(inputs[0]); } status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); CHECK_STATUS(status); @@ -1500,7 +1500,7 @@ static vsi_nn_kernel_node_t _setup border.constant_value.U16 = 0; if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) { - border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; + border.constant_value.U8 = (uint8_t)vsi_nn_get_tensor_zero_point(inputs[0]); } status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); CHECK_STATUS(status); diff --git a/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c b/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c index 3a1eb37..f368c97 100644 --- a/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c @@ -1141,7 +1141,7 @@ static vsi_nn_kernel_node_t _setup border.constant_value.U32 = 0; if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) { - border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; + border.constant_value.U8 = (uint8_t)vsi_nn_get_tensor_zero_point(inputs[0]); } if (K % 4 == 0 && N % 4 == 0) { diff --git a/src/tim/vx/internal/src/kernel/evis/moments_evis.c b/src/tim/vx/internal/src/kernel/evis/moments_evis.c index 2379574..cf540bc 100644 --- a/src/tim/vx/internal/src/kernel/evis/moments_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/moments_evis.c @@ -825,7 +825,7 @@ static vsi_nn_kernel_node_t _setup CHECK_STATUS(status); vsi_nn_kernel_scalar_release( &node_params[3] ); vsi_nn_kernel_scalar_release( &node_params[4] ); - status = set_constant_border(node, inputs[0]->attr.dtype.zero_point); + status = set_constant_border(node, vsi_nn_get_tensor_zero_point(inputs[0])); CHECK_STATUS(status); } } @@ -844,4 +844,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_EVIS( moments, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_gray_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_gray_evis.c index dc478f9..2201205 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_gray_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_gray_evis.c @@ -46,14 +46,20 @@ __BEGIN_DECLS #define VX_KERNEL_NAME_PRE_PROCESS_GRAY_COPY_U8TOI8 CVIVANTE_NAMESPACE("evis.pre_process_gray_copy_U8toI8") #define VX_KERNEL_NAME_PRE_PROCESS_GRAY_COPY_U8TOI16 CVIVANTE_NAMESPACE("evis.pre_process_gray_copy_U8toI16") #define 
VX_KERNEL_NAME_PRE_PROCESS_GRAY_COPY_U8TOF16 CVIVANTE_NAMESPACE("evis.pre_process_gray_copy_U8toF16") +#define VX_KERNEL_NAME_PRE_PROCESS_GRAY_HALF_U8TOU8 CVIVANTE_NAMESPACE("evis.pre_process_gray_half_U8toU8") +#define VX_KERNEL_NAME_PRE_PROCESS_GRAY_FOUR_OVER_THREE_U8TOU8 \ + CVIVANTE_NAMESPACE("evis.pre_process_gray_4over3_U8toU8") #define KERNEL_SOURCE_1 "pre_process_gray", #define KERNEL_SOURCE_2 "pre_process_gray_copy" +#define KERNEL_SOURCE_3 "pre_process_gray_2" typedef enum { COPY = 0, - SCALE + SCALE, + FOUR_OVER_THREE, + HALF } vsi_nn_gray_convert_type_e; #define HASH_PRE_PROCESS_GRAY_KEY(_input0_type, _output_type, _convert_type, _image_2d) \ @@ -70,14 +76,16 @@ static const struct { const char* source_name; } pre_process_gray_map[] = { - TENSOR_PRE_PROCESS_GRAY_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_1) - TENSOR_PRE_PROCESS_GRAY_KERNELS(U8, I8, SCALE, KERNEL_SOURCE_1) - TENSOR_PRE_PROCESS_GRAY_KERNELS(U8, I16, SCALE, KERNEL_SOURCE_1) - TENSOR_PRE_PROCESS_GRAY_KERNELS(U8, F16, SCALE, KERNEL_SOURCE_1) - TENSOR_PRE_PROCESS_GRAY_KERNELS(U8, U8, COPY, KERNEL_SOURCE_2) - TENSOR_PRE_PROCESS_GRAY_KERNELS(U8, I8, COPY, KERNEL_SOURCE_2) - TENSOR_PRE_PROCESS_GRAY_KERNELS(U8, I16, COPY, KERNEL_SOURCE_2) - TENSOR_PRE_PROCESS_GRAY_KERNELS(U8, F16, COPY, KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_GRAY_KERNELS(U8, U8, SCALE, KERNEL_SOURCE_1) + TENSOR_PRE_PROCESS_GRAY_KERNELS(U8, I8, SCALE, KERNEL_SOURCE_1) + TENSOR_PRE_PROCESS_GRAY_KERNELS(U8, I16, SCALE, KERNEL_SOURCE_1) + TENSOR_PRE_PROCESS_GRAY_KERNELS(U8, F16, SCALE, KERNEL_SOURCE_1) + TENSOR_PRE_PROCESS_GRAY_KERNELS(U8, U8, COPY, KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_GRAY_KERNELS(U8, I8, COPY, KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_GRAY_KERNELS(U8, I16, COPY, KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_GRAY_KERNELS(U8, F16, COPY, KERNEL_SOURCE_2) + TENSOR_PRE_PROCESS_GRAY_KERNELS(U8, U8, FOUR_OVER_THREE, KERNEL_SOURCE_3) + TENSOR_PRE_PROCESS_GRAY_KERNELS(U8, U8, HALF, KERNEL_SOURCE_3) }; static vx_param_description_t vxPreProcessGrayKernel_param_def[] = @@ -358,14 +366,150 @@ OnError: attr[0] = NULL; } return status; -} /* _pre_process_gray_copy_initializer() */ +} /* _pre_process_gray_initializer() */ + +DEF_KERNEL_INITIALIZER(_resize_gray_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t shaderParam = { + 2, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0}}; // globalWorkSize: image size in thread + + uint32_t width = 0; + uint32_t height = 0; + vsi_bool is_4_over_3 = 0; + vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; + vsi_size_array_t * out_shape = NULL; + + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); + + out_shape = attr[1]->shape; + width = (uint32_t)(out_shape->data[0]); + height = (uint32_t)(out_shape->data[1]); + + is_4_over_3 = (attr[0]->shape->data[0] * 3 == width * 4) && + (attr[0]->shape->data[1] * 3 == height * 4); + + if (is_4_over_3) + { + shaderParam.global_scale[0] = 16; + shaderParam.global_scale[1] = 4; + shaderParam.global_size[0] = 
gpu_align_p2((attr[0]->shape->data[0] + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0], 4); + shaderParam.global_size[1] = (attr[0]->shape->data[1] + shaderParam.global_scale[1] - 1) + / shaderParam.global_scale[1]; + } + else + { + shaderParam.global_scale[0] = 16; + shaderParam.global_scale[1] = 2; + shaderParam.global_size[0] = gpu_align_p2((attr[0]->shape->data[0] + shaderParam.global_scale[0] - 1) + / shaderParam.global_scale[0], 4); + shaderParam.global_size[1] = (attr[0]->shape->data[1] + shaderParam.global_scale[1] - 1) + / shaderParam.global_scale[1]; + } + + status = vsi_nn_kernel_gpu_config( node, &shaderParam ); + CHECK_STATUS_FAIL_GOTO(status, OnError); + + if (is_4_over_3) + { + gpu_dp_inst_t uniBilinear_4over3_l00_2x8 = {{ + 0x51551551, // TCfg + 0x00000000, // ASelt + 0x04322100, 0xa9087665, // ABin + 0xa2aa2aa2, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000610, // AccumType, ConstantType, and PostShift + 0x0000ffff, 0x5555aaaa, 0xaaaa5555, 0x0000ffff, + 0x5555aaaa, 0xaaaa5555, 0x0000ffff, 0x5555aaaa // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniBilinear_4over3_l10_2x8 = {{ + 0x00005515, // TCfg + 0x00000000, // ASelt + 0xfeed0cba, 0x00000000, // ABin + 0x0000aa2a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000610, // AccumType, ConstantType, and PostShift + 0xaaaa5555, 0x0000ffff, 0x5555aaaa, 0xaaaa5555, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniBilinear_4over3_l01_4x4 = {{ + 0x05555505, // TCfg + 0x04505004, // ASelt + 0x21210000, 0x00443232, // ABin + 0x0aaaaa0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000610, // AccumType, ConstantType, and PostShift + 0x5555aaaa, 0x00000000, 0x38e471c7, 0x1c7238e4, + 0x71c738e4, 0x38e41c72, 0x5555aaaa, 0x00000000 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniBilinear_4over3_l11_4x4 = {{ + 0x55055555, // TCfg + 0x50045050, // ASelt + 0x76766565, 0xa9a90088, // ABin + 0xaa0aaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000610, // AccumType, ConstantType, and PostShift + 0x38e471c7, 0x1c7238e4, 0x71c738e4, 0x38e41c72, + 0x5555aaaa, 0x00000000, 0x38e471c7, 0x1c7238e4 // Constant + }, GPU_DP_TYPE_16 }; + gpu_dp_inst_t uniBilinear_4over3_l21_4x4 = {{ + 0x55550555, // TCfg + 0x50500450, // ASelt + 0x00ccbaba, 0xfefeeded, // ABin + 0xaaaa0aaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000610, // AccumType, ConstantType, and PostShift + 0x71c738e4, 0x38e41c72, 0x5555aaaa, 0x00000000, + 0x38e471c7, 0x1c7238e4, 0x71c738e4, 0x38e41c72 // Constant + }, GPU_DP_TYPE_16 }; + + + status = vsi_nn_kernel_gpu_add_param(node, "uniBilinear_4over3_l00_2x8", &uniBilinear_4over3_l00_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinear_4over3_l10_2x8", &uniBilinear_4over3_l10_2x8); + status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinear_4over3_l01_4x4", &uniBilinear_4over3_l01_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinear_4over3_l11_4x4", &uniBilinear_4over3_l11_4x4); + status |= vsi_nn_kernel_gpu_add_param(node, "uniBilinear_4over3_l21_4x4", &uniBilinear_4over3_l21_4x4); + + CHECK_STATUS_FAIL_GOTO(status, OnError ); + } + +OnError: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + attr[0] = NULL; + } + + return status; +} /* _resize_gray_initializer() */ static vsi_status _query_kernel ( vsi_nn_tensor_t* const* const inputs, vsi_nn_tensor_t* const* const outputs, vsi_nn_kernel_t* kernel, 
- const vsi_nn_kernel_param_t * params + const vsi_nn_kernel_param_t * params, + vsi_bool is_no_range_change, + int32_t width, + int32_t height ) { vsi_nn_kernel_dtype_e input0_dtype = U8; @@ -373,40 +517,61 @@ static vsi_status _query_kernel vsi_nn_gray_convert_type_e convert_type = SCALE; vsi_status status = VSI_FAILURE; uint32_t key = 0; - int i = 0; - vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" ); + int32_t i = 0; + vsi_bool is_4_over_3 = FALSE; + vsi_bool is_half_scale = FALSE; + vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" ); + is_4_over_3 = (width * 3 == (int32_t)outputs[0]->attr.size[0] * 4) && + (height * 3 == (int32_t)outputs[0]->attr.size[1] * 4); + is_half_scale = (width == (int32_t)outputs[0]->attr.size[0] * 2) && + (height == (int32_t)outputs[0]->attr.size[1] * 2); input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - if(enable_copy) + if (enable_copy) { convert_type = COPY; } else { - convert_type = SCALE; + if (is_no_range_change && is_4_over_3) + { + convert_type = FOUR_OVER_THREE; + } + else if (is_no_range_change && is_half_scale) + { + convert_type = HALF; + } + else + { + convert_type = SCALE; + } } key = HASH_PRE_PROCESS_GRAY_KEY( input0_dtype, output_dtype, convert_type, 0 ); - for( i = 0; i < _cnt_of_array(pre_process_gray_map); i ++ ) + for ( i = 0; i < _cnt_of_array(pre_process_gray_map); i ++ ) { - if( pre_process_gray_map[i].key == key ) + if ( pre_process_gray_map[i].key == key ) { break; } } - if( i < _cnt_of_array(pre_process_gray_map) ) + if ( i < _cnt_of_array(pre_process_gray_map) ) { snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", pre_process_gray_map[i].function_name ); kernel->info.parameters = vxPreProcessGrayKernel_param_def; kernel->info.numParams = _cnt_of_array( vxPreProcessGrayKernel_param_def ); - if(enable_copy) + if (enable_copy) { kernel->info.initialize = _pre_process_gray_copy_initializer; } + else if (convert_type == FOUR_OVER_THREE || convert_type == HALF) + { + kernel->info.initialize = _resize_gray_initializer; + } else { kernel->info.initialize = _pre_process_gray_initializer; @@ -435,6 +600,11 @@ static vsi_nn_kernel_node_t _setup vsi_status status = VSI_FAILURE; vsi_nn_kernel_node_param_t tmp_params[_EVIS_PRE_PROCESS_GRAY_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_t node = NULL; + int32_t width = vsi_nn_kernel_param_get_int32( params, "width" ); + int32_t height = vsi_nn_kernel_param_get_int32( params, "height" ); + float mean = vsi_nn_kernel_param_get_float32( params, "mean" ); + float scale = vsi_nn_kernel_param_get_float32( params, "scale" ); + vsi_bool is_no_range_change = FALSE; if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) @@ -442,7 +612,16 @@ static vsi_nn_kernel_node_t _setup return NULL; } - status = _query_kernel( inputs, outputs, kernel, params ); + if (width == (int32_t)inputs[0]->attr.size[0] && height == (int32_t)inputs[0]->attr.size[1] && + outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8 && + outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC && + (float)outputs[0]->attr.dtype.zero_point == mean && + vsi_nn_abs(outputs[0]->attr.dtype.scale - scale) < 1e-8 ) + { + is_no_range_change = TRUE; + } + + status = _query_kernel( inputs, outputs, kernel, params, is_no_range_change, width, height ); if( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); @@ -453,8 +632,6 @@ static 
vsi_nn_kernel_node_t _setup int32_t scale_y = vsi_nn_kernel_param_get_int32( params, "scale_y" ); int32_t left = vsi_nn_kernel_param_get_int32( params, "left" ); int32_t top = vsi_nn_kernel_param_get_int32( params, "top" ); - float mean = vsi_nn_kernel_param_get_float32( params, "mean" ); - float scale = vsi_nn_kernel_param_get_float32( params, "scale" ); /* Pass parameters to node. */ vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_GRAY_PARAM_NUM, @@ -481,4 +658,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_EVIS( pre_process_gray, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/evis/reducemax_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/reducemax_internal_evis.c index 39b9649..e70b58a 100644 --- a/src/tim/vx/internal/src/kernel/evis/reducemax_internal_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/reducemax_internal_evis.c @@ -277,6 +277,15 @@ DEF_KERNEL_INITIALIZER(_reducemax_internal_initializer) status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric ); CHECK_STATUS_FAIL_GOTO(status, final ); } + else + { + inputScale = 1.0f; + input_offset_asymmetric = 0; + + status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale ); + status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } if( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) { @@ -300,6 +309,14 @@ DEF_KERNEL_INITIALIZER(_reducemax_internal_initializer) status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric ); CHECK_STATUS_FAIL_GOTO(status, final ); } + else + { + outputScale = 1.0f; + output_offset_asymmetric = 0; + status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale ); + status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } status = vsi_nn_kernel_gpu_add_param( node, "axisSize", &axisSize ); CHECK_STATUS_FAIL_GOTO(status, final ); status = vsi_nn_kernel_gpu_config( node, &gpu_param ); diff --git a/src/tim/vx/internal/src/kernel/evis/reducemin_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/reducemin_internal_evis.c index 7ec74d5..b1149fd 100644 --- a/src/tim/vx/internal/src/kernel/evis/reducemin_internal_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/reducemin_internal_evis.c @@ -279,6 +279,15 @@ DEF_KERNEL_INITIALIZER(_reducemin_internal_initializer) status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric ); CHECK_STATUS_FAIL_GOTO(status, final ); } + else + { + inputScale = 1.0f; + input_offset_asymmetric = 0; + + status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale ); + status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } if( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) { @@ -302,6 +311,15 @@ DEF_KERNEL_INITIALIZER(_reducemin_internal_initializer) status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric ); CHECK_STATUS_FAIL_GOTO(status, final ); } + else + { + outputScale = 1.0f; + output_offset_asymmetric = 0; + status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale ); + status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + status = vsi_nn_kernel_gpu_add_param( node, 
"axisSize", &axisSize ); CHECK_STATUS_FAIL_GOTO(status, final ); status = vsi_nn_kernel_gpu_config( node, &gpu_param ); @@ -426,4 +444,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_EVIS( reducemin_internal, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/evis/reduceprod_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/reduceprod_internal_evis.c index bbdf29e..6fd1b7d 100644 --- a/src/tim/vx/internal/src/kernel/evis/reduceprod_internal_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/reduceprod_internal_evis.c @@ -368,6 +368,15 @@ DEF_KERNEL_INITIALIZER(_reduceprod_internal_initializer) status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric ); CHECK_STATUS_FAIL_GOTO(status, final ); } + else + { + inputScale = 1.0f; + input_offset_asymmetric = 0; + + status = vsi_nn_kernel_gpu_add_param( node, "inputScale", &inputScale ); + status |= vsi_nn_kernel_gpu_add_param( node, "input_offset_asymmetric", &input_offset_asymmetric ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } if( output_attr->quant == VSI_NN_KERNEL_QUANT_DFP ) { @@ -391,6 +400,15 @@ DEF_KERNEL_INITIALIZER(_reduceprod_internal_initializer) status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric ); CHECK_STATUS_FAIL_GOTO(status, final ); } + else + { + outputScale = 1.0f; + output_offset_asymmetric = 0; + status = vsi_nn_kernel_gpu_add_param( node, "outputScale", &outputScale ); + status |= vsi_nn_kernel_gpu_add_param( node, "output_offset_asymmetric", &output_offset_asymmetric ); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); final: @@ -508,4 +526,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_EVIS( reduceprod_internal, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/evis/repeat_evis.c b/src/tim/vx/internal/src/kernel/evis/repeat_evis.c index 35d2b63..ac72b9f 100644 --- a/src/tim/vx/internal/src/kernel/evis/repeat_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/repeat_evis.c @@ -525,7 +525,7 @@ static vsi_nn_kernel_node_t _setup border.constant_value.S32 = 0; if (inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_UINT8) { - border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; + border.constant_value.U8 = (uint8_t)vsi_nn_get_tensor_zero_point(inputs[0]); } status = vxSetNodeAttribute( (vx_node)tmp_node, VX_NODE_BORDER, &border, sizeof(border) ); CHECK_STATUS(status); @@ -606,4 +606,3 @@ final: __END_DECLS REGISTER_BACKEND_EVIS( repeat, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_nhwc_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_nhwc_evis.c new file mode 100644 index 0000000..6e2e6bd --- /dev/null +++ b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_nhwc_evis.c @@ -0,0 +1,520 @@ +/**************************************************************************** +* +* Copyright (c) 2021 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the 
Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_dtype_util_prv.h" + +__BEGIN_DECLS + +#define STR(a) #a +// Add kernel hashtable here +#define RESIZE_BILINEAR_NHWC_HASH_KEY( IN_DTYPE, OUT_DTYPE, H_PIXEL_CENTERS, ALIGN_CORNERS, UP_SCALE ) \ + (( IN_DTYPE ) | ( OUT_DTYPE << 8) | (H_PIXEL_CENTERS << 16) | (ALIGN_CORNERS << 17) | (UP_SCALE << 18)) + +#define BILINEAR_NHWC_PACK_KERNEL_MAP_UP_SCALE( IN_DTYPE, OUT_DTYPE, H_PIXEL_CENTERS, ALIGN_CORNERS, UP_SCALE ) \ + { RESIZE_BILINEAR_NHWC_HASH_KEY( IN_DTYPE, OUT_DTYPE, H_PIXEL_CENTERS, ALIGN_CORNERS, UP_SCALE ), \ + CVIVANTE_NAMESPACE("evis.resize_bilinear_nhwc_"STR(IN_DTYPE)"to"STR(OUT_DTYPE) \ + "_"STR(UP_SCALE)"x_upsample_half_pixel_centers"), \ + "resize_bilinear_nhwc" } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _resize_bilinear_nhwc_kernel_map[] = +{ + BILINEAR_NHWC_PACK_KERNEL_MAP_UP_SCALE(U8, U8, 1, 0, 2), + BILINEAR_NHWC_PACK_KERNEL_MAP_UP_SCALE(U8, U8, 1, 0, 3), + BILINEAR_NHWC_PACK_KERNEL_MAP_UP_SCALE(U8, U8, 1, 0, 4), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _resize_bilinear_nhwc_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _RESIZE_BILINEAR_NHWC_PARAM_NUM _cnt_of_array( _resize_bilinear_nhwc_kernel_param_def ) + +#define SCALAR_ALIGN_CORNERS (2) +#define SCALAR_HALF_PIXEL (3) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_resize_bilinear_nhwc_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + {0, 0, 0} + }; + vsi_nn_kernel_tensor_attr_t * output_attr = NULL; + vsi_nn_kernel_tensor_attr_t * input_attr = NULL; + vsi_size_array_t * out_shape = NULL; + vsi_size_array_t * in_shape = NULL; + int32_t align_corners = 0; + int32_t half_pixel_centers = 0; + uint32_t in_width; + uint32_t in_height; + uint32_t out_width; + uint32_t out_height; + vsi_bool is_half_pixel_centers = FALSE; + vsi_bool is_2x_up_kernel = FALSE; + vsi_bool is_3x_up_kernel = FALSE; + vsi_bool is_4x_up_kernel = FALSE; + + input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); + output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + 
CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &align_corners); + CHECK_STATUS_FAIL_GOTO(status, final ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &half_pixel_centers); + CHECK_STATUS_FAIL_GOTO(status, final ); + + out_shape = output_attr->shape; + in_shape = input_attr->shape; + + in_width = (uint32_t)(in_shape->data[0]); + in_height = (uint32_t)(in_shape->data[1]); + out_width = (uint32_t)(out_shape->data[0]); + out_height = (uint32_t)(out_shape->data[1]); + + is_half_pixel_centers = (!align_corners) && (half_pixel_centers); + + if (is_half_pixel_centers) + { + is_2x_up_kernel = (2 * in_width == out_width) && (2 * in_height == out_height); + is_3x_up_kernel = (3 * in_width == out_width) && (3 * in_height == out_height); + is_4x_up_kernel = (4 * in_width == out_width) && (4 * in_height == out_height); + } + + if (is_2x_up_kernel) + { + gpu_param.global_scale[0] = 16; + gpu_param.global_scale[1] = 4; + } + else if (is_4x_up_kernel) + { + gpu_param.global_scale[0] = 16; + gpu_param.global_scale[1] = 8; + } + else if (is_3x_up_kernel) + { + gpu_param.global_scale[0] = 30; + gpu_param.global_scale[1] = 6; + } + else + { + gpu_param.global_scale[0] = 4; + gpu_param.global_scale[1] = 1; + } + + if (is_2x_up_kernel) + { + gpu_dp_inst_t uniResize_x2_nhwc2_0_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x46194040, 0x3a48829c, 0x4882acca, 0xc4acca3a, 0xbd4e5b50, // BinSelect + 0x00000704, // AccumType, ConstantType, and PostShift + 0x09030301, 0x09030301, 0x03090103, 0x03090103, + 0x09030301, 0x09030301, 0x03090103, 0x03090103 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize_x2_nhwc2_1_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x4e5b50c4, 0x7c5906bd, 0x5906cdd2, 0x48cdd27c, 0xde569d61, // BinSelect + 0x00000704, // AccumType, ConstantType, and PostShift + 0x09030301, 0x09030301, 0x03090103, 0x03090103, + 0x09030301, 0x09030301, 0x03090103, 0x03090103 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param( node, "uniResize_x2_nhwc2_0_4x8", &uniResize_x2_nhwc2_0_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize_x2_nhwc2_1_4x8", &uniResize_x2_nhwc2_1_4x8); + //status |= vsi_nn_kernel_gpu_add_param( node, "out_height", &out_height); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if (is_3x_up_kernel) + { + gpu_dp_inst_t uniResize_x3_nhwc2_l10_4x4 = {{ + 0x05055555, // TCfg + 0x04045050, // ASelt + 0x31312020, 0x00330022, // ABin + 0x0a0aaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x0000060f, // AccumType, ConstantType, and PostShift + 0x38e41c72, 0x1c720e39, 0x38e41c72, 0x1c720e39, + 0x2aab5556, 0x00000000, 0x2aab5556, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize_x3_nhwc2_l11_4x4 = {{ + 0x55555555, // TCfg + 0x50505050, // ASelt + 0x53534242, 0x53534242, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x0000060f, // AccumType, ConstantType, and PostShift + 0x1c7238e4, 0x0e391c72, 0x1c7238e4, 0x0e391c72, + 0x38e41c72, 0x1c720e39, 0x38e41c72, 0x1c720e39 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize_x3_nhwc2_l12_4x4 = {{ + 0x55550505, // TCfg + 0x50500404, // ASelt + 0x00550044, 0x75756464, // ABin + 0xaaaa0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x0000060f, // AccumType, ConstantType, and PostShift + 0x2aab5556, 0x00000000, 0x2aab5556, 0x00000000, + 0x1c7238e4, 0x0e391c72, 0x1c7238e4, 0x0e391c72 // Constant + }, GPU_DP_TYPE_16}; + 
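+ /* The Constant words in these uniResize_x3_nhwc2_* patterns appear to hold the
+  * bilinear taps in Q15 (PostShift 0x0f divides the accumulated sum by 2^15):
+  * 0x5556 ~ 2/3, 0x2aab ~ 1/3, 0x38e4 ~ 4/9, 0x1c72 ~ 2/9, 0x0e39 ~ 1/9, i.e. the
+  * 1/3- and 2/3-offset weights of the 3x half-pixel-centers upsample. */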
gpu_dp_inst_t uniResize_x3_nhwc2_l13_4x4 = {{ + 0x05055555, // TCfg + 0x04045050, // ASelt + 0x75756464, 0x00770066, // ABin + 0x0a0aaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x0000060f, // AccumType, ConstantType, and PostShift + 0x38e41c72, 0x1c720e39, 0x38e41c72, 0x1c720e39, + 0x2aab5556, 0x00000000, 0x2aab5556, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize_x3_nhwc2_l14_4x4 = {{ + 0x55555555, // TCfg + 0x50505050, // ASelt + 0x97978686, 0x97978686, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x0000060f, // AccumType, ConstantType, and PostShift + 0x1c7238e4, 0x0e391c72, 0x1c7238e4, 0x0e391c72, + 0x38e41c72, 0x1c720e39, 0x38e41c72, 0x1c720e39 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize_x3_nhwc2_l15_4x4 = {{ + 0x55550505, // TCfg + 0x50500404, // ASelt + 0x00990088, 0xb9b9a8a8, // ABin + 0xaaaa0a0a, // BSelt + 0x00000000, 0x00000000, // BBin + 0x0000060f, // AccumType, ConstantType, and PostShift + 0x2aab5556, 0x00000000, 0x2aab5556, 0x00000000, + 0x1c7238e4, 0x0e391c72, 0x1c7238e4, 0x0e391c72 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize_x3_nhwc2_l16_4x4 = {{ + 0x05055555, // TCfg + 0x04045050, // ASelt + 0xb9b9a8a8, 0x00bb00aa, // ABin + 0x0a0aaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x0000060f, // AccumType, ConstantType, and PostShift + 0x38e41c72, 0x1c720e39, 0x38e41c72, 0x1c720e39, + 0x2aab5556, 0x00000000, 0x2aab5556, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize_x3_nhwc2_l17_4x4 = {{ + 0x55555555, // TCfg + 0x50505050, // ASelt + 0xdbdbcaca, 0xdbdbcaca, // ABin + 0xaaaaaaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x0000060f, // AccumType, ConstantType, and PostShift + 0x1c7238e4, 0x0e391c72, 0x1c7238e4, 0x0e391c72, + 0x38e41c72, 0x1c720e39, 0x38e41c72, 0x1c720e39 // Constant + }, GPU_DP_TYPE_16}; + + + gpu_dp_inst_t uniResize_x3_nhwc2_l00_2x8 = {{ + 0x55551155, // TCfg + 0x00000000, // ASelt + 0x03023120, 0x53425342, // ABin + 0xaaaa22aa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000610, // AccumType, ConstantType, and PostShift + 0xaaaa5555, 0xaaaa5555, 0x0000ffff, 0x0000ffff, + 0x5555aaaa, 0x5555aaaa, 0xaaaa5555, 0xaaaa5555 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize_x3_nhwc2_l01_2x8 = {{ + 0x11555511, // TCfg + 0x00000000, // ASelt + 0x75640504, 0x07067564, // ABin + 0x22aaaa22, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000610, // AccumType, ConstantType, and PostShift + 0x0000ffff, 0x0000ffff, 0x5555aaaa, 0x5555aaaa, + 0xaaaa5555, 0xaaaa5555, 0x0000ffff, 0x0000ffff // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize_x3_nhwc2_l02_2x8 = {{ + 0x55115555, // TCfg + 0x00000000, // ASelt + 0x97869786, 0xb9a80908, // ABin + 0xaa22aaaa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000610, // AccumType, ConstantType, and PostShift + 0x5555aaaa, 0x5555aaaa, 0xaaaa5555, 0xaaaa5555, + 0x0000ffff, 0x0000ffff, 0x5555aaaa, 0x5555aaaa // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize_x3_nhwc2_l03_2x8 = {{ + 0x00551155, // TCfg + 0x00000000, // ASelt + 0x0b0ab9a8, 0x0000dbca, // ABin + 0x00aa22aa, // BSelt + 0x00000000, 0x00000000, // BBin + 0x00000610, // AccumType, ConstantType, and PostShift + 0xaaaa5555, 0xaaaa5555, 0x0000ffff, 0x0000ffff, + 0x5555aaaa, 0x5555aaaa, 0x00000000, 0x00000000 // Constant + }, GPU_DP_TYPE_16}; + + + status = vsi_nn_kernel_gpu_add_param( node, "uniResize_x3_nhwc2_l00_2x8", &uniResize_x3_nhwc2_l00_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize_x3_nhwc2_l01_2x8", 
&uniResize_x3_nhwc2_l01_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize_x3_nhwc2_l02_2x8", &uniResize_x3_nhwc2_l02_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize_x3_nhwc2_l03_2x8", &uniResize_x3_nhwc2_l03_2x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize_x3_nhwc2_l10_4x4", &uniResize_x3_nhwc2_l10_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize_x3_nhwc2_l11_4x4", &uniResize_x3_nhwc2_l11_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize_x3_nhwc2_l12_4x4", &uniResize_x3_nhwc2_l12_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize_x3_nhwc2_l13_4x4", &uniResize_x3_nhwc2_l13_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize_x3_nhwc2_l14_4x4", &uniResize_x3_nhwc2_l14_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize_x3_nhwc2_l15_4x4", &uniResize_x3_nhwc2_l15_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize_x3_nhwc2_l16_4x4", &uniResize_x3_nhwc2_l16_4x4); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize_x3_nhwc2_l17_4x4", &uniResize_x3_nhwc2_l17_4x4); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else if (is_4x_up_kernel) + { + gpu_dp_inst_t uniResize_x4_nhwc2_l00_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x46194040, 0x1940409c, 0x48829c46, 0x82acca3a, 0xacca3a48, // BinSelect + 0x00000706, // AccumType, ConstantType, and PostShift + 0x190f0f09, 0x190f0f09, 0x23051503, 0x23051503, + 0x05230315, 0x05230315, 0x0f19090f, 0x0f19090f // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize_x4_nhwc2_l01_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0xca3a4882, 0x3a4882ac, 0x50c4acca, 0xc4bd4e5b, 0xbd4e5b50, // BinSelect + 0x00000706, // AccumType, ConstantType, and PostShift + 0x190f0f09, 0x190f0f09, 0x23051503, 0x23051503, + 0x05230315, 0x05230315, 0x0f19090f, 0x0f19090f // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize_x4_nhwc2_l10_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0x46194040, 0x1940409c, 0x48829c46, 0x82acca3a, 0xacca3a48, // BinSelect + 0x00000706, // AccumType, ConstantType, and PostShift + 0x23150503, 0x23150503, 0x31070701, 0x31070701, + 0x07310107, 0x07310107, 0x15230305, 0x15230305 // Constant + }, GPU_DP_TYPE_16}; + gpu_dp_inst_t uniResize_x4_nhwc2_l11_4x8 = {{ + 0x55555555, 0x55555555, // TCfg + 0xca3a4882, 0x3a4882ac, 0x50c4acca, 0xc4bd4e5b, 0xbd4e5b50, // BinSelect + 0x00000706, // AccumType, ConstantType, and PostShift + 0x23150503, 0x23150503, 0x31070701, 0x31070701, + 0x07310107, 0x07310107, 0x15230305, 0x15230305 // Constant + }, GPU_DP_TYPE_16}; + + status = vsi_nn_kernel_gpu_add_param( node, "uniResize_x4_nhwc2_l00_4x8", &uniResize_x4_nhwc2_l00_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize_x4_nhwc2_l01_4x8", &uniResize_x4_nhwc2_l01_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize_x4_nhwc2_l10_4x8", &uniResize_x4_nhwc2_l10_4x8); + status |= vsi_nn_kernel_gpu_add_param( node, "uniResize_x4_nhwc2_l11_4x8", &uniResize_x4_nhwc2_l11_4x8); + CHECK_STATUS_FAIL_GOTO(status, final ); + } + else + { + VSILOGE("input or output's format is not support"); + status = VSI_FAILURE; + goto final; + } + + gpu_param.global_size[0] = gpu_align_p2((out_width + \ + gpu_param.global_scale[0] - 1) / gpu_param.global_scale[0], 4); + gpu_param.global_size[1] = (out_height + \ + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1]; + gpu_param.dim = 2; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); +final: + if (input_attr) vsi_nn_kernel_tensor_attr_release( &input_attr ); + if (output_attr) 
vsi_nn_kernel_tensor_attr_release( &output_attr ); + + return status; +} /* _resize_bilinear_initializer() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + int32_t align_corners, + int32_t half_pixel_centers, + uint32_t up_scale + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _resize_bilinear_nhwc_kernel_map; + size_t kernel_map_size = _cnt_of_array( _resize_bilinear_nhwc_kernel_map ); + vx_param_description_t * param_def = _resize_bilinear_nhwc_kernel_param_def; + size_t param_def_size = _cnt_of_array( _resize_bilinear_nhwc_kernel_param_def ); + vx_kernel_initialize_f initializer = _resize_bilinear_nhwc_initializer; + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + in_dtype = in_dtype == I8 ? U8 : in_dtype; + out_dtype = out_dtype == I8 ? U8 : out_dtype; + + key = RESIZE_BILINEAR_NHWC_HASH_KEY( in_dtype, out_dtype, half_pixel_centers, align_corners, up_scale ); + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + + if ( i < kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = (uint32_t)param_def_size; + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "vsi_nn_kernel_header", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_RESIZE_BILINEAR_NHWC_PARAM_NUM] = {NULL}; + vsi_nn_kernel_node_t node = NULL; + int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" ); + int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" ); + vsi_bool is_same_type = vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype); + vsi_size_t depth = inputs[0]->attr.size[0]; + float scale_x = (float)outputs[0]->attr.size[1] / (float)inputs[0]->attr.size[1]; + float scale_y = (float)outputs[0]->attr.size[2] / (float)inputs[0]->attr.size[2]; + float up_scale = scale_x == scale_y ? 
scale_x : 0; + uint32_t rank = inputs[0]->attr.dim_num; + vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; + vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; + + if (!is_same_type || depth != 2 || rank < 3 || + (up_scale != 2.0f && up_scale != 3.0f && up_scale != 4.0f)) + { + return NULL; + } + + status = _query_kernel( kernel, inputs, outputs, + align_corners, half_pixel_centers, (uint32_t)up_scale); + + shapes[0][0] = depth * inputs[0]->attr.size[1]; + shapes[0][1] = inputs[0]->attr.size[2]; + shapes[0][2] = 1; + shapes[0][3] = inputs[0]->attr.size[3]; + + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], shapes[0], rank ); + + shapes[1][0] = depth * outputs[0]->attr.size[1]; + shapes[1][1] = outputs[0]->attr.size[2]; + shapes[1][2] = 1; + shapes[1][3] = outputs[0]->attr.size[3]; + + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + outputs[0], shapes[1], rank ); + + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + /* Set inputs and outputs */ + vsi_nn_kernel_node_pack_io( node_params, _RESIZE_BILINEAR_NHWC_PARAM_NUM, + reshape_tensors, input_num, &reshape_tensors[1], output_num ); + node_params[SCALAR_ALIGN_CORNERS] = vsi_nn_kernel_scalar_create( graph, I32, &align_corners ); + node_params[SCALAR_HALF_PIXEL] = vsi_nn_kernel_scalar_create( graph, I32, &half_pixel_centers ); + + /* Pass parameters to node. */ + status = vsi_nn_kernel_node_pass_param( node, node_params, _RESIZE_BILINEAR_NHWC_PARAM_NUM ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_ALIGN_CORNERS] ); + vsi_nn_kernel_scalar_release( &node_params[SCALAR_HALF_PIXEL] ); + } + } + + vsi_safe_release_tensor(reshape_tensors[0]); + vsi_safe_release_tensor(reshape_tensors[1]); + + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_EVIS( resize_bilinear_nhwc, _setup ) diff --git a/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c b/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c index e4a497a..55af6c0 100644 --- a/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c @@ -98,7 +98,9 @@ static const _kernel_map_type scatter_nd_update_map[] = TENSOR_SCATTER_ND_UPDATE_KERNELS(U8, I32, U8, F16, KERNEL_SOURCE_1) TENSOR_SCATTER_ND_UPDATE_KERNELS(I8, I32, I8, F16, KERNEL_SOURCE_1) TENSOR_SCATTER_ND_UPDATE_KERNELS(I16, I32, I16, F16, KERNEL_SOURCE_1) + TENSOR_SCATTER_ND_UPDATE_KERNELS(F16, I32, F16, U8, KERNEL_SOURCE_1) TENSOR_SCATTER_ND_UPDATE_BIG_KERNELS(F16, I32, F16, F16, KERNEL_SOURCE_2) + TENSOR_SCATTER_ND_UPDATE_BIG_KERNELS(F16, I32, F16, U8, KERNEL_SOURCE_2) }; static const _kernel_map_type scatter_nd_update_reset_map[] = @@ -766,7 +768,7 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_big_initializer) status = vsi_nn_kernel_gpu_add_param( node, "uniAccumulateSum_2x8", &uniAccumulateSum_2x8 ); - if (attr[2]->quant != VSI_NN_KERNEL_QUANT_NONE) + if (attr[3]->quant != VSI_NN_KERNEL_QUANT_NONE) { status |= vsi_nn_kernel_gpu_add_param( node, "uniU8MulAndPostShift_0_Lo_2x8", &uniU8MulAndPostShift_0_Lo_2x8 ); diff --git a/src/tim/vx/internal/src/kernel/evis/space2depth_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/space2depth_internal_evis.c index 2b79fd8..2b9d53e 100644 --- a/src/tim/vx/internal/src/kernel/evis/space2depth_internal_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/space2depth_internal_evis.c @@ -350,7 +350,7 @@ static vsi_nn_kernel_node_t _setup border.constant_value.U16 = 0; if (inputs[0]->attr.dtype.vx_type == 
VSI_NN_TYPE_UINT8) { - border.constant_value.U8 = (vx_uint8)inputs[0]->attr.dtype.zero_point; + border.constant_value.U8 = (uint8_t)vsi_nn_get_tensor_zero_point(inputs[0]); } status = vxSetNodeAttribute( (vx_node)node, VX_NODE_BORDER, &border, sizeof(border) ); CHECK_STATUS(status); @@ -363,4 +363,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_EVIS( space2depth_internal, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c index d954dc0..b266a99 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c @@ -34,6 +34,7 @@ #include "vsi_nn_error.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_math.h" +#include "vsi_nn_tensor_util.h" #include "libnnext/vsi_nn_libnnext_resource.h" #if VSI_USE_VXC_BINARY @@ -669,7 +670,6 @@ vsi_nn_kernel_node_t vsi_nn_kernel_create_node return (vsi_nn_kernel_node_t)node; } /* vsi_nn_kernel_create_node() */ - vsi_status vsi_nn_kernel_node_set_border (vsi_nn_kernel_node_t node, vx_border_t* border) @@ -709,11 +709,8 @@ vsi_nn_kernel_tensor_t vsi_nn_kernel_tensor_reshape vsi_size_t rank ) { -#ifdef VSI_40BIT_VA_SUPPORT - return (vsi_nn_kernel_tensor_t)vxReshapeTensor((vx_tensor)tensor, shape, rank); -#else - return (vsi_nn_kernel_tensor_t)vxReshapeTensor((vx_tensor)tensor, (vx_int32*)shape, (vx_uint32)rank); -#endif + return (vsi_nn_kernel_tensor_t)vsi_nn_safe_reshape_tensor((vx_tensor)tensor, + (void*)shape, (vsi_size_t)rank, sizeof(shape[0])); } /* vsi_nn_kernel_tensor_reshape() */ void vsi_nn_kernel_tensor_release @@ -925,6 +922,7 @@ vsi_nn_kernel_node_t vsi_nn_kernel_selector else { vsi_nn_kernel_pirority_t default_pirority[] = { + { VSI_NN_KERNEL_TYPE_SP, 5 }, { VSI_NN_KERNEL_TYPE_EVIS, 4 }, { VSI_NN_KERNEL_TYPE_CL, 3 }, { VSI_NN_KERNEL_TYPE_VX, 2 }, @@ -945,20 +943,28 @@ vsi_nn_kernel_node_t vsi_nn_kernel_selector { type = selector.pirority[i].kernel_type; - //Skip evis and cl when disable shader + /* Skip evis and cl when disable shader */ if ( (type == VSI_NN_KERNEL_TYPE_EVIS || type == VSI_NN_KERNEL_TYPE_CL) && _check_shader_support(graph) == FALSE) { continue; } - // Skip evis if not support + /* Skip evis if not support */ if( type == VSI_NN_KERNEL_TYPE_EVIS && graph->ctx->config.evis.ver == VSI_NN_HW_EVIS_NONE ) { continue; } + + /* Skip StreamProcesor if not support */ + if( type == VSI_NN_KERNEL_TYPE_SP + && !graph->ctx->config.support_stream_processor ) + { + continue; + } + kernel_func = backend->setup[type]; - // Skip no kernel func + /* Skip no kernel func */ if( NULL == kernel_func ) { continue; @@ -967,7 +973,7 @@ vsi_nn_kernel_node_t vsi_nn_kernel_selector kernel->unique_id = KERNEL_ID_OVXLIB_START + backend->unique_id; node = kernel_func( graph, inputs, input_num, outputs, output_num, params, kernel ); - // If node created, break the loop + /* If node created, break the loop */ if( node ) { VSILOGD("Instance %s node with kernel \"%s\" ", diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_lut.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_lut.c new file mode 100644 index 0000000..b5dfa9e --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_lut.c @@ -0,0 +1,303 @@ +/**************************************************************************** +* +* Copyright (c) 2021 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software 
without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include +#include +#include +#include +#include "vsi_nn_context.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include +#include "vsi_nn_error.h" +#include "utils/vsi_nn_dtype_util_prv.h" +#include "vsi_nn_tensor_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_lut.h" +#include "utils/vsi_nn_dtype_util.h" + +static int32_t _comparator(const void *pa, const void *pb) +{ + vsi_nn_kernel_lut_t a = *(vsi_nn_kernel_lut_t *)pa; + vsi_nn_kernel_lut_t b = *(vsi_nn_kernel_lut_t *)pb; + float diff = a.index - b.index; + + if ( diff > 0) + { + return 1; + } + else if ( diff < 0) + { + return -1; + } + + return 0; +} + +static float exp_eval(float val) +{ + return expf(val); +} + +static float log_eval(float data) +{ + return logf(data); +} + +static float elu_eval(float data, vsi_nn_kernel_lut_params *lut_param) +{ + float alpha = lut_param->params[0]; + return data >=0 ? 
data : expf(data) * alpha - alpha; +} + +static float neg_eval(float data) +{ + return data * -1.0f; +} + +static float hsigmoid_eval(float data, vsi_nn_kernel_lut_params *lut_param) +{ + float alpha = lut_param->params[0]; + float beta = lut_param->params[1]; + + data = (float)(alpha * data + beta); + data = vsi_nn_clamp(data, 0, 1); + + return data; +} + +static float soft_plus_eval(float data) +{ + return log_eval(exp_eval(data) + 1); +} + +static float mish_eval(float data) +{ + data = (float)(data * tanh(soft_plus_eval(data))); + + return data; +} + +static float erf_eval(float x) +{ + float res = 0; + float tmp = x; + float factorial = 1; /*n!*/ + float x_pow = x; + int32_t one = 1; + int32_t n = 1; + + if (x <= -3) + { + return -1; + } + else if (x >= 3) + { + return 1; + } + + while (vsi_abs(tmp) > 1e-5) + { + res += tmp; + + factorial *= n; + one *= -1; + x_pow *= x * x; + tmp = one / factorial * x_pow / ( 2 * n + 1); + + n ++; + } +#define VSI_MUL2_RSQRTPI (1.1283791670955126f) + + res *= VSI_MUL2_RSQRTPI; + + return res; +} + +static float gelu_eval(float data) +{ + data = (float)(0.5f * data * (1 + erf_eval(data / (float)sqrt(2.0f)))); + + return data; +} + +#define VSI_SQRT_2_RCP_PI 0.7978845834732056f +static float hgelu_eval(float data) +{ + float cdf = (float)(0.5f * (1.0f + tanh((VSI_SQRT_2_RCP_PI * + (data + 0.044715f * data * data * data))))); + + return data * cdf; +} + +static float relu_keras_eval(float val, vsi_nn_kernel_lut_params *lut_param) +{ + float alpha = lut_param->params[0]; + float max = lut_param->params[1]; + float threshold = lut_param->params[2]; + + val = vsi_nn_min(val, max); + val = val < threshold ? alpha * (val - threshold) : val; + return val; +} + +static float clip_eval(float val, vsi_nn_kernel_lut_params *lut_param) +{ + float min = lut_param->params[0]; + float max = lut_param->params[1]; + + return vsi_nn_clamp(val, min, max); +} + +static float square_eval(float x) +{ + return x * x; +} + +static float vsi_nn_kernel_lut_activation(float data, vsi_nn_kernel_lut_params *lut_param) +{ + float result = 0; + + switch (lut_param->act_type) + { + case VSI_NN_KERNEL_LUT_MISH: + result = mish_eval(data); + break; + case VSI_NN_KERNEL_LUT_LOG: + result = log_eval(data); + break; + break; + case VSI_NN_KERNEL_LUT_EXP: + result = exp_eval(data); + break; + break; + case VSI_NN_KERNEL_LUT_ELU: + result = elu_eval(data, lut_param); + break; + break; + case VSI_NN_KERNEL_LUT_NEG: + result = neg_eval(data); + break; + break; + case VSI_NN_KERNEL_LUT_HSIGMOID: + result = hsigmoid_eval(data, lut_param); + break; + break; + case VSI_NN_KERNEL_LUT_SOFT_PLUS: + result = soft_plus_eval(data); + break; + break; + case VSI_NN_KERNEL_LUT_ERF: + result = erf_eval(data); + break; + break; + case VSI_NN_KERNEL_LUT_GELU: + result = gelu_eval(data); + break; + break; + case VSI_NN_KERNEL_LUT_HGELU: + result = hgelu_eval(data); + break; + case VSI_NN_KERNEL_LUT_RELU_KERAS: + result = relu_keras_eval(data, lut_param); + break; + case VSI_NN_KERNEL_LUT_CLIP: + result = clip_eval(data, lut_param); + break; + case VSI_NN_KERNEL_LUT_SQUARE: + result = square_eval(data); + break; + default: + VSILOGE( "unsupported activation function:%d", lut_param->act_type ); + break; + } + + return result; +} + +vsi_status vsi_nn_kernel_lut + ( + vx_lut index_lut, + vx_lut output_lut, + vsi_nn_kernel_lut_params *param + ) +{ + vsi_status status = VSI_SUCCESS; + vsi_nn_kernel_lut_t *lut = NULL; + uint32_t i = 0; + float index[VSI_NN_KERNEL_LUT_MAX_SIZE] = {0}; + float 
value[VSI_NN_KERNEL_LUT_MAX_SIZE] = {0}; + + if (index_lut == NULL || output_lut == NULL || param == NULL) + { + return VSI_FAILURE; + } + + lut = (vsi_nn_kernel_lut_t *)calloc(VSI_NN_KERNEL_LUT_MAX_SIZE, sizeof(vsi_nn_kernel_lut_t)); + CHECK_PTR_FAIL_GOTO( lut, "Create LUT buffer fail.", final ); + + for ( i = 0; i < VSI_NN_KERNEL_LUT_MAX_SIZE; i++) + { + int16_t val = (int16_t)(i << 6); + lut[i].index = fp16_to_fp32(val); + lut[i].val = vsi_nn_kernel_lut_activation(lut[i].index, param); + } + + for (i = 0x0; i < 0x10; i++) + { + lut[i].index = 0; + lut[i].val = vsi_nn_kernel_lut_activation(lut[i].index, param); + } + + for (i = 0x1F0; i < 0x200; i++) + { + lut[i].index = VSI_NN_KERNEL_LUT_FP16_MAX; + lut[i].val = vsi_nn_kernel_lut_activation(lut[i].index, param); + } + + for (i = 0x3F0; i < 0x400; i++) + { + lut[i].index = VSI_NN_KERNEL_LUT_FP16_MIN; + lut[i].val = vsi_nn_kernel_lut_activation(lut[i].index, param); + } + + qsort(lut, VSI_NN_KERNEL_LUT_MAX_SIZE, sizeof(vsi_nn_kernel_lut_t), _comparator); + + for ( i = 0; i < VSI_NN_KERNEL_LUT_MAX_SIZE; i++) + { + index[i] = lut[i].index; + value[i] = lut[i].val; + } + + status = vxCopyLUT(index_lut, (void*)&index, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST); + status |= vxCopyLUT(output_lut, (void*)&value, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST); +final: + vsi_nn_safe_free(lut); + + return status; +} diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c index 2447239..e3f454a 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c @@ -62,10 +62,11 @@ KERNEL_SELECTOR( depthwise_conv1d ) vsi_size_t real_kernel = 0; int32_t stride = vsi_nn_kernel_param_get_int32( params, "stride" ); vsi_nn_kernel_pirority_t pirority[] = { - { VSI_NN_KERNEL_TYPE_VX, 0 }, - { VSI_NN_KERNEL_TYPE_EVIS, 3 }, - { VSI_NN_KERNEL_TYPE_CL, 2 }, - { VSI_NN_KERNEL_TYPE_CPU, 1 }, + { VSI_NN_KERNEL_TYPE_VX, 1 }, + { VSI_NN_KERNEL_TYPE_SP, 0 }, + { VSI_NN_KERNEL_TYPE_EVIS, 4 }, + { VSI_NN_KERNEL_TYPE_CL, 3 }, + { VSI_NN_KERNEL_TYPE_CPU, 2 }, }; dilation = dilation == 0 ? 
0 : dilation - 1; real_kernel = (kernel - 1) * dilation + kernel; @@ -94,6 +95,7 @@ static vsi_status _select ) { vsi_nn_kernel_pirority_t pirority[] = { + { VSI_NN_KERNEL_TYPE_SP, 4 }, { VSI_NN_KERNEL_TYPE_VX, 3 }, { VSI_NN_KERNEL_TYPE_EVIS, 2 }, { VSI_NN_KERNEL_TYPE_CL, 1 }, diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c index 15de948..a7cc925 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c @@ -37,81 +37,157 @@ typedef enum MEMORY_ACCESSOR_WRITE_ONLY = 1, } mem_accessor_e; -vsi_status _copy_tensor +vsi_status vsi_nn_kernel_copy_tensor_veiw_patch ( - vsi_nn_kernel_tensor_t tensor, + vx_tensor tensor, const vsi_nn_kernel_tensor_attr_t * attr, - mem_accessor_e accessor, - void * buffer, - size_t buffer_size + void *user_ptr, + vsi_size_t *start, + vsi_size_t *end, + vsi_size_t *stride, + vsi_enum usage, + vsi_enum user_memory_type ) { +#define USE_OPENVX_1_2 + size_t dim,i; + size_t vstart[VSI_NN_MAX_DIM_NUM],vend[VSI_NN_MAX_DIM_NUM],vstride[VSI_NN_MAX_DIM_NUM]; vsi_status status = VSI_FAILURE; - vsi_nn_kernel_tensor_attr_t * internal_attr = NULL; - size_t rank; - size_t start[VSI_NN_MAX_DIM_NUM] = { 0 }; - size_t end[VSI_NN_MAX_DIM_NUM] = { 0 }; - size_t stride[VSI_NN_MAX_DIM_NUM] = { 0 }; - vsi_size_t stride2[VSI_NN_MAX_DIM_NUM] = { 0 }; - size_t type_bytes; - size_t total_bytes; - uint32_t i; - - if( !tensor || !buffer || !buffer_size ) + if (NULL == tensor || NULL == user_ptr || NULL == start || NULL == end || NULL == stride) { VSILOGE("Invalid parameter"); return status; } - if( !attr ) + dim = (size_t)attr->shape->size; + for (i = 0; i < dim; i++) { - internal_attr = vsi_nn_kernel_tensor_attr_create( tensor ); - CHECK_PTR_FAIL_GOTO( attr, "Create tensor attr fail.", final ); - attr = internal_attr; + vstart[i] = (size_t)start[i]; + vend[i] = (size_t)end[i]; + vstride[i] = (size_t)stride[i]; } - total_bytes = vsi_nn_kernel_tensor_attr_get_bytes( attr ); - if( total_bytes != (vsi_size_t)buffer_size ) +#ifdef USE_OPENVX_1_2 + +#ifdef VX_TENSOR_STRIDE_X_BITS_SUPPORT { - VSILOGE("Read buffer size mismatch %"VSI_SIZE_T_SPECIFIER" vs %"VSI_SIZE_T_SPECIFIER"", - total_bytes, (vsi_size_t)buffer_size); - goto final; + vx_trensor_addressing addr = NULL; + vx_size dim_sizes[VSI_NN_MAX_DIM_NUM], strides[VSI_NN_MAX_DIM_NUM]; + addr = (vx_trensor_addressing)malloc(sizeof(vx_tensorpatch_addressing_t)); + addr->num_of_dims = (vx_uint32)attr->shape->size; + + for (i = 0; i < dim; i++) + { + strides[i] = (vx_size)vstride[i]; + dim_sizes[i] = (vx_size)attr->shape->data[i]; + } + addr->strides = strides; + addr->dim_sizes = dim_sizes; + if ( attr->dtype == I4 || attr->dtype == U4 ) + { + addr->strides[0] = 0; + addr->stride_x_bits = 4; + } + status = vxCopyTensorPatch2(tensor, dim, vstart, vend, addr,sizeof(vx_tensorpatch_addressing_t), + user_ptr, usage, user_memory_type); + if(addr) + { + free(addr); + addr = NULL; + } + } +#else + status = vxCopyTensorPatch(tensor, dim, vstart, vend, vstride, user_ptr, usage, user_memory_type); +#endif +#else + { + vx_context context = NULL; + vx_tensor_addressing addr = NULL; + size_t stride_size[VSI_NN_MAX_DIM_NUM]; + vsi_nn_tensor_attr_t t; + + memset(vstart, 0, sizeof(size_t) * VSI_NN_MAX_DIM_NUM); + memset(vend, 0, sizeof(size_t) * VSI_NN_MAX_DIM_NUM); + memset(vstride, 0, sizeof(size_t) * VSI_NN_MAX_DIM_NUM); + status = vsi_nn_vxGetTensorAttr(tensor, &t); + vsi_nn_kernel_tensor_attr_get_stride( attr, stride_size ); + 
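The stride array filled by vsi_nn_kernel_tensor_attr_get_stride here is in elements; the new vsi_nn_kernel_copy_tensor_patch further below multiplies it by the element size before handing it to the view-patch helper, except for the packed I4/U4 types, where the OpenVX 1.2 path advertises stride_x_bits = 4 instead. A minimal sketch of that dense, row-major stride derivation; the names shape, rank and elem_bytes are illustrative only, not part of this patch:

/* Illustrative only: element strides of a dense tensor, scaled to bytes.
 * This mirrors what vsi_nn_kernel_copy_tensor_patch does for dtypes wider
 * than 4 bits; it is not the library implementation. */
static void dense_byte_strides(const size_t *shape, size_t rank,
                               size_t elem_bytes, size_t *stride_bytes)
{
    size_t i;
    size_t elems = 1;
    for (i = 0; i < rank; i++)
    {
        stride_bytes[i] = elems * elem_bytes; /* bytes to step one unit in dim i */
        elems *= shape[i];
    }
}
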
context = vxGetContext((vx_reference)tensor); + if( NULL == context ) + { + VSILOGE("Call vxGetContext fail"); + return status; + } + addr = vxCreateTensorAddressing( context, attr->shape->data, + (vx_uint32*)stride_size, attr->shape->size ); + if( NULL == addr ) + { + VSILOGE("Call vxCreateTensorAddressing fail"); + return status; + } + status = vxCopyTensorPatch_11( tensor, + NULL, + addr, + user_ptr, + usage, + user_memory_type + ); + vxReleaseTensorAddressing( &addr ); + if( VSI_SUCCESS != status ) + { + VSILOGE("Call vxCopyTensorPatch_11 fail"); + return status; + } + } +#endif + return status; +} /* vsi_nn_kernel_copy_tensor_veiw_patch() */ + +vsi_status vsi_nn_kernel_copy_tensor_patch + ( + vsi_nn_kernel_tensor_t tensor, + const vsi_nn_kernel_tensor_attr_t * attr, + mem_accessor_e accessor, + void * user_ptr, + size_t buffer_size + ) +{ + vsi_size_t start[VSI_NN_MAX_DIM_NUM],end[VSI_NN_MAX_DIM_NUM],stride[VSI_NN_MAX_DIM_NUM]; + vsi_status status = VSI_FAILURE; + uint32_t i; + if (NULL == tensor || NULL == user_ptr) + { + VSILOGE("Invalid parameter"); + return status; } - vsi_nn_shape_get_stride( attr->shape->data, (vsi_size_t)attr->shape->size, stride2 ); - for( i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + vsi_nn_kernel_tensor_attr_get_stride( attr, stride ); + memset(start, 0, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM); + for (i = 0; i < VSI_NN_MAX_DIM_NUM; i++) { - stride[i] = stride2[i]; - } - type_bytes = vsi_nn_kernel_dtype_get_bytes( attr->dtype ); - rank = attr->shape->size; - for( i = 0; i < rank; i++ ) - { - start[i] = 0; - end[i] = attr->shape->data[i]; - stride[i] = stride[i] * type_bytes; + end[i] = attr->shape->data[i]; + if ( attr->dtype != I4 && attr->dtype != U4 ) + { + size_t type_bytes = vsi_nn_kernel_dtype_get_bytes( attr->dtype ); + stride[i] = stride[i] * (vsi_size_t)type_bytes; + } } + switch( accessor ) { case MEMORY_ACCESSOR_READ_ONLY: - status = vxCopyTensorPatch( (vx_tensor)tensor, rank, - start, end, stride, buffer, VX_READ_ONLY, 0); + status = vsi_nn_kernel_copy_tensor_veiw_patch( (vx_tensor)tensor, attr, + user_ptr, start, end, stride, VX_READ_ONLY, 0); break; case MEMORY_ACCESSOR_WRITE_ONLY: - status = vxCopyTensorPatch( (vx_tensor)tensor, rank, - start, end, stride, buffer, VX_WRITE_ONLY, 0); + status = vsi_nn_kernel_copy_tensor_veiw_patch( (vx_tensor)tensor, attr, + user_ptr, start, end, stride, VX_WRITE_ONLY, 0); break; default: VSI_ASSERT( FALSE ); break; } -final: - if( internal_attr ) - { - vsi_nn_kernel_tensor_attr_release( &internal_attr ); - } return status; -} /* _copy_tensor() */ +} /* vsi_nn_kernel_copy_tensor_patch() */ void * vsi_nn_kernel_tensor_create_buffer ( @@ -123,49 +199,76 @@ void * vsi_nn_kernel_tensor_create_buffer vsi_status status = VSI_FAILURE; void * buffer = NULL; void * out_buffer = NULL; + void * tensor_buffer = NULL; + void * new_data = NULL; size_t bytes; size_t float_bytes; size_t tensor_size = 0; vsi_nn_kernel_tensor_attr_t * internal_attr = NULL; - if( !tensor ) + if ( !tensor ) { return NULL; } - if( !attr ) + if ( !attr ) { internal_attr = vsi_nn_kernel_tensor_attr_create( tensor ); CHECK_PTR_FAIL_GOTO( internal_attr, "Create tensor attr fail.", final ); attr = internal_attr; } bytes = vsi_nn_kernel_tensor_attr_get_bytes( attr ); - out_buffer = malloc( bytes ); - CHECK_PTR_FAIL_GOTO( out_buffer, "Out of memory, create buffer fail.", final ); + tensor_buffer = malloc( bytes ); + CHECK_PTR_FAIL_GOTO( tensor_buffer, "Out of memory, create buffer fail.", final ); - status = vsi_nn_kernel_tensor_read( tensor, attr, out_buffer, 
bytes ); - if( status != VSI_SUCCESS ) + status = vsi_nn_kernel_tensor_read( tensor, attr, tensor_buffer, bytes ); + if ( status != VSI_SUCCESS ) { VSILOGE("Read tensor fail with error \"%s\".", vsi_nn_DescribeStatus(status)); - free( out_buffer ); - out_buffer = NULL; + vsi_nn_safe_free( tensor_buffer ); goto final; } - if( convert_to_float && F32 != attr->dtype ) + if ( attr->dtype == I4 || attr->dtype == U4 ) + { + vsi_size_t dest_size = vsi_nn_kernel_tensor_attr_get_size( attr ); + new_data = (uint8_t*)malloc(dest_size); + if ( !new_data ) + { + VSILOGE("Out of memory, create buffer fail"); + vsi_nn_safe_free( tensor_buffer ); + goto final; + } + CHECK_PTR_FAIL_GOTO( new_data, "Out of memory, create buffer fail.", final ); + status = vsi_nn_kernel_unpack_4bit_data(attr, (uint8_t *)tensor_buffer, (uint8_t *)new_data, attr->dtype); + if ( status != VSI_SUCCESS ) + { + VSILOGE("Read tensor fail with error \"%s\".", vsi_nn_DescribeStatus(status)); + vsi_nn_safe_free( tensor_buffer ); + vsi_nn_safe_free( new_data ); + goto final; + } + vsi_nn_safe_free( tensor_buffer ); + out_buffer = new_data; + } + else + { + out_buffer = tensor_buffer; + } + + if ( convert_to_float && F32 != attr->dtype ) { buffer = out_buffer; tensor_size = vsi_nn_kernel_tensor_attr_get_size( attr ); float_bytes = tensor_size * sizeof(float); out_buffer = malloc( float_bytes ); - if( !out_buffer ) + if ( !out_buffer ) { VSILOGE("Out of memory, create float buffer fail."); - free( buffer ); - buffer = NULL; + vsi_nn_safe_free( buffer ); goto final; } - if( vsi_nn_kernel_tensor_attr_is_quantized( attr ) ) + if ( vsi_nn_kernel_tensor_attr_is_quantized( attr ) ) { switch( attr->quant ) { @@ -202,14 +305,15 @@ void * vsi_nn_kernel_tensor_create_buffer vsi_nn_dtype_convert_dtype_to_float( buffer, tensor_size, attr->dtype, (float*)out_buffer ); } - free( buffer ); + vsi_nn_safe_free( buffer ); } final: - if( internal_attr ) + if ( internal_attr ) { vsi_nn_kernel_tensor_attr_release( &internal_attr ); } + return out_buffer; } /* vsi_nn_kernel_tensor_create_buffer() */ @@ -221,7 +325,7 @@ vsi_status vsi_nn_kernel_tensor_read size_t out_buffer_size ) { - return _copy_tensor( tensor, attr, MEMORY_ACCESSOR_READ_ONLY, + return vsi_nn_kernel_copy_tensor_patch( tensor, attr, MEMORY_ACCESSOR_READ_ONLY, out_buffer, out_buffer_size ); } /* vsi_nn_kernel_tensor_read() */ @@ -235,7 +339,7 @@ vsi_status vsi_nn_kernel_tensor_write { // NOTE: openvx api vxCopyTensorPatch access non-const buffer pointer, // so here we convert const to non-const ptr. 
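The I4/U4 branch above widens packed nibble data into one byte per element via vsi_nn_kernel_unpack_4bit_data before any float conversion, and vsi_nn_kernel_pack_4bit_data reverses this on the write path. A rough sketch of what such an unpack amounts to; the low-nibble-first ordering and the names are assumptions for illustration, not the library's definition:

#include <stdint.h>
#include <stddef.h>

/* Sketch only: expand two INT4 values per byte into int8_t, sign-extending
 * each nibble. Nibble order (low nibble = even element) is an assumption. */
static void unpack_int4_sketch(const uint8_t *packed, int8_t *out, size_t count)
{
    size_t i;
    for (i = 0; i < count; i++)
    {
        uint8_t nib = (i & 1) ? (uint8_t)(packed[i / 2] >> 4)
                              : (uint8_t)(packed[i / 2] & 0x0F);
        out[i] = (int8_t)((int8_t)(nib << 4) >> 4); /* sign-extend 4 -> 8 bits */
    }
}
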
- return _copy_tensor( tensor, attr, MEMORY_ACCESSOR_WRITE_ONLY, + return vsi_nn_kernel_copy_tensor_patch( tensor, attr, MEMORY_ACCESSOR_WRITE_ONLY, (void*)buffer, size ); } /* vsi_nn_kernel_tensor_write() */ @@ -252,8 +356,9 @@ vsi_status vsi_nn_kernel_tensor_write_from_float size_t bytes; const void * buffer = NULL; void * internal_buffer = NULL; + void * internal_buffer0 = NULL; size_t tensor_size = 0; - if( !attr ) + if ( !attr ) { internal_attr = vsi_nn_kernel_tensor_attr_create( tensor ); CHECK_PTR_FAIL_GOTO( attr, "Create tensor attr fail.", final ); @@ -261,30 +366,41 @@ vsi_status vsi_nn_kernel_tensor_write_from_float } bytes = vsi_nn_kernel_tensor_attr_get_bytes( attr ); tensor_size = vsi_nn_kernel_tensor_attr_get_size( attr ); - if( tensor_size != size ) + if ( tensor_size != size ) { VSILOGE("Tensor and buffer size mismatch %d vs %d", tensor_size, size); goto final; } + if ( attr->dtype == I4 || attr->dtype == U4 ) + { + vsi_size_t sz = 0; + sz = vsi_nn_kernel_tensor_attr_get_size( attr ); + internal_buffer0 = malloc( sz ); + } + else + { + internal_buffer0 = malloc( bytes ); + internal_buffer = internal_buffer0; + } + if( attr->dtype != F32 ) { - internal_buffer = malloc( bytes ); - CHECK_PTR_FAIL_GOTO( internal_buffer, "Create buffer fail.", final ); - if( vsi_nn_kernel_tensor_attr_is_quantized( attr ) ) + CHECK_PTR_FAIL_GOTO( internal_buffer0, "Create buffer fail.", final ); + if ( vsi_nn_kernel_tensor_attr_is_quantized( attr ) ) { switch( attr->quant ) { case VSI_NN_KERNEL_QUANT_DFP: vsi_nn_dtype_convert_float_to_quantize_dfp( float_buffer, size, attr->dtype, - attr->dfp.fl, internal_buffer ); + attr->dfp.fl, internal_buffer0 ); break; case VSI_NN_KERNEL_QUANT_ASYMM: vsi_nn_dtype_convert_float_to_quantize_asymm( float_buffer, size, attr->dtype, attr->asymm.scale, attr->asymm.zero_point, - internal_buffer ); + internal_buffer0 ); break; case VSI_NN_KERNEL_QUANT_SYMM_PERCHANNEL: vsi_nn_dtype_convert_float_to_quantize_symm_perchannel( @@ -295,13 +411,19 @@ vsi_status vsi_nn_kernel_tensor_write_from_float attr->asymm_v.zero_point->data, attr->asymm_v.zero_point->size, attr->asymm_v.channel_dim, - internal_buffer ); + internal_buffer0 ); break; default: VSILOGE("Donot support quantize type %d", attr->quant); VSI_ASSERT( FALSE ); break; } + + if ( attr->dtype == I4 || attr->dtype == U4 ) + { + internal_buffer = malloc( bytes ); + status = vsi_nn_kernel_pack_4bit_data(attr, (uint8_t*)internal_buffer0, (uint8_t*)internal_buffer); + } } else { @@ -316,14 +438,16 @@ vsi_status vsi_nn_kernel_tensor_write_from_float } status = vsi_nn_kernel_tensor_write( tensor, attr, buffer, bytes ); final: - if( internal_attr ) + if ( internal_attr ) { vsi_nn_kernel_tensor_attr_release( &internal_attr ); } - if( internal_buffer ) + if ( attr->dtype == I4 || attr->dtype == U4 ) { - free( internal_buffer ); + vsi_nn_safe_free(internal_buffer0); } + vsi_nn_safe_free(internal_buffer); + return status; } /* vsi_nn_kernel_tensor_write_from_float() */ @@ -381,6 +505,9 @@ vsi_status vsi_nn_kernel_scalar_get_dtype return status; \ } +DEF_KERNEL_SCALAR_FUNC( vsi_nn_kernel_scalar_read_int4, + vsi_nn_kernel_scalar_write_int4, + int8_t, I4 ) DEF_KERNEL_SCALAR_FUNC( vsi_nn_kernel_scalar_read_int8, vsi_nn_kernel_scalar_write_int8, int8_t, I8 ) @@ -413,7 +540,6 @@ static void _convert_tensor_attr_to_vx_tensor_param memset( p, 0, sizeof( vx_tensor_create_params_t ) ); p->num_of_dims = (uint32_t)attr->shape->size; - p->sizes = attr->shape->data; #define MAP_TYPE( var, src_type, dst_type ) \ case src_type: \ var = 
dst_type; \ @@ -421,6 +547,8 @@ static void _convert_tensor_attr_to_vx_tensor_param switch( attr->dtype ) { + MAP_TYPE( p->data_format, U4, VSI_NN_TYPE_UINT4 ); + MAP_TYPE( p->data_format, I4, VSI_NN_TYPE_INT4 ); MAP_TYPE( p->data_format, I8, VSI_NN_TYPE_INT8 ); MAP_TYPE( p->data_format, I16, VSI_NN_TYPE_INT16 ); MAP_TYPE( p->data_format, I32, VSI_NN_TYPE_INT32 ); @@ -479,8 +607,27 @@ vsi_nn_kernel_tensor_t vsi_nn_kernel_tensor_create { vsi_nn_kernel_tensor_t tensor = NULL; vx_tensor_create_params_t params; + vx_size size_vxsize[VSI_NN_MAX_DIM_NUM] = {0}; + vx_uint32 size_u32[VSI_NN_MAX_DIM_NUM] = {0}; + size_t i = 0; _convert_tensor_attr_to_vx_tensor_param( ¶ms, attr ); + //convert attr->shape->data to correct data type + for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + size_vxsize[i] = -1 == attr->shape->data[i] ? -1 : (vx_size)attr->shape->data[i]; + } + for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + size_u32[i] = -1 == attr->shape->data[i] ? -1 : (vx_uint32)attr->shape->data[i]; + } +#ifdef VSI_40BIT_VA_SUPPORT + params.sizes = size_vxsize; + (void)size_u32; +#else + params.sizes = size_u32; + (void)size_vxsize; +#endif if( is_virtual ) { tensor = (vsi_nn_kernel_tensor_t)vxCreateVirtualTensor2( diff --git a/src/tim/vx/internal/src/kernel/vx/clip_vx.c b/src/tim/vx/internal/src/kernel/vx/clip_vx.c deleted file mode 100644 index 3c4ab45..0000000 --- a/src/tim/vx/internal/src/kernel/vx/clip_vx.c +++ /dev/null @@ -1,196 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. 
-* -*****************************************************************************/ - -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_node.h" -#include "vsi_nn_log.h" -#include "vsi_nn_prv.h" -#include -#include "utils/vsi_nn_dtype_util_prv.h" -#include "vsi_nn_tensor_util.h" -#include "kernel/vsi_nn_kernel.h" - -typedef struct _sort_lut_s -{ - float index; - float val; -} sort_lut; - -static float clip_eval(float val, float min, float max) -{ - return vsi_nn_clamp(val, min, max); -} - -#ifdef VX_USER_LOOKUP_TABLE_SUPPORT -static int32_t _lut_comparator(const void *pa, const void *pb) -{ - sort_lut a = *(sort_lut *)pa; - sort_lut b = *(sort_lut *)pb; - float diff = a.index - b.index; - if ( diff > 0 ) - { - return 1; - } - else if ( diff < 0 ) - { - return -1; - } - - return 0; -} - -static void _set_table_lookup(float func(float, float, float), float *index, float *value, float min, float max) -{ -#define VSI_NN_MAX_LUT_SIZE (1024) -#define FLT16_MAX (57344) -#define FLT16_MIN (-57344) - uint32_t i = 0; - sort_lut *lut = (sort_lut *)calloc(VSI_NN_MAX_LUT_SIZE, sizeof(sort_lut)); - - for ( i = 0; i < VSI_NN_MAX_LUT_SIZE; i++) - { - int16_t val = (int16_t)(i << 6); - lut[i].index = fp16_to_fp32(val); - lut[i].val = func(lut[i].index, min, max); - } - - for (i = 0x0; i < 0x10; i++) - { - lut[i].index = 0; - lut[i].val = func(lut[i].index, min, max); - } - - for (i = 0x1F0; i < 0x200; i++) - { - lut[i].index = FLT16_MAX; - lut[i].val = func(lut[i].index, min, max); - } - - for (i = 0x3F0; i < 0x400; i++) - { - lut[i].index = FLT16_MIN; - lut[i].val = func(lut[i].index, min, max); - } - - qsort(lut, VSI_NN_MAX_LUT_SIZE, sizeof(sort_lut), _lut_comparator); - - for ( i = 0; i < VSI_NN_MAX_LUT_SIZE; i++) - { - index[i] = lut[i].index; - value[i] = lut[i].val; - } - - vsi_nn_safe_free(lut); - -#undef VSI_NN_MAX_LUT_SIZE -#undef FLT16_MIN -#undef FLT16_MAX -} -#endif - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel, - float func(float, float, float) - ) -{ -#ifdef VX_USER_LOOKUP_TABLE_SUPPORT - vx_lut lut1 = NULL; - vx_lut lut2 = NULL; - vx_node node = NULL; - float min = vsi_nn_kernel_param_get_float32( params, "min_value" ); - float max = vsi_nn_kernel_param_get_float32( params, "max_value" ); - float index[1024] = {0}; - float value[1024] = {0}; - - if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 || - outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 ) - { - return NULL; - } - - _set_table_lookup(func, index, value, min, max); - - lut1 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, 1024); - lut2 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, 1024); - if( NULL == lut1 || NULL == lut2 ) - { - VSILOGE("create lut object fail."); - goto OnError; - } - - vxCopyLUT(lut1, (void*)&index, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST); - vxCopyLUT(lut2, (void*)&value, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST); - - node = vxTensorTableLookupLayer( graph->g, inputs[0]->t, lut1, lut2, outputs[0]->t); - if( NULL == node ) - { - VSILOGE("Call vxTensorTableLookupLayer fail."); - goto OnError; - } - -OnError: - if (lut1) - { - vxReleaseLUT(&lut1); - lut1 = NULL; - } - if (lut2) - { - vxReleaseLUT(&lut2); - lut2 = NULL; - } - return (vsi_nn_kernel_node_t)node; -#else - return NULL; -#endif -} /* _setup() */ - -#define REGISTER_CLIP_OPENVX_KERNEL(KERNEL_NAME, UNARY_FUNC) \ - static vsi_nn_kernel_node_t 
_##KERNEL_NAME##_setup \ - ( \ - vsi_nn_graph_t * graph, \ - vsi_nn_tensor_t ** inputs, \ - size_t input_num, \ - vsi_nn_tensor_t ** outputs, \ - size_t output_num, \ - const vsi_nn_kernel_param_t * params, \ - vsi_nn_kernel_t * kernel \ - ) \ - { \ - return _setup(graph, inputs, input_num, outputs, output_num, \ - params, kernel, UNARY_FUNC); \ - } \ - REGISTER_BACKEND_OPENVX( KERNEL_NAME, _##KERNEL_NAME##_setup ) - -REGISTER_CLIP_OPENVX_KERNEL( clip, clip_eval ) - -#undef REGISTER_CLIP_OPENVX_KERNEL diff --git a/src/tim/vx/internal/src/kernel/vx/convolutional.c b/src/tim/vx/internal/src/kernel/vx/convolutional.c index 2bb2248..8cc0794 100644 --- a/src/tim/vx/internal/src/kernel/vx/convolutional.c +++ b/src/tim/vx/internal/src/kernel/vx/convolutional.c @@ -120,6 +120,67 @@ static vsi_bool _build_vx_deconv2d_param return TRUE; } /* _build_vx_deconv2d_param() */ +#if VX_CONV_3D_API_SUPPORT +static vsi_bool _build_vx_conv3d_param + ( + vx_nn_convolution_3d_params_t * param, + int32_t stride_d, int32_t stride_h, int32_t stride_w, + int32_t pad_d_front, int32_t pad_d_end, + int32_t pad_h_front, int32_t pad_h_end, + int32_t pad_w_front, int32_t pad_w_end, + int32_t dilation_d, int32_t dilation_h, int32_t dilation_w, + int32_t multiplier, + vsi_enum overflow_policy, vsi_enum rounding_policy, + vsi_enum down_scale_size_rounding + ) +{ + VSI_ASSERT( stride_d > 0 ); + VSI_ASSERT( stride_h > 0 ); + VSI_ASSERT( stride_w > 0 ); + VSI_ASSERT( pad_d_front >= 0 ); + VSI_ASSERT( pad_d_end >= 0 ); + VSI_ASSERT( pad_h_front >= 0 ); + VSI_ASSERT( pad_h_end >= 0 ); + VSI_ASSERT( pad_w_front >= 0 ); + VSI_ASSERT( pad_w_end >= 0 ); + VSI_ASSERT( dilation_d >= 0 ); + VSI_ASSERT( dilation_h >= 0 ); + VSI_ASSERT( dilation_w >= 0 ); + VSI_ASSERT( multiplier >= 0 ); + + param->padding_d_front = (uint32_t)pad_d_front; + param->padding_d_rear = (uint32_t)pad_d_end; + param->padding_h_top = (uint32_t)pad_h_front; + param->padding_h_bottom = (uint32_t)pad_h_end; + param->padding_w_left = (uint32_t)pad_w_front; + param->padding_w_right = (uint32_t)pad_w_end; + + if( dilation_d > 0 ) + { + param->dilation_d = (uint32_t)(dilation_d - 1); + } + if( dilation_h > 0 ) + { + param->dilation_h = (uint32_t)(dilation_h - 1); + } + if( dilation_w > 0 ) + { + param->dilation_w = (uint32_t)(dilation_w - 1); + } + + param->overflow_policy = (vx_enum)overflow_policy; + param->rounding_policy = (vx_enum)rounding_policy; + param->down_scale_size_rounding = (vx_enum)down_scale_size_rounding; + param->depth_multiplier = multiplier; + + param->stride_w = (uint32_t)stride_w; + param->stride_h = (uint32_t)stride_h; + param->stride_d = (uint32_t)stride_d; + + return TRUE; +} /* _build_vx_conv2d_param() */ +#endif + static vx_tensor _expand_tensor_dim ( vx_tensor tensor, vsi_ssize_t * shape, size_t rank, vsi_ssize_t expand_dim ) { @@ -149,12 +210,7 @@ static vx_tensor _expand_tensor_dim { new_shape[cnt] = 1; } -#ifdef VSI_40BIT_VA_SUPPORT - return vxReshapeTensor( tensor, (vsi_size_t*)new_shape, rank + 1 ); -#else - return vxReshapeTensor( tensor, (int32_t*)new_shape, (uint32_t)(rank + 1) ); -#endif - + return vsi_nn_safe_reshape_tensor( tensor, (void*)new_shape, (vsi_size_t)(rank + 1) , sizeof(new_shape[0])); } /* _expand_tensor_dim() */ @@ -181,7 +237,6 @@ static vx_tensor _expand_tensor_dim vsi_nn_kernel_t * kernel \ ) - REGISTER_CONV_OPENVX_KERNEL( conv1d ) { vx_node node = NULL; @@ -191,11 +246,11 @@ REGISTER_CONV_OPENVX_KERNEL( conv1d ) _build_vx_conv2d_param( &vxparam, - vsi_nn_kernel_param_get_int32(params, "stride"), 1, + 1, 
vsi_nn_kernel_param_get_int32(params, "stride"), + 0, 0, vsi_nn_kernel_param_get_int32(params, "pad_front"), vsi_nn_kernel_param_get_int32(params, "pad_end"), - 0,0, - vsi_nn_kernel_param_get_int32(params, "dilation"), 1, + 1, vsi_nn_kernel_param_get_int32(params, "dilation"), 0, vsi_nn_kernel_param_get_int32(params, "overflow_policy"), vsi_nn_kernel_param_get_int32(params, "rounding_policy"), @@ -203,12 +258,12 @@ REGISTER_CONV_OPENVX_KERNEL( conv1d ) ); temp_tensors[0] = _expand_tensor_dim( inputs[0]->t, - (vsi_ssize_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num, 0 ); + (vsi_ssize_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num, 1 ); CHECK_PTR_FAIL_GOTO( temp_tensors[0], "Expand input dim fail.", final ); if (inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC) { temp_tensors[1] = _expand_tensor_dim( inputs[1]->t, - (vsi_ssize_t*)inputs[1]->attr.size, inputs[1]->attr.dim_num, 0 ); + (vsi_ssize_t*)inputs[1]->attr.size, inputs[1]->attr.dim_num, 1 ); CHECK_PTR_FAIL_GOTO( temp_tensors[1], "Expand kernel dim fail.", final ); } else @@ -222,8 +277,9 @@ REGISTER_CONV_OPENVX_KERNEL( conv1d ) memcpy(&attr, &inputs[1]->attr, sizeof(vsi_nn_tensor_attr_t)); - attr.size[0] = 1; - for (i = 1; i <= inputs[1]->attr.dim_num; i++) + attr.size[0] = inputs[1]->attr.size[0]; + attr.size[1] = 1; + for (i = 2; i <= inputs[1]->attr.dim_num; i++) { attr.size[i] = inputs[1]->attr.size[i - 1]; } @@ -235,7 +291,7 @@ REGISTER_CONV_OPENVX_KERNEL( conv1d ) } temp_tensors[2] = _expand_tensor_dim( outputs[0]->t, - (vsi_ssize_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num, 0 ); + (vsi_ssize_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num, 1 ); CHECK_PTR_FAIL_GOTO( temp_tensors[2], "Expand output dim fail.", final ); node = vxConvolutionLayer( graph->g, @@ -266,11 +322,11 @@ REGISTER_CONV_OPENVX_KERNEL( depthwise_conv1d ) _build_vx_conv2d_param( &vxparam, - vsi_nn_kernel_param_get_int32(params, "stride"), 1, + 1, vsi_nn_kernel_param_get_int32(params, "stride"), + 0, 0, vsi_nn_kernel_param_get_int32(params, "pad_front"), vsi_nn_kernel_param_get_int32(params, "pad_end"), - 0,0, - vsi_nn_kernel_param_get_int32(params, "dilation"), 1, + 1, vsi_nn_kernel_param_get_int32(params, "dilation"), vsi_nn_kernel_param_get_int32(params, "multiplier"), vsi_nn_kernel_param_get_int32(params, "overflow_policy"), vsi_nn_kernel_param_get_int32(params, "rounding_policy"), @@ -278,26 +334,23 @@ REGISTER_CONV_OPENVX_KERNEL( depthwise_conv1d ) ); temp_tensors[0] = _expand_tensor_dim( inputs[0]->t, - (vsi_ssize_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num, 0 ); + (vsi_ssize_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num, 1 ); CHECK_PTR_FAIL_GOTO( temp_tensors[0], "Expand input dim fail.", final ); if (inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC) { vsi_size_t new_w_shape[VSI_NN_MAX_DIM_NUM] = { 0 }; uint32_t new_w_rank = 4; - new_w_shape[0] = 1; - new_w_shape[1] = inputs[1]->attr.size[0]; + new_w_shape[0] = inputs[1]->attr.size[0]; + new_w_shape[1] = 1; new_w_shape[2] = 1; for (i = 1; i < (int32_t)(inputs[1]->attr.dim_num); i++) { new_w_shape[2] *= inputs[1]->attr.size[i]; } new_w_shape[3] = 1; -#ifdef VSI_40BIT_VA_SUPPORT - temp_tensors[1] = vxReshapeTensor( inputs[1]->t, new_w_shape, new_w_rank ); -#else - temp_tensors[1] = vxReshapeTensor( inputs[1]->t, (vx_int32*)new_w_shape, (vx_uint32)new_w_rank ); -#endif + temp_tensors[1] = vsi_nn_safe_reshape_tensor( inputs[1]->t, + (void*)new_w_shape, (vsi_size_t)new_w_rank, sizeof(new_w_shape[0]) ); CHECK_PTR_FAIL_GOTO( 
temp_tensors[1], "Expand kernel dim fail.", final ); } @@ -312,8 +365,8 @@ REGISTER_CONV_OPENVX_KERNEL( depthwise_conv1d ) memcpy(&attr, &inputs[1]->attr, sizeof(vsi_nn_tensor_attr_t)); - attr.size[0] = 1; - attr.size[1] = inputs[1]->attr.size[0]; + attr.size[0] = inputs[1]->attr.size[0]; + attr.size[1] = 1; attr.size[2] = 1; for (i = 1; i < inputs[1]->attr.dim_num; i++) { @@ -329,7 +382,7 @@ REGISTER_CONV_OPENVX_KERNEL( depthwise_conv1d ) } temp_tensors[2] = _expand_tensor_dim( outputs[0]->t, - (vsi_ssize_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num, 0 ); + (vsi_ssize_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num, 1 ); CHECK_PTR_FAIL_GOTO( temp_tensors[2], "Expand output dim fail.", final ); if( need_explicit_padding ) @@ -404,7 +457,7 @@ REGISTER_CONV_OPENVX_KERNEL( conv2d ) inputs[0]->t, inputs[1]->t, inputs[2] ? inputs[2]->t : NULL, (vx_nn_convolution_params_t *)&vxparam, sizeof( vx_nn_convolution_params_ext2_t ), - outputs[2]->t + outputs[0]->t ); return (vsi_nn_kernel_node_t)node; @@ -435,7 +488,7 @@ REGISTER_CONV_OPENVX_KERNEL( depthwise_conv2d ) inputs[0]->t, inputs[1]->t, inputs[2] ? inputs[2]->t : NULL, (vx_nn_convolution_params_t *)&vxparam, sizeof( vx_nn_convolution_params_ext2_t ), - outputs[2]->t + outputs[0]->t ); return (vsi_nn_kernel_node_t)node; @@ -486,4 +539,41 @@ final: return (vsi_nn_kernel_node_t)node; } /* deconvolution1d*/ +REGISTER_CONV_OPENVX_KERNEL( conv3d ) +{ + vx_node node = NULL; +#if VX_CONV_3D_API_SUPPORT + vx_nn_convolution_3d_params_t vxparam; + memset(&vxparam, 0, sizeof(vxparam)); + + _build_vx_conv3d_param( + &vxparam, + vsi_nn_kernel_param_get_int32(params, "stride_d"), + vsi_nn_kernel_param_get_int32(params, "stride_h"), + vsi_nn_kernel_param_get_int32(params, "stride_w"), + vsi_nn_kernel_param_get_int32(params, "pad_front"), + vsi_nn_kernel_param_get_int32(params, "pad_end"), + vsi_nn_kernel_param_get_int32(params, "pad_top"), + vsi_nn_kernel_param_get_int32(params, "pad_bottom"), + vsi_nn_kernel_param_get_int32(params, "pad_left"), + vsi_nn_kernel_param_get_int32(params, "pad_right"), + vsi_nn_kernel_param_get_int32(params, "dilation_d"), + vsi_nn_kernel_param_get_int32(params, "dilation_h"), + vsi_nn_kernel_param_get_int32(params, "dilation_w"), + vsi_nn_kernel_param_get_int32(params, "depth_multiplier"), + vsi_nn_kernel_param_get_int32(params, "overflow_policy"), + vsi_nn_kernel_param_get_int32(params, "rounding_policy"), + vsi_nn_kernel_param_get_int32(params, "down_scale_size_rounding") + ); + + node = vxConv3dLayer( graph->g, + inputs[0]->t, inputs[1]->t, inputs[2] ? 
inputs[2]->t : NULL, + &vxparam, + sizeof( vxparam), + outputs[0]->t + ); +#endif + return (vsi_nn_kernel_node_t)node; +} /* depthwise_conv2d*/ + #undef REGISTER_CONV_OPENVX_KERNEL diff --git a/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c b/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c index 492f8f7..30b1257 100644 --- a/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c @@ -1,6 +1,6 @@ /**************************************************************************** * -* Copyright (c) 2020 Vivante Corporation +* Copyright (c) 2021 Vivante Corporation * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -30,173 +30,9 @@ #include #include "utils/vsi_nn_dtype_util_prv.h" #include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" #include "kernel/vsi_nn_kernel.h" - -typedef struct _sort_lut_s -{ - float index; - float val; -} sort_lut; - -static float exp_eval(float val, float alpha) -{ - return expf(val); -} - -static float log_eval(float data, float alpha) -{ - return logf(data); -} - -static float elu_eval(float data, float alpha) -{ - return data >=0 ? data : expf(data) * alpha - alpha; -} - -static float neg_eval(float data, float alpha) -{ - return data * -1.0f; -} - -static float hsigmoid_eval(float data, float alpha) -{ - data = (float)(0.2 * data + 0.5); - data = vsi_nn_clamp(data, 0, 1); - - return data; -} - -static float soft_plus_eval(float data, float alpha) -{ - return log_eval(exp_eval(data, alpha) + 1, alpha); -} - -static float mish_eval(float data, float alpha) -{ - data = (float)(data * tanh(soft_plus_eval(data, alpha))); - - return data; -} - -static float erf_eval(float x) -{ - float res = 0; - float tmp = x; - float factorial = 1; /*n!*/ - float x_pow = x; - int32_t one = 1; - int32_t n = 1; - - if (x <= -3) - { - return -1; - } - else if (x >= 3) - { - return 1; - } - - while (vsi_abs(tmp) > 1e-5) - { - res += tmp; - - factorial *= n; - one *= -1; - x_pow *= x * x; - tmp = one / factorial * x_pow / ( 2 * n + 1); - - n ++; - } -#define VSI_MUL2_RSQRTPI (1.1283791670955126f) - - res *= VSI_MUL2_RSQRTPI; - - return res; -} - -static float gelu_eval(float data, float alpha) -{ - data = (float)(0.5f * data * (1 + erf_eval(data / (float)sqrt(2.0f)))); - - return data; -} - - -#define VSI_SQRT_2_RCP_PI 0.7978845834732056f -static float hgelu_eval(float data, float alpha) -{ - float cdf = (float)(0.5f * (1.0f + tanh((VSI_SQRT_2_RCP_PI * - (data + 0.044715f * data * data * data))))); - - return data * cdf; -} - -#ifdef VX_USER_LOOKUP_TABLE_SUPPORT -static int32_t _lut_comparator(const void *pa, const void *pb) -{ - sort_lut a = *(sort_lut *)pa; - sort_lut b = *(sort_lut *)pb; - float diff = a.index - b.index; - if ( diff > 0 ) - { - return 1; - } - else if ( diff < 0 ) - { - return -1; - } - - return 0; -} - -static void _set_unary_table_lookup(float func(float, float), float *index, float *value, float alpha) -{ -#define VSI_NN_MAX_LUT_SIZE (1024) -#define FLT16_MAX (57344) -#define FLT16_MIN (-57344) - uint32_t i = 0; - sort_lut *lut = (sort_lut *)calloc(VSI_NN_MAX_LUT_SIZE, sizeof(sort_lut)); - - for ( i = 0; i < VSI_NN_MAX_LUT_SIZE; i++) - { - int16_t val = (int16_t)(i << 6); - lut[i].index = fp16_to_fp32(val); - lut[i].val = func(lut[i].index, alpha); - } - - for (i = 0x0; i < 0x10; i++) - { - lut[i].index = 0; - lut[i].val = func(lut[i].index, alpha); - } - - for (i = 0x1F0; i < 0x200; i++) - { - 
lut[i].index = FLT16_MAX; - lut[i].val = func(lut[i].index, alpha); - } - - for (i = 0x3F0; i < 0x400; i++) - { - lut[i].index = FLT16_MIN; - lut[i].val = func(lut[i].index, alpha); - } - - qsort(lut, VSI_NN_MAX_LUT_SIZE, sizeof(sort_lut), _lut_comparator); - - for ( i = 0; i < VSI_NN_MAX_LUT_SIZE; i++) - { - index[i] = lut[i].index; - value[i] = lut[i].val; - } - - vsi_nn_safe_free(lut); - -#undef VSI_NN_MAX_LUT_SIZE -#undef FLT16_MIN -#undef FLT16_MAX -} -#endif +#include "kernel/vsi_nn_kernel_lut.h" static vsi_nn_kernel_node_t _setup ( @@ -207,16 +43,33 @@ static vsi_nn_kernel_node_t _setup size_t output_num, const vsi_nn_kernel_param_t * params, vsi_nn_kernel_t * kernel, - float func(float, float) + vsi_enum lut_type ) { #ifdef VX_USER_LOOKUP_TABLE_SUPPORT vx_lut lut1 = NULL; vx_lut lut2 = NULL; vx_node node = NULL; - float alpha = vsi_nn_kernel_param_get_float32( params, "alpha" ); - float index[1024] = {0}; - float value[1024] = {0}; + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_lut_params lut_param; + + lut_param.act_type = lut_type; + if (lut_type == VSI_NN_KERNEL_LUT_RELU_KERAS) + { + lut_param.params[0] = vsi_nn_kernel_param_get_float32( params, "alpha" ); + lut_param.params[1] = vsi_nn_kernel_param_get_float32( params, "max_value" ); + lut_param.params[2] = vsi_nn_kernel_param_get_float32( params, "threshold" ); + } + else if (lut_type == VSI_NN_KERNEL_LUT_CLIP) + { + lut_param.params[0] = vsi_nn_kernel_param_get_float32( params, "min_value" ); + lut_param.params[1] = vsi_nn_kernel_param_get_float32( params, "max_value" ); + } + else if (lut_type == VSI_NN_KERNEL_LUT_ELU || lut_type == VSI_NN_KERNEL_LUT_HSIGMOID) + { + lut_param.params[0] = vsi_nn_kernel_param_get_float32( params, "alpha" ); + lut_param.params[1] = vsi_nn_kernel_param_get_float32( params, "beta" ); + } if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 || outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 ) @@ -224,27 +77,25 @@ static vsi_nn_kernel_node_t _setup return NULL; } - _set_unary_table_lookup(func, index, value, alpha); - - lut1 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, 1024); - lut2 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, 1024); + lut1 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, VSI_NN_KERNEL_LUT_MAX_SIZE); + lut2 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, VSI_NN_KERNEL_LUT_MAX_SIZE); if( NULL == lut1 || NULL == lut2 ) { VSILOGE("create lut object fail."); - goto OnError; + goto final; } - vxCopyLUT(lut1, (void*)&index, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST); - vxCopyLUT(lut2, (void*)&value, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST); + status = vsi_nn_kernel_lut(lut1, lut2, &lut_param); + CHECK_STATUS_FAIL_GOTO(status, final); node = vxTensorTableLookupLayer( graph->g, inputs[0]->t, lut1, lut2, outputs[0]->t); - if( NULL == node ) + if ( NULL == node ) { VSILOGW("Call vxTensorTableLookupLayer fail."); - goto OnError; + goto final; } -OnError: +final: if (lut1) { vxReleaseLUT(&lut1); @@ -262,7 +113,7 @@ OnError: #endif } /* _setup() */ -#define REGISTER_ELTWISE_UNARY_OPENVX_KERNEL(KERNEL_NAME, UNARY_FUNC) \ +#define REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL(KERNEL_NAME, UNARY_FUNC) \ static vsi_nn_kernel_node_t _##KERNEL_NAME##_setup \ ( \ vsi_nn_graph_t * graph, \ @@ -279,14 +130,136 @@ OnError: } \ REGISTER_BACKEND_OPENVX( KERNEL_NAME, _##KERNEL_NAME##_setup ) -REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( mish, mish_eval ) -//REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( exp, exp_eval ) -REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( log, log_eval ) -REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( elu, elu_eval ) 
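Both the per-file table builders being removed in this hunk and their replacement, vsi_nn_kernel_lut, index the 1024-entry table by the top ten bits of an fp16 pattern (i << 6 keeps the sign, the exponent and the four high mantissa bits), then overwrite the subnormal bins with 0 and the Inf/NaN bins with the fp16 max/min. A self-contained decoder for such bit patterns, offered as an illustration rather than the library's fp16_to_fp32:

#include <math.h>
#include <stdint.h>

/* Sketch of IEEE fp16 -> float decoding for the LUT index patterns. */
static float fp16_bits_to_float_sketch(uint16_t h)
{
    uint32_t sign = (h >> 15) & 0x1;
    uint32_t exp  = (h >> 10) & 0x1F;
    uint32_t man  = h & 0x3FF;
    float val;

    if (exp == 0)        /* zero and subnormals: man * 2^-24 */
        val = ldexpf((float)man, -24);
    else if (exp == 31)  /* Inf/NaN: these bins are clamped by the LUT code */
        val = man ? NAN : INFINITY;
    else                 /* normal: (1024 + man) * 2^(exp - 25) */
        val = ldexpf((float)(man | 0x400), (int)exp - 25);

    return sign ? -val : val;
}
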
-REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( neg, neg_eval ) -REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( hard_sigmoid, hsigmoid_eval ) -REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( gelu, gelu_eval ) -REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( hard_gelu, hgelu_eval ) +REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( mish, VSI_NN_KERNEL_LUT_MISH ) +//REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( exp, VSI_NN_KERNEL_LUT_EXP ) +REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( log, VSI_NN_KERNEL_LUT_LOG ) +REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( elu, VSI_NN_KERNEL_LUT_ELU ) +REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( neg, VSI_NN_KERNEL_LUT_NEG ) +REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( hard_sigmoid, VSI_NN_KERNEL_LUT_HSIGMOID ) +REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( gelu, VSI_NN_KERNEL_LUT_GELU ) +REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( hard_gelu, VSI_NN_KERNEL_LUT_HGELU ) +REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( erf, VSI_NN_KERNEL_LUT_ERF ) +REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( relu_keras, VSI_NN_KERNEL_LUT_RELU_KERAS ) +REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL( clip, VSI_NN_KERNEL_LUT_CLIP ) + +#undef REGISTER_ELTWISE_UNARY_LUT_OPENVX_KERNEL + +#define REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( kernel_name ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ); \ + REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) + +REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( abs ) +{ + vx_node node = NULL; + vsi_size_t input_size[VSI_NN_MAX_DIM_NUM] = {0}; + uint32_t dims = 0; + vx_tensor input = NULL, input0 = NULL; + vx_tensor output = NULL, output0 = NULL; + + if (inputs[0]->attr.dim_num > 4) + { + input_size[0] = vsi_nn_GetElementNum(inputs[0]) / + inputs[0]->attr.size[inputs[0]->attr.dim_num - 1]; + input_size[1] = inputs[0]->attr.size[inputs[0]->attr.dim_num - 1]; + dims = 2; +#ifdef VSI_40BIT_VA_SUPPORT + input = vxReshapeTensor(inputs[0]->t, input_size, dims); + output = vxReshapeTensor(outputs[0]->t, input_size, dims); +#else + input = vxReshapeTensor(inputs[0]->t, (vx_int32*)input_size, (vx_uint32)dims); + output = vxReshapeTensor(outputs[0]->t, (vx_int32*)input_size, (vx_uint32)dims); +#endif + input0 = input; + output0 = output; + } + else + { + input0 = inputs[0]->t; + output0 = outputs[0]->t; + } + + node = vxLeakyReluLayer( + graph->g, + input0, + -1, + output0 + ); + + if (input) vxReleaseTensor(&input); + if (output) vxReleaseTensor(&output); + + return (vsi_nn_kernel_node_t)node; +} /* abs() */ + +REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( linear ) +{ + vx_node node = NULL; + float a_v = vsi_nn_kernel_param_get_float32( params, "a_v" ); + float b_v = vsi_nn_kernel_param_get_float32( params, "b_v" ); + + node = vxActivationLayer( + graph->g, + inputs[0]->t, + VX_CONVOLUTIONAL_NETWORK_ACTIVATION_LINEAR, + a_v, + b_v, + outputs[0]->t + ); + + return (vsi_nn_kernel_node_t)node; +} /* linear() */ + +REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( sigmoid ) +{ + vx_node node = NULL; + + node = vxActivationLayer( + graph->g, + inputs[0]->t, + VX_CONVOLUTIONAL_NETWORK_ACTIVATION_LOGISTIC, + 0, + 0, + outputs[0]->t + ); + + return 
(vsi_nn_kernel_node_t)node; +} /* sigmoid() */ + +REGISTER_ELTWISE_UNARY_OPENVX_KERNEL( tanh ) +{ + vx_node node = NULL; + float scale_a = vsi_nn_kernel_param_get_float32( params, "scale_a" ); + float scale_b = vsi_nn_kernel_param_get_float32( params, "scale_b" ); + + node = vxActivationLayer( + graph->g, + inputs[0]->t, + VX_CONVOLUTIONAL_NETWORK_ACTIVATION_HYPERBOLIC_TAN, + scale_a, + scale_b, + outputs[0]->t + ); + + return (vsi_nn_kernel_node_t)node; +} /* tanh() */ #undef REGISTER_ELTWISE_UNARY_OPENVX_KERNEL - diff --git a/src/tim/vx/internal/src/kernel/vx/erf_vx.c b/src/tim/vx/internal/src/kernel/vx/erf_vx.c deleted file mode 100644 index f33fa23..0000000 --- a/src/tim/vx/internal/src/kernel/vx/erf_vx.c +++ /dev/null @@ -1,217 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. 
-* -*****************************************************************************/ - -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_node.h" -#include "vsi_nn_log.h" -#include "vsi_nn_prv.h" -#include -#include "utils/vsi_nn_dtype_util_prv.h" -#include "vsi_nn_tensor_util.h" -#include "kernel/vsi_nn_kernel.h" - -typedef struct _sort_lut_s -{ - float index; - float val; -} sort_lut; - -static float erf_eval(float _x) -{ - float x = vsi_clamp(_x, -2, 2); - float res = 0; - float tmp = x; - float factorial = 1; /*n!*/ - float x_pow = x; - int32_t one = 1; - int32_t n = 1; - - while (vsi_abs(tmp) > 1e-5) - { - res += tmp; - - factorial *= n; - one *= -1; - x_pow *= x * x; - tmp = one / factorial * x_pow / ( 2 * n + 1); - - n ++; - } -#define MUL2_RSQRTPI (1.1283791670955126f) - - res *= MUL2_RSQRTPI; - - return res; -} - -#ifdef VX_USER_LOOKUP_TABLE_SUPPORT -static int32_t _lut_comparator(const void *pa, const void *pb) -{ - sort_lut a = *(sort_lut *)pa; - sort_lut b = *(sort_lut *)pb; - float diff = a.index - b.index; - if ( diff > 0 ) - { - return 1; - } - else if ( diff < 0 ) - { - return -1; - } - - return 0; -} - -static void _set_table_lookup(float func(float), float *index, float *value) -{ -#define VSI_NN_MAX_LUT_SIZE (1024) -#define FLT16_MAX (57344) -#define FLT16_MIN (-57344) - uint32_t i = 0; - sort_lut *lut = (sort_lut *)calloc(VSI_NN_MAX_LUT_SIZE, sizeof(sort_lut)); - - for ( i = 0; i < VSI_NN_MAX_LUT_SIZE; i++) - { - int16_t val = (int16_t)(i << 6); - lut[i].index = fp16_to_fp32(val); - lut[i].val = func(lut[i].index); - } - - for (i = 0x0; i < 0x10; i++) - { - lut[i].index = 0; - lut[i].val = func(lut[i].index); - } - - for (i = 0x1F0; i < 0x200; i++) - { - lut[i].index = FLT16_MAX; - lut[i].val = func(lut[i].index); - } - - for (i = 0x3F0; i < 0x400; i++) - { - lut[i].index = FLT16_MIN; - lut[i].val = func(lut[i].index); - } - - qsort(lut, VSI_NN_MAX_LUT_SIZE, sizeof(sort_lut), _lut_comparator); - - for ( i = 0; i < VSI_NN_MAX_LUT_SIZE; i++) - { - index[i] = lut[i].index; - value[i] = lut[i].val; - } - - vsi_nn_safe_free(lut); - -#undef VSI_NN_MAX_LUT_SIZE -#undef FLT16_MIN -#undef FLT16_MAX -} -#endif - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel, - float func(float) - ) -{ -#ifdef VX_USER_LOOKUP_TABLE_SUPPORT - vx_lut lut1 = NULL; - vx_lut lut2 = NULL; - vx_node node = NULL; - float index[1024] = {0}; - float value[1024] = {0}; - - if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 || - outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 ) - { - return NULL; - } - - _set_table_lookup(func, index, value); - - lut1 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, 1024); - lut2 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, 1024); - if( NULL == lut1 || NULL == lut2 ) - { - VSILOGE("create lut object fail."); - goto OnError; - } - - vxCopyLUT(lut1, (void*)&index, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST); - vxCopyLUT(lut2, (void*)&value, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST); - - node = vxTensorTableLookupLayer( graph->g, inputs[0]->t, lut1, lut2, outputs[0]->t); - if( NULL == node ) - { - VSILOGE("Call vxTensorTableLookupLayer fail."); - goto OnError; - } - -OnError: - if (lut1) - { - vxReleaseLUT(&lut1); - lut1 = NULL; - } - if (lut2) - { - vxReleaseLUT(&lut2); - lut2 = NULL; - } - return (vsi_nn_kernel_node_t)node; -#else - return NULL; -#endif -} /* _setup() */ - 
-#define REGISTER_CLIP_OPENVX_KERNEL(KERNEL_NAME, UNARY_FUNC) \ - static vsi_nn_kernel_node_t _##KERNEL_NAME##_setup \ - ( \ - vsi_nn_graph_t * graph, \ - vsi_nn_tensor_t ** inputs, \ - size_t input_num, \ - vsi_nn_tensor_t ** outputs, \ - size_t output_num, \ - const vsi_nn_kernel_param_t * params, \ - vsi_nn_kernel_t * kernel \ - ) \ - { \ - return _setup(graph, inputs, input_num, outputs, output_num, \ - params, kernel, UNARY_FUNC); \ - } \ - REGISTER_BACKEND_OPENVX( KERNEL_NAME, _##KERNEL_NAME##_setup ) - -REGISTER_CLIP_OPENVX_KERNEL( erf, erf_eval ) - -#undef REGISTER_CLIP_OPENVX_KERNEL diff --git a/src/tim/vx/internal/src/kernel/vx/l2_normalize_vx.c b/src/tim/vx/internal/src/kernel/vx/l2_normalize_vx.c new file mode 100644 index 0000000..5133dab --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vx/l2_normalize_vx.c @@ -0,0 +1,127 @@ +/**************************************************************************** +* +* Copyright (c) 2021 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "kernel/vsi_nn_kernel.h" + +#define REGISTER_L2_NORMALIZE_OPENVX_KERNEL( kernel_name ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ); \ + REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) + +REGISTER_L2_NORMALIZE_OPENVX_KERNEL( l2_norm ) +{ + vx_node node = NULL; + int32_t axis = vsi_nn_kernel_param_get_int32(params, "axis"); +#ifdef VX_L2NORM_AXIS_PARAMETER_SUPPORT + vx_nn_l2norm_params_t param; + + param.axis = axis; + + node = vxL2NormalizeLayer2( + graph->g, + inputs[0]->t, + ¶m, + sizeof(vx_nn_l2norm_params_t), + outputs[0]->t + ); +#else + uint32_t i = 0; + uint32_t sizes[VSI_NN_MAX_DIM_NUM] = {1}; + uint32_t innerSize = 1; + uint32_t outerSize = 1; + uint32_t axisSize = 1; + vx_tensor vx_input = NULL; + vx_tensor vx_output = NULL; + vx_tensor input = inputs[0]->t; + vx_tensor output = outputs[0]->t; + + if (axis != 2) + { + axisSize = inputs[0]->attr.size[axis]; + + for (i = 0; i < (uint32_t)axis; i++) + { + innerSize *= inputs[0]->attr.size[i]; + } + + for (i = (uint32_t)(axis + 1); i < inputs[0]->attr.dim_num; i++) + { + outerSize *= inputs[0]->attr.size[i]; + } + + sizes[0] = innerSize; + sizes[1] = 1; + sizes[2] = axisSize; + sizes[3] = outerSize; + + vx_input = vxReshapeTensor(inputs[0]->t, (int32_t *)sizes, vsi_nn_max(inputs[0]->attr.dim_num, 4)); + vx_output = vxReshapeTensor(outputs[0]->t, (int32_t *)sizes, vsi_nn_max(inputs[0]->attr.dim_num, 4)); + + input = vx_input; + output = vx_output; + } + + node = vxL2NormalizeLayer( + graph->g, + input, + output + ); + + if (vx_input) vxReleaseTensor(&vx_input); + if (vx_output) vxReleaseTensor(&vx_output); +#endif + + if( NULL == node ) + { + VSILOGE("Call vxSoftmaxLayer2 fail.(softmax)"); + } + + return (vsi_nn_kernel_node_t)node; +} /* l2_norm() */ + +#undef REGISTER_L2_NORMALIZE_OPENVX_KERNEL diff --git a/src/tim/vx/internal/src/kernel/vx/relu_keras_vx.c b/src/tim/vx/internal/src/kernel/vx/relu_keras_vx.c deleted file mode 100644 index 9c5b0cb..0000000 --- a/src/tim/vx/internal/src/kernel/vx/relu_keras_vx.c +++ /dev/null @@ -1,200 +0,0 @@ -/**************************************************************************** -* -* Copyright (c) 2020 Vivante Corporation -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -* DEALINGS IN THE SOFTWARE. -* -*****************************************************************************/ - -#include "vsi_nn_types.h" -#include "vsi_nn_tensor.h" -#include "vsi_nn_node.h" -#include "vsi_nn_log.h" -#include "vsi_nn_prv.h" -#include -#include "utils/vsi_nn_dtype_util_prv.h" -#include "vsi_nn_tensor_util.h" -#include "kernel/vsi_nn_kernel.h" - -typedef struct _sort_lut_s -{ - float index; - float val; -} sort_lut; - -static float relu_keras_eval(float val, float alpha, float threshold, float max) -{ - val = vsi_nn_min(val, max); - val = val < threshold ? alpha * (val - threshold) : val; - return val; -} - -#ifdef VX_USER_LOOKUP_TABLE_SUPPORT -static int32_t _lut_comparator(const void *pa, const void *pb) -{ - sort_lut a = *(sort_lut *)pa; - sort_lut b = *(sort_lut *)pb; - float diff = a.index - b.index; - if ( diff > 0 ) - { - return 1; - } - else if ( diff < 0 ) - { - return -1; - } - - return 0; -} - -static void _set_table_lookup(float func(float, float, float, float), - float *index, float *value, float alpha, float threshold, float max) -{ -#define VSI_NN_MAX_LUT_SIZE (1024) -#define FLT16_MAX (57344) -#define FLT16_MIN (-57344) - uint32_t i = 0; - sort_lut *lut = (sort_lut *)calloc(VSI_NN_MAX_LUT_SIZE, sizeof(sort_lut)); - - for ( i = 0; i < VSI_NN_MAX_LUT_SIZE; i++) - { - int16_t val = (int16_t)(i << 6); - lut[i].index = fp16_to_fp32(val); - lut[i].val = func(lut[i].index, alpha, threshold, max); - } - - for (i = 0x0; i < 0x10; i++) - { - lut[i].index = 0; - lut[i].val = func(lut[i].index, alpha, threshold, max); - } - - for (i = 0x1F0; i < 0x200; i++) - { - lut[i].index = FLT16_MAX; - lut[i].val = func(lut[i].index, alpha, threshold, max); - } - - for (i = 0x3F0; i < 0x400; i++) - { - lut[i].index = FLT16_MIN; - lut[i].val = func(lut[i].index, alpha, threshold, max); - } - - qsort(lut, VSI_NN_MAX_LUT_SIZE, sizeof(sort_lut), _lut_comparator); - - for ( i = 0; i < VSI_NN_MAX_LUT_SIZE; i++) - { - index[i] = lut[i].index; - value[i] = lut[i].val; - } - - vsi_nn_safe_free(lut); - -#undef VSI_NN_MAX_LUT_SIZE -#undef FLT16_MIN -#undef FLT16_MAX -} -#endif - -static vsi_nn_kernel_node_t _setup - ( - vsi_nn_graph_t * graph, - vsi_nn_tensor_t ** inputs, - size_t input_num, - vsi_nn_tensor_t ** outputs, - size_t output_num, - const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel, - float func(float, float, float, float) - ) -{ -#ifdef VX_USER_LOOKUP_TABLE_SUPPORT - vx_lut lut1 = NULL; - vx_lut lut2 = NULL; - vx_node node = NULL; - float alpha = vsi_nn_kernel_param_get_float32( params, "alpha" ); - float max = vsi_nn_kernel_param_get_float32( params, "max_value" ); - float threshold = vsi_nn_kernel_param_get_float32( params, "threshold" ); - float index[1024] = {0}; - float value[1024] = {0}; - - if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 || - outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 ) - { - return NULL; - } - - _set_table_lookup(func, index, value, alpha, threshold, max); - - lut1 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, 1024); - lut2 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, 
1024); - if( NULL == lut1 || NULL == lut2 ) - { - VSILOGE("create lut object fail."); - goto OnError; - } - - vxCopyLUT(lut1, (void*)&index, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST); - vxCopyLUT(lut2, (void*)&value, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST); - - node = vxTensorTableLookupLayer( graph->g, inputs[0]->t, lut1, lut2, outputs[0]->t); - if( NULL == node ) - { - VSILOGW("Call vxTensorTableLookupLayer fail."); - goto OnError; - } - -OnError: - if (lut1) - { - vxReleaseLUT(&lut1); - lut1 = NULL; - } - if (lut2) - { - vxReleaseLUT(&lut2); - lut2 = NULL; - } - return (vsi_nn_kernel_node_t)node; -#else - return NULL; -#endif -} /* _setup() */ - -#define REGISTER_KERAS_RELU_OPENVX_KERNEL(KERNEL_NAME, UNARY_FUNC) \ - static vsi_nn_kernel_node_t _##KERNEL_NAME##_setup \ - ( \ - vsi_nn_graph_t * graph, \ - vsi_nn_tensor_t ** inputs, \ - size_t input_num, \ - vsi_nn_tensor_t ** outputs, \ - size_t output_num, \ - const vsi_nn_kernel_param_t * params, \ - vsi_nn_kernel_t * kernel \ - ) \ - { \ - return _setup(graph, inputs, input_num, outputs, output_num, \ - params, kernel, UNARY_FUNC); \ - } \ - REGISTER_BACKEND_OPENVX( KERNEL_NAME, _##KERNEL_NAME##_setup ) - -REGISTER_KERAS_RELU_OPENVX_KERNEL( relu_keras, relu_keras_eval ) - -#undef REGISTER_KERAS_RELU_OPENVX_KERNEL diff --git a/src/tim/vx/internal/src/kernel/vx/softmax_vx.c b/src/tim/vx/internal/src/kernel/vx/softmax_vx.c new file mode 100644 index 0000000..f097fbb --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vx/softmax_vx.c @@ -0,0 +1,122 @@ +/**************************************************************************** +* +* Copyright (c) 2021 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + +#define REGISTER_SOFTMAX_OPENVX_KERNEL( kernel_name ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ); \ + REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) + +REGISTER_SOFTMAX_OPENVX_KERNEL( softmax ) +{ + vx_node node = NULL; + float beta = vsi_nn_kernel_param_get_float32(params, "beta"); + vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; + vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 0 } }; + uint32_t rank_in = 0; + int32_t axis = vsi_nn_kernel_param_get_int32(params, "axis"); + int32_t new_axis = 0; + size_t size = sizeof(vx_nn_softmax_params_t); +#ifdef VX_SOFTMAX_AXIS_PARAMETER_SUPPORT + vx_nn_softmax_params_ext_t paramExt; + vx_nn_softmax_params_t *param = (vx_nn_softmax_params_t *)&paramExt; + paramExt.base.beta = beta; + paramExt.axis = axis; + size = sizeof(vx_nn_softmax_params_ext_t); +#else + vx_nn_softmax_params_t base; + vx_nn_softmax_params_t *param = &base; + + memset(&base, 0, sizeof(vx_nn_softmax_params_t)); + base.beta = beta; +#endif + + vsi_nn_kernel_optimize_softmax_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, axis, + shapes[0], &rank_in, &new_axis); + + if (new_axis == 1) + { + int32_t i = 0; + new_axis ++; + rank_in ++; + for (i = rank_in - 1; i > 1; i--) + { + shapes[0][i] = shapes[0][i - 1]; + } + shapes[0][1] = 1; + } + +#ifdef VX_SOFTMAX_AXIS_PARAMETER_SUPPORT + paramExt.axis = new_axis; +#endif + + reshape_tensors[0] = vsi_nn_reshape_tensor( graph, + inputs[0], shapes[0], rank_in ); + reshape_tensors[1] = vsi_nn_reshape_tensor( graph, + outputs[0], shapes[0], rank_in ); + + node = vxSoftmaxLayer2( graph->g, + reshape_tensors[0]->t, + param, + size, + reshape_tensors[1]->t); + if( NULL == node ) + { + VSILOGE("Call vxSoftmaxLayer2 fail.(softmax)"); + } + + vsi_nn_ReleaseTensor( &reshape_tensors[0] ); + vsi_nn_ReleaseTensor( &reshape_tensors[1] ); + + return (vsi_nn_kernel_node_t)node; +} /* softmax() */ + +#undef REGISTER_SOFTMAX_OPENVX_KERNEL diff --git a/src/tim/vx/internal/src/kernel/vx/square_vx.c b/src/tim/vx/internal/src/kernel/vx/square_vx.c index 839890b..572737c 100644 --- a/src/tim/vx/internal/src/kernel/vx/square_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/square_vx.c @@ -1,6 +1,6 @@ /**************************************************************************** * -* Copyright (c) 2020 Vivante Corporation +* Copyright (c) 2021 Vivante Corporation * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -30,85 +30,9 @@ #include #include "utils/vsi_nn_dtype_util_prv.h" #include "vsi_nn_tensor_util.h" +#include "vsi_nn_error.h" #include "kernel/vsi_nn_kernel.h" - -typedef struct _sort_lut_s -{ - float index; - float 
val; -} sort_lut; - -static float square_eval(float x) -{ - return x * x; -} - -#ifdef VX_USER_LOOKUP_TABLE_SUPPORT -static int32_t _lut_comparator(const void *pa, const void *pb) -{ - sort_lut a = *(sort_lut *)pa; - sort_lut b = *(sort_lut *)pb; - float diff = a.index - b.index; - if ( diff > 0 ) - { - return 1; - } - else if ( diff < 0 ) - { - return -1; - } - - return 0; -} - -static void _set_table_lookup(float func(float), float *index, float *value) -{ -#define VSI_NN_MAX_LUT_SIZE (1024) -#define FLT16_MAX (57344) -#define FLT16_MIN (-57344) - uint32_t i = 0; - sort_lut *lut = (sort_lut *)calloc(VSI_NN_MAX_LUT_SIZE, sizeof(sort_lut)); - - for ( i = 0; i < VSI_NN_MAX_LUT_SIZE; i++) - { - int16_t val = (int16_t)(i << 6); - lut[i].index = fp16_to_fp32(val); - lut[i].val = func(lut[i].index); - } - - for (i = 0x0; i < 0x10; i++) - { - lut[i].index = 0; - lut[i].val = func(lut[i].index); - } - - for (i = 0x1F0; i < 0x200; i++) - { - lut[i].index = FLT16_MAX; - lut[i].val = func(lut[i].index); - } - - for (i = 0x3F0; i < 0x400; i++) - { - lut[i].index = FLT16_MIN; - lut[i].val = func(lut[i].index); - } - - qsort(lut, VSI_NN_MAX_LUT_SIZE, sizeof(sort_lut), _lut_comparator); - - for ( i = 0; i < VSI_NN_MAX_LUT_SIZE; i++) - { - index[i] = lut[i].index; - value[i] = lut[i].val; - } - - vsi_nn_safe_free(lut); - -#undef VSI_NN_MAX_LUT_SIZE -#undef FLT16_MIN -#undef FLT16_MAX -} -#endif +#include "kernel/vsi_nn_kernel_lut.h" static vsi_nn_kernel_node_t _setup ( @@ -118,16 +42,15 @@ static vsi_nn_kernel_node_t _setup vsi_nn_tensor_t ** outputs, size_t output_num, const vsi_nn_kernel_param_t * params, - vsi_nn_kernel_t * kernel, - float func(float) + vsi_nn_kernel_t * kernel ) { vx_node node = NULL; #ifdef VX_USER_LOOKUP_TABLE_SUPPORT vx_lut lut1 = NULL; vx_lut lut2 = NULL; - float index[1024] = {0}; - float value[1024] = {0}; + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_lut_params lut_param; if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 || outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 ) @@ -135,21 +58,21 @@ static vsi_nn_kernel_node_t _setup return NULL; } - _set_table_lookup(func, index, value); + lut_param.act_type = VSI_NN_KERNEL_LUT_SQUARE; - lut1 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, 1024); - lut2 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, 1024); + lut1 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, VSI_NN_KERNEL_LUT_MAX_SIZE); + lut2 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, VSI_NN_KERNEL_LUT_MAX_SIZE); if( NULL == lut1 || NULL == lut2 ) { VSILOGE("create lut object fail."); - goto OnError; + goto final; } - vxCopyLUT(lut1, (void*)&index, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST); - vxCopyLUT(lut2, (void*)&value, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST); + status = vsi_nn_kernel_lut(lut1, lut2, &lut_param); + CHECK_STATUS_FAIL_GOTO(status, final); node = vxTensorTableLookupLayer( graph->g, inputs[0]->t, lut1, lut2, outputs[0]->t); - if( NULL == node ) + if ( NULL == node ) { node = vxActivationLayer( graph->g, @@ -161,7 +84,7 @@ static vsi_nn_kernel_node_t _setup ); } -OnError: +final: if (lut1) { vxReleaseLUT(&lut1); @@ -187,7 +110,7 @@ OnError: #endif } /* _setup() */ -#define REGISTER_SQUARE_OPENVX_KERNEL(KERNEL_NAME, ACT_FUNC) \ +#define REGISTER_SQUARE_OPENVX_KERNEL(KERNEL_NAME) \ static vsi_nn_kernel_node_t _##KERNEL_NAME##_setup \ ( \ vsi_nn_graph_t * graph, \ @@ -200,10 +123,10 @@ OnError: ) \ { \ return _setup(graph, inputs, input_num, outputs, output_num, \ - params, kernel, ACT_FUNC); \ + params, kernel); \ } \ REGISTER_BACKEND_OPENVX( KERNEL_NAME, 
_##KERNEL_NAME##_setup ) -REGISTER_SQUARE_OPENVX_KERNEL( square, square_eval ) +REGISTER_SQUARE_OPENVX_KERNEL( square ) #undef REGISTER_SQUARE_OPENVX_KERNEL diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_ops_helper.cl b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_ops_helper.cl index 34668c1..2177669 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_ops_helper.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_ops_helper.cl @@ -56,13 +56,14 @@ inline Tensor create_tensor_from_image2d_array(image2d_array_t input, int stride { int8 desc; int2 strides; + _viv_asm(COPY, desc, input, sizeof(desc)); + #if (USE_40BITS_VA==0) strides.x = desc.s1; strides.y = desc.s4; #else _viv_asm(GET_IMAGE_STRIDE, strides, input); #endif - _viv_asm(COPY, desc, input, sizeof(desc)); uint address = as_uint(desc.s0); Tensor t = diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary.cl b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary.cl index bb31c02..5b90eb1 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_unary.cl @@ -1,12 +1,11 @@ - -float eltwise_unary_sin(float x, float alpha) +float eltwise_unary_sin(float x, float alpha, float beta) { return native_sin(x); } #define logE (1.44269502f) #define twoLogE (logE * 2.0f) -float eltwise_unary_exp(float x, float alpha) +float eltwise_unary_exp(float x, float alpha, float beta) { x *= logE; x = exp2(x); @@ -14,13 +13,13 @@ float eltwise_unary_exp(float x, float alpha) } #define rlogE (0.693147182f) -float eltwise_unary_log(float x, float alpha) +float eltwise_unary_log(float x, float alpha, float beta) { x = log2(x); return x * rlogE; } -float eltwise_unary_elu(float val, float alpha) +float eltwise_unary_elu(float val, float alpha, float beta) { float x = val * logE; x = exp2(x) * alpha - alpha; @@ -28,14 +27,14 @@ float eltwise_unary_elu(float val, float alpha) return val < 0 ? 
x : val; } -float eltwise_unary_neg(float x, float alpha) +float eltwise_unary_neg(float x, float alpha, float beta) { return x * -1; } -float eltwise_unary_hard_sigmoid(float x, float alpha) +float eltwise_unary_hard_sigmoid(float x, float alpha, float beta) { - x = 0.2 * x + 0.5; + x = alpha * x + beta; x = clamp(x, 0, 1); return x; } @@ -57,14 +56,14 @@ float _tanh(float x, float alpha) return (2 * x - 1); } -float eltwise_unary_mish(float x, float alpha) +float eltwise_unary_mish(float x, float alpha, float beta) { float y = _softrelu(x, alpha); x = x * _tanh(y, alpha); return x; } -float eltwise_unary_round(float x, float alpha) +float eltwise_unary_round(float x, float alpha, float beta) { return convert_float(convert_int_rte(x)); } @@ -98,7 +97,7 @@ float erf_eval(float x) return res * MUL2_RSQRTPI; } #define RSQRT2 (0.70710678118654752440084436210485f) -float eltwise_unary_gelu(float x, float alpha) +float eltwise_unary_gelu(float x, float alpha, float beta) { x = 0.5f * x * (1 + erf_eval(x * RSQRT2)); @@ -106,7 +105,7 @@ float eltwise_unary_gelu(float x, float alpha) } #define SQRT_2_RCP_PI 0.7978845834732056f -float eltwise_unary_hard_gelu(float x, float alpha) +float eltwise_unary_hard_gelu(float x, float alpha, float beta) { float cdf = 0.5f + 0.5f * _tanh(SQRT_2_RCP_PI * (x + 0.044715f * x * x * x), 0); @@ -122,7 +121,8 @@ __kernel void func_name##_F32toF32 \ float inputTail, \ float outputScale, \ float outputZP, \ - float alpha \ + float alpha, \ + float beta \ ) \ { \ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ @@ -130,7 +130,7 @@ __kernel void func_name##_F32toF32 \ float4 src = read_imagef(input, coord); \ \ float4 dst = 0; \ - dst.x = eltwise_unary_##func_name(src.x, alpha); \ + dst.x = eltwise_unary_##func_name(src.x, alpha, beta); \ \ write_imagef(output, coord, dst.xxxx); \ } @@ -154,7 +154,8 @@ __kernel void func_name##_F32toF32_2D \ float inputTail, \ float outputScale, \ float outputZP, \ - float alpha \ + float alpha, \ + float beta \ ) \ { \ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ @@ -162,7 +163,7 @@ __kernel void func_name##_F32toF32_2D \ float4 src = read_imagef(input, coord); \ \ float4 dst = 0; \ - dst.x = eltwise_unary_##func_name(src.x, alpha); \ + dst.x = eltwise_unary_##func_name(src.x, alpha, beta); \ \ write_imagef(output, coord, dst.xxxx); \ } @@ -186,7 +187,8 @@ __kernel void func_name##_U8toU8 \ float inputTail, \ float outputScale, \ float outputZP, \ - float alpha \ + float alpha, \ + float beta \ ) \ { \ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ @@ -194,7 +196,7 @@ __kernel void func_name##_U8toU8 \ uint4 src = read_imageui(input, coord); \ float4 data = convert_float4(src) * inputScale - inputTail; \ \ - data.x = eltwise_unary_##func_name(data.x, alpha); \ + data.x = eltwise_unary_##func_name(data.x, alpha, beta); \ uint4 dst = convert_uint4(data * outputScale + outputZP); \ \ write_imageui(output, coord, dst); \ @@ -219,7 +221,8 @@ __kernel void func_name##_U8toU8_2D \ float inputTail, \ float outputScale, \ float outputZP, \ - float alpha \ + float alpha, \ + float beta \ ) \ { \ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ @@ -227,7 +230,7 @@ __kernel void func_name##_U8toU8_2D \ uint4 src = read_imageui(input, coord); \ float4 data = convert_float4(src) * inputScale - inputTail; \ \ - data.x = eltwise_unary_##func_name(data.x, alpha); \ + data.x = eltwise_unary_##func_name(data.x, alpha, beta); \ uint4 dst = convert_uint4(data * 
outputScale + outputZP); \ \ write_imageui(output, coord, dst); \ @@ -251,7 +254,8 @@ __kernel void neg_I32toI32 float inputTail, float outputScale, float outputZP, - float alpha + float alpha, + float beta ) { int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); @@ -270,7 +274,8 @@ __kernel void neg_I32toI32_2D float inputTail, float outputScale, float outputZP, - float alpha + float alpha, + float beta ) { int2 coord = (int2)(get_global_id(0), get_global_id(1)); diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/floordiv.cl b/src/tim/vx/internal/src/libnnext/ops/cl/floordiv.cl index 746a06e..64f6775 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/floordiv.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/floordiv.cl @@ -66,7 +66,10 @@ __kernel void floordiv_I32I32toU8( int4 src1; READ_IMAGEI_2DARRAY(src0, input, coord); READ_IMAGEI_2DARRAY(src1, input1, coord); - uint4 dst = convert_uint4(floor(convert_float4(src0) / convert_float4(src1)) * outputScale + outputTail); + float4 in0 = convert_float4(src0) * input0Scale + input0Tail; + float4 in1 = convert_float4(src1) * input1Scale + input1Tail; + float4 out = floor(in0 / in1) * outputScale + outputTail; + uint4 dst = convert_uint4(out); write_imageui(output, coord, dst); } @@ -84,7 +87,10 @@ __kernel void floordiv_I32I32toU8_2D( int2 coord = (int2)(get_global_id(0), get_global_id(1)); int4 src0 = read_imagei(input, coord); int4 src1 = read_imagei(input1, coord); - uint4 dst = convert_uint4(floor(convert_float4(src0) / convert_float4(src1)) * outputScale + outputTail); + float4 in0 = convert_float4(src0) * input0Scale + input0Tail; + float4 in1 = convert_float4(src1) * input1Scale + input1Tail; + float4 out = floor(in0 / in1) * outputScale + outputTail; + uint4 dst = convert_uint4(out); write_imageui(output, coord, dst); } diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/grucell_activation_z_h.cl b/src/tim/vx/internal/src/libnnext/ops/cl/grucell_activation_z_h.cl new file mode 100644 index 0000000..dd2e562 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/grucell_activation_z_h.cl @@ -0,0 +1,123 @@ +#define logE (1.44269502f) +#define twoLogE (logE * 2.0f) + +float sigmoid(float x) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float hard_sigmoid(float x) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float tanh_func(float x) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / x; + return 2 * x - 1; +} + + +#define GRUCELL_ACTIVATION_U8_F32_U8(act_name, act_func) \ +__kernel void grucell_activation_z_h_U8_F32toU8_##act_name( \ + __read_only image2d_t hstate_in, \ + __read_only image2d_t input_z_conv, \ + __read_only image2d_t input_h_conv, \ + __read_only image2d_t hstate_z_conv, \ + __read_only image2d_t hstate_h_conv, \ + __write_only image2d_t output, \ + __write_only image2d_t hstate_out, \ + float input_scale, float input_tail, float output_scale, float output_zp) \ +{ \ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \ + float4 src0, src1, src2, src3; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 h_tm = convert_float4(read_imageui(hstate_in, coord_in.xy)); \ + float4 h1 = read_imagef(hstate_h_conv, coord_in.xy); \ + float4 h0 = read_imagef(input_h_conv, coord_in.xy); \ + float4 z0 = read_imagef(input_z_conv, coord_in.xy); \ + float4 z1 = read_imagef(hstate_z_conv, coord_in.xy); \ + \ + h_tm = h_tm * input_scale + input_tail; \ + float4 h = h0 + h1; \ + float4 z = z0 + z1; \ + z.x = act_func(z.x); \ + h = tanh_func(h.x); \ + float4 
dst = (1 - z ) * h + z * h_tm; \ + dst = dst * output_scale + output_zp; \ + uint4 result = convert_uint4_sat_rte(dst); \ + write_imageui(output, coord_in.xy, result); \ + write_imageui(hstate_out, coord_in.xy, result); \ +} +GRUCELL_ACTIVATION_U8_F32_U8(SIGMOID, sigmoid) +//GRUCELL_ACTIVATION_U8_F32_U8(HARD_SIGMOID, hard_sigmoid) + +#define GRUCELL_ACTIVATION_F32_F32_F32(act_name, act_func) \ +__kernel void grucell_activation_z_h_F32_F32toF32_##act_name( \ + __read_only image2d_t hstate_in, \ + __read_only image2d_t input_z_conv, \ + __read_only image2d_t input_h_conv, \ + __read_only image2d_t hstate_z_conv, \ + __read_only image2d_t hstate_h_conv, \ + __write_only image2d_t output, \ + __write_only image2d_t hstate_out, \ + float input_scale, float input_tail, float output_scale, float output_zp) \ +{ \ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \ + float4 src0, src1, src2, src3; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 h1 = read_imagef(hstate_h_conv, coord_in.xy); \ + float4 h0 = read_imagef(input_h_conv, coord_in.xy); \ + float4 z0 = read_imagef(input_z_conv, coord_in.xy); \ + float4 z1 = read_imagef(hstate_z_conv, coord_in.xy); \ + float4 h_tm = read_imagef(hstate_in, coord_in.xy); \ + \ + float4 h = h0 + h1; \ + float4 z = z0 + z1; \ + z.x = act_func(z.x); \ + h = tanh_func(h.x); \ + float4 dst = (1 - z ) * h + z * h_tm; \ + write_imagef(output, coord_in.xy, dst); \ + write_imagef(hstate_out, coord_in.xy, dst); \ +} + +GRUCELL_ACTIVATION_F32_F32_F32(SIGMOID, sigmoid) +//GRUCELL_ACTIVATION_U8_F32_U8(HARD_SIGMOID, hard_sigmoid) + +#define GRUCELL_ACTIVATION_I32_F32_I32(act_name, act_func) \ +__kernel void grucell_activation_z_h_I32_F32toI32_##act_name( \ + __read_only image2d_t hstate_in, \ + __read_only image2d_t input_z_conv, \ + __read_only image2d_t input_h_conv, \ + __read_only image2d_t hstate_z_conv, \ + __read_only image2d_t hstate_h_conv, \ + __write_only image2d_t output, \ + __write_only image2d_t hstate_out, \ + float input_scale, float input_tail, float output_scale, float output_zp) \ +{ \ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \ + float4 src0, src1, src2, src3; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 h_tm = convert_float4(read_imagei(hstate_in, coord_in.xy)); \ + float4 h1 = read_imagef(hstate_h_conv, coord_in.xy); \ + float4 h0 = read_imagef(input_h_conv, coord_in.xy); \ + float4 z0 = read_imagef(input_z_conv, coord_in.xy); \ + float4 z1 = read_imagef(hstate_z_conv, coord_in.xy); \ + \ + h_tm = h_tm * input_scale + input_tail; \ + float4 h = h0 + h1; \ + float4 z = z0 + z1; \ + z.x = act_func(z.x); \ + h = tanh_func(h.x); \ + float4 dst = (1 - z ) * h + z * h_tm; \ + dst = dst * output_scale + output_zp; \ + int4 result = convert_int4_sat_rte(dst); \ + write_imagei(output, coord_in.xy, result); \ + write_imagei(hstate_out, coord_in.xy, result); \ +} +GRUCELL_ACTIVATION_I32_F32_I32(SIGMOID, sigmoid) +//GRUCELL_ACTIVATION_U8_F32_U8(HARD_SIGMOID, hard_sigmoid) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/grucell_h_times_activation_r.cl b/src/tim/vx/internal/src/libnnext/ops/cl/grucell_h_times_activation_r.cl new file mode 100644 index 0000000..e36024f --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/grucell_h_times_activation_r.cl @@ -0,0 +1,87 @@ +#define logE (1.44269502f) +#define twoLogE (logE * 2.0f) + +float sigmoid(float x) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float hard_sigmoid(float x) +{ + x = 
0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} + +#define GRUCELL_H_TIMES_R_U8_F32_F32(act_name, act_func) \ +__kernel void grucell_h_times_activation_r_U8_F32toF32_##act_name( \ + __read_only image2d_t hstate_in, \ + __read_only image2d_t input_r_conv, \ + __read_only image2d_t hstate_r_conv, \ + __write_only image2d_t output, \ + float input_scale, float input_tail) \ +{ \ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \ + float4 src0, src1, src2, src3; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 r0 = read_imagef(input_r_conv, coord_in.xy); \ + float4 r1 = read_imagef(hstate_r_conv, coord_in.xy); \ + float4 h_tm = convert_float4(read_imageui(hstate_in, coord_in.xy)); \ + \ + float4 r = r0 + r1; \ + r.x = act_func(r.x); \ + h_tm = h_tm * input_scale + input_tail; \ + float4 r_times_h = r * h_tm; \ + write_imagef(output, coord_in.xy, r_times_h); \ +} +GRUCELL_H_TIMES_R_U8_F32_F32(SIGMOID, sigmoid) +//GRUCELL_H_TIMES_R_U8_F32_F32(HARD_SIGMOID, hard_sigmoid) + +#define GRUCELL_H_TIMES_R_F32_F32_F32(act_name, act_func) \ +__kernel void grucell_h_times_activation_r_F32_F32toF32_##act_name( \ + __read_only image2d_t hstate_in, \ + __read_only image2d_t input_r_conv, \ + __read_only image2d_t hstate_r_conv, \ + __write_only image2d_t output, \ + float input_scale, float input_tail) \ +{ \ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \ + float4 src0, src1, src2, src3; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 r0 = read_imagef(input_r_conv, coord_in.xy); \ + float4 r1 = read_imagef(hstate_r_conv, coord_in.xy); \ + float4 h_tm = read_imagef(hstate_in, coord_in.xy); \ + \ + float4 r = r0 + r1; \ + r.x = act_func(r.x); \ + float4 r_times_h = r * h_tm; \ + write_imagef(output, coord_in.xy, r_times_h); \ +} + +GRUCELL_H_TIMES_R_F32_F32_F32(SIGMOID, sigmoid) +//GRUCELL_H_TIMES_R_F32_F32_F32(HARD_SIGMOID, hard_sigmoid) + +#define GRUCELL_H_TIMES_R_I32_F32_F32(act_name, act_func) \ +__kernel void grucell_h_times_activation_r_I32_F32toI32_##act_name( \ + __read_only image2d_t hstate_in, \ + __read_only image2d_t input_r_conv, \ + __read_only image2d_t hstate_r_conv, \ + __write_only image2d_t output, \ + float input_scale, float input_tail) \ +{ \ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \ + float4 src0, src1, src2, src3; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 r0 = read_imagef(input_r_conv, coord_in.xy); \ + float4 r1 = read_imagef(hstate_r_conv, coord_in.xy); \ + float4 h_tm = convert_float4(read_imagei(hstate_in, coord_in.xy)); \ + \ + float4 r = r0 + r1; \ + r.x = act_func(r.x); \ + h_tm = h_tm * input_scale + input_tail; \ + float4 r_times_h = r * h_tm; \ + write_imagef(output, coord_in.xy, r_times_h); \ +} +GRUCELL_H_TIMES_R_I32_F32_F32(SIGMOID, sigmoid) +//GRUCELL_H_TIMES_R_I32_F32_F32(HARD_SIGMOID, hard_sigmoid) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/grucell_reset_after_activation.cl b/src/tim/vx/internal/src/libnnext/ops/cl/grucell_reset_after_activation.cl new file mode 100644 index 0000000..a47b32d --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/grucell_reset_after_activation.cl @@ -0,0 +1,144 @@ +#define logE (1.44269502f) +#define twoLogE (logE * 2.0f) + +float sigmoid(float x) +{ + x *= -logE; + x = 1 + exp2(x); + return 1 / x; +} +float hard_sigmoid(float x) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float tanh_func(float x) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1 / 
x; + return 2 * x - 1; +} + + +#define GRUCELL_ACTIVATION_U8_F32_U8(act_name, act_func) \ +__kernel void grucell_reset_after_activation_U8_F32toU8_##act_name( \ + __read_only image2d_t hstate_in, \ + __read_only image2d_t input_z_conv, \ + __read_only image2d_t input_r_conv, \ + __read_only image2d_t input_h_conv, \ + __read_only image2d_t hstate_z_conv, \ + __read_only image2d_t hstate_r_conv, \ + __read_only image2d_t hstate_h_conv, \ + __write_only image2d_t output, \ + __write_only image2d_t hstate_out, \ + float input_scale, float input_tail, float output_scale, float output_zp) \ +{ \ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \ + float4 src0, src1, src2, src3; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 r0 = read_imagef(input_r_conv, coord_in.xy); \ + float4 r1 = read_imagef(hstate_r_conv, coord_in.xy); \ + float4 h1 = read_imagef(hstate_h_conv, coord_in.xy); \ + float4 h_tm = convert_float4(read_imageui(hstate_in, coord_in.xy)); \ + float4 h0 = read_imagef(input_h_conv, coord_in.xy); \ + float4 z0 = read_imagef(input_z_conv, coord_in.xy); \ + float4 z1 = read_imagef(hstate_z_conv, coord_in.xy); \ + \ + float4 r = r0 + r1; \ + r.x = act_func(r.x); \ + h_tm = h_tm * input_scale + input_tail; \ + float4 r_times_h = r * h1; \ + float4 h = h0 + r_times_h; \ + float4 z = z0 + z1; \ + z.x = act_func(z.x); \ + h = tanh_func(h.x); \ + float4 dst = (1 - z ) * h + z * h_tm; \ + dst = dst * output_scale + output_zp; \ + uint4 result = convert_uint4_sat_rte(dst); \ + write_imageui(output, coord_in.xy, result); \ + write_imageui(hstate_out, coord_in.xy, result); \ +} +GRUCELL_ACTIVATION_U8_F32_U8(SIGMOID, sigmoid) +//GRUCELL_ACTIVATION_U8_F32_U8(HARD_SIGMOID, hard_sigmoid) + +#define GRUCELL_ACTIVATION_F32_F32_F32(act_name, act_func) \ +__kernel void grucell_reset_after_activation_F32_F32toF32_##act_name( \ + __read_only image2d_t hstate_in, \ + __read_only image2d_t input_z_conv, \ + __read_only image2d_t input_r_conv, \ + __read_only image2d_t input_h_conv, \ + __read_only image2d_t hstate_z_conv, \ + __read_only image2d_t hstate_r_conv, \ + __read_only image2d_t hstate_h_conv, \ + __write_only image2d_t output, \ + __write_only image2d_t hstate_out, \ + float input_scale, float input_tail, float output_scale, float output_zp) \ +{ \ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \ + float4 src0, src1, src2, src3; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 r0 = read_imagef(input_r_conv, coord_in.xy); \ + float4 r1 = read_imagef(hstate_r_conv, coord_in.xy); \ + float4 h1 = read_imagef(hstate_h_conv, coord_in.xy); \ + float4 h_tm = read_imagef(hstate_in, coord_in.xy); \ + float4 h0 = read_imagef(input_h_conv, coord_in.xy); \ + float4 z0 = read_imagef(input_z_conv, coord_in.xy); \ + float4 z1 = read_imagef(hstate_z_conv, coord_in.xy); \ + \ + float4 r = r0 + r1; \ + r.x = act_func(r.x); \ + float4 r_times_h = r * h1; \ + float4 h = h0 + r_times_h; \ + float4 z = z0 + z1; \ + z.x = act_func(z.x); \ + h = tanh_func(h.x); \ + float4 dst = (1 - z ) * h + z * h_tm; \ + write_imagef(output, coord_in.xy, dst); \ + write_imagef(hstate_out, coord_in.xy, dst); \ +} + +GRUCELL_ACTIVATION_F32_F32_F32(SIGMOID, sigmoid) +//GRUCELL_ACTIVATION_U8_F32_U8(HARD_SIGMOID, hard_sigmoid) + +#define GRUCELL_ACTIVATION_I32_F32_I32(act_name, act_func) \ +__kernel void grucell_reset_after_activation_I32_F32toI32_##act_name( \ + __read_only image2d_t hstate_in, \ + __read_only image2d_t input_z_conv, \ + __read_only image2d_t 
input_r_conv, \ + __read_only image2d_t input_h_conv, \ + __read_only image2d_t hstate_z_conv, \ + __read_only image2d_t hstate_r_conv, \ + __read_only image2d_t hstate_h_conv, \ + __write_only image2d_t output, \ + __write_only image2d_t hstate_out, \ + float input_scale, float input_tail, float output_scale, float output_zp) \ +{ \ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \ + float4 src0, src1, src2, src3; \ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \ + float4 r0 = read_imagef(input_r_conv, coord_in.xy); \ + float4 r1 = read_imagef(hstate_r_conv, coord_in.xy); \ + float4 h1 = read_imagef(hstate_h_conv, coord_in.xy); \ + float4 h_tm = convert_float4(read_imagei(hstate_in, coord_in.xy)); \ + float4 h0 = read_imagef(input_h_conv, coord_in.xy); \ + float4 z0 = read_imagef(input_z_conv, coord_in.xy); \ + float4 z1 = read_imagef(hstate_z_conv, coord_in.xy); \ + \ + float4 r = r0 + r1; \ + r.x = act_func(r.x); \ + h_tm = h_tm * input_scale + input_tail; \ + float4 r_times_h = r * h1; \ + float4 h = h0 + r_times_h; \ + float4 z = z0 + z1; \ + z.x = act_func(z.x); \ + h = tanh_func(h.x); \ + float4 dst = (1 - z ) * h + z * h_tm; \ + dst = dst * output_scale + output_zp; \ + int4 result = convert_int4_sat_rte(dst); \ + write_imagei(output, coord_in.xy, result); \ + write_imagei(hstate_out, coord_in.xy, result); \ +} +GRUCELL_ACTIVATION_I32_F32_I32(SIGMOID, sigmoid) +//GRUCELL_ACTIVATION_U8_F32_U8(HARD_SIGMOID, hard_sigmoid) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d.vx index dea29d2..8a56bb3 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_2d.vx @@ -1,6 +1,7 @@ #include "cl_viv_vx_ext.h" _viv_uniform float alpha; +_viv_uniform float beta; float4 eltwise_unary_sin(float4 x) { @@ -38,7 +39,7 @@ float4 eltwise_unary_neg(float4 x) float4 eltwise_unary_hard_sigmoid(float4 x) { - x = 0.2 * x + 0.5; + x = alpha * x + beta; x = clamp(x, 0, 1); return x; } @@ -136,7 +137,8 @@ _viv_uniform VXC_512Bits uniDatatoFp32Part1_4x4; __read_only image2d_array_t input, \ __write_only image2d_array_t output, \ int type, \ - float _alpha \ + float _alpha, \ + float _beta \ ) \ { \ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ @@ -285,7 +287,8 @@ _viv_uniform VXC_512Bits uniExtractOddData_2x8; __read_only image2d_array_t input, \ __write_only image2d_array_t output, \ int type, \ - float _alpha \ + float _alpha, \ + float _beta \ ) \ { \ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d.vx index 6da7605..3faa1f5 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/eltwise_unary_3d.vx @@ -1,6 +1,7 @@ #include "cl_viv_vx_ext.h" _viv_uniform float alpha; +_viv_uniform float beta; float4 eltwise_unary_sin(float4 x) { @@ -38,7 +39,7 @@ float4 eltwise_unary_neg(float4 x) float4 eltwise_unary_hard_sigmoid(float4 x) { - x = 0.2 * x + 0.5; + x = alpha * x + beta; x = clamp(x, 0, 1); return x; } @@ -136,7 +137,8 @@ __kernel void func_name##_##src_type_name##to##dst_type_name( \ __read_only image2d_array_t input, \ __write_only image2d_array_t output, \ int type, \ - float _alpha \ + float _alpha, \ + float _beta \ ) \ { \ int4 coord = (int4)(get_global_id(0), get_global_id(1), 
get_global_id(2), 0); \ @@ -284,7 +286,8 @@ _viv_uniform VXC_512Bits uniExtractOddData_2x8; __read_only image2d_array_t input, \ __write_only image2d_array_t output, \ int type, \ - float _alpha \ + float _alpha, \ + float _beta \ ) \ { \ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/extra_ending.vx b/src/tim/vx/internal/src/libnnext/ops/vx/extra_ending.vx new file mode 100644 index 0000000..52f51e5 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/extra_ending.vx @@ -0,0 +1,65 @@ +#include "cl_viv_vx_ext.h" + +__kernel void extra_ending_I16 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 data; + VXC_ReadImage2DArray(data, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + VXC_WriteImage2DArray(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void extra_ending_F16 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 data; + VXC_ReadImage2DArray(data, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + VXC_WriteImage2DArray(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void extra_ending_I8 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_char8 data; + VXC_ReadImage2DArray(data, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + VXC_WriteImage2DArray(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void extra_ending_U8 + ( + __read_only image2d_array_t input0, + __read_only image2d_array_t input, + __write_only image2d_array_t output + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_uchar8 data; + VXC_ReadImage2DArray(data, input, coord, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + VXC_WriteImage2DArray(output, coord, data, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_f16_scale.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_f16_scale.vx new file mode 100644 index 0000000..cb00ac9 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_f16_scale.vx @@ -0,0 +1,174 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform VXC_512Bits uniConvertEndInt16Fp32_4x4; + +_viv_uniform float outputScale; +_viv_uniform int output_ZP; + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16F32toF16( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output, + float eps, int is2D, float rSpaceOrg, int pStride) +{ + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + 
gidy * pStride), gidz, 0, 0); + vxc_short8 src0; + vxc_half8 in_h; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + bias_f = read_imagef(bias, coord_para.xy); + scale_f = read_imagef(scale, coord_para.xy); + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_float4 tmpData0, tmpData1; + vxc_short8 outval; + half4 tmpVal0, tmpVal1; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); + vxc_half8 dst; + + _viv_asm(COPY, in_h, src0, 16); + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertEndInt16Fp32_4x4); + + vxc_float4 norm; + norm = scale_vari * tmpData0 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = scale_vari * tmpData1 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16F32toF16_2D( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output, + float eps, int is2D, float rSpaceOrg, int pStride) +{ + int gidz = get_global_id(1); + int2 coord = (int2)(get_global_id(0), gidz); + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); + vxc_short8 src0; + vxc_half8 in_h; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + bias_f = read_imagef(bias, coord_para.xy); + scale_f = read_imagef(scale, coord_para.xy); + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_float4 tmpData0, tmpData1; + vxc_short8 outval; + half4 tmpVal0, tmpVal1; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); + vxc_half8 dst; + + _viv_asm(COPY, in_h, src0, 16); + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertEndInt16Fp32_4x4); + vxc_float4 norm; + norm = scale_vari * tmpData0 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = scale_vari * tmpData1 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16F32toU8( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output, + float eps, int is2D, float rSpaceOrg, int pStride) +{ + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); + vxc_short8 src0; + vxc_half8 in_h; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + bias_f = read_imagef(bias, 
coord_para.xy); + scale_f = read_imagef(scale, coord_para.xy); + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_float4 tmpData0, tmpData1; + vxc_uchar16 outval; + vxc_int4 tmpVal0, tmpVal1; + float alpha = outputScale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP; + + _viv_asm(COPY, in_h, src0, 16); + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertEndInt16Fp32_4x4); + + vxc_float4 norm; + norm = alpha * tmpData0 + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = alpha * tmpData1 + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16F32toU8_2D( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output, + float eps, int is2D, float rSpaceOrg, int pStride) +{ + int gidz = get_global_id(1); + int2 coord = (int2)(get_global_id(0), gidz); + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); + vxc_short8 src0; + vxc_half8 in_h; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + bias_f = read_imagef(bias, coord_para.xy); + scale_f = read_imagef(scale, coord_para.xy); + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_float4 tmpData0, tmpData1; + vxc_uchar16 outval; + vxc_int4 tmpVal0, tmpVal1; + float alpha = outputScale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP; + + _viv_asm(COPY, in_h, src0, 16); + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + UniFP16toFP32Lo4_dp4x4); + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertEndInt16Fp32_4x4); + vxc_float4 norm; + norm = alpha * tmpData0 + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = alpha * tmpData1 + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_i16_scale.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_i16_scale.vx new file mode 100644 index 0000000..397a5f8 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_i16_scale.vx @@ -0,0 +1,191 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; + +_viv_uniform float input_fl_scale; +_viv_uniform float inOut_fl_scale; +_viv_uniform float output_fl_scale; + +_viv_uniform VXC_512Bits uniConvertInt16Fp32Fst_4x4; +_viv_uniform VXC_512Bits uniConvertInt16Fp32Secd_4x4; +_viv_uniform VXC_512Bits uniConvertInt32toInt16_2x8; + + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16F32toF16( + image2d_array_t input, + image2d_t bias, + image2d_t scale, + image2d_t 
meanVari, + image2d_array_t output, + float eps, + int is2D, + float rSpaceOrg, int pStride) +{ + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); + vxc_short8 src0; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + bias_f = read_imagef(bias, coord_para.xy); + scale_f = read_imagef(scale, coord_para.xy); + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_float4 tmpData0, tmpData1; + vxc_short8 outval; + half4 tmpVal0, tmpVal1; + float alpha = input_fl_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); + vxc_half8 dst; + + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16Fp32Fst_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16Fp32Secd_4x4); + + vxc_float4 norm; + norm = alpha * tmpData0 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData1 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16F32toF16_2D( + image2d_array_t input, + image2d_t bias, + image2d_t scale, + image2d_t meanVari, + image2d_array_t output, + float eps, + int is2D, + float rSpaceOrg, int pStride) +{ + int gidz = get_global_id(1); + int2 coord = (int2)(get_global_id(0), gidz); + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); + vxc_short8 src0; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + bias_f = read_imagef(bias, coord_para.xy); + scale_f = read_imagef(scale, coord_para.xy); + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_float4 tmpData0, tmpData1; + vxc_short8 outval; + half4 tmpVal0, tmpVal1; + float alpha = input_fl_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); + vxc_half8 dst; + + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16Fp32Fst_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16Fp32Secd_4x4); + vxc_float4 norm; + norm = alpha * tmpData0 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData1 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16F32toI16( + image2d_array_t input, + image2d_t bias, + image2d_t scale, + image2d_t meanVari, + image2d_array_t output, + float eps, + int is2D, + float rSpaceOrg, int pStride) +{ + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); + int4 
coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); + vxc_short8 src0, src2; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + bias_f = read_imagef(bias, coord_para.xy); + scale_f = read_imagef(scale, coord_para.xy); + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_int4 tmpVal0, tmpVal1; + vxc_float4 tmpData0, tmpData1; + float alpha = inOut_fl_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale; + + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16Fp32Fst_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16Fp32Secd_4x4); + vxc_float4 norm; + norm = tmpData0 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData1 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toInt16_2x8); + VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16F32toI16_2D( + image2d_array_t input, + image2d_t bias, + image2d_t scale, + image2d_t meanVari, + image2d_array_t output, + float eps, + int is2D, + float rSpaceOrg, int pStride) +{ + int gidz = get_global_id(1); + int2 coord = (int2)(get_global_id(0), gidz); + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); + vxc_short8 src0, src2; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + bias_f = read_imagef(bias, coord_para.xy); + scale_f = read_imagef(scale, coord_para.xy); + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_int4 tmpVal0, tmpVal1; + vxc_float4 tmpData0, tmpData1; + float alpha = inOut_fl_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale; + + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16Fp32Fst_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniConvertInt16Fp32Secd_4x4); + vxc_float4 norm; + norm = tmpData0 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData1 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\ + uniConvertInt32toInt16_2x8); + VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_i8_scale.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_i8_scale.vx new file mode 100644 index 0000000..350e425 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_i8_scale.vx @@ -0,0 +1,186 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform float input_fl_scale; + +_viv_uniform VXC_512Bits uniConvertDirInt8Fp32_4x4; +_viv_uniform VXC_512Bits uniConvertEndInt8Fp32_4x4; +_viv_uniform VXC_512Bits uniConvertTrdInt8Fp32_4x4; 
+_viv_uniform VXC_512Bits uniConvertFthInt8Fp32_4x4; + +_viv_uniform float inOut_fl_scale; +_viv_uniform float output_fl_scale; + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8F32toF16( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) +{ + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); + vxc_char16 src0; + vxc_short8 outval; + vxc_half8 dst; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + bias_f = read_imagef(bias, coord_para.xy); + scale_f = read_imagef(scale, coord_para.xy); + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; + half4 tmpVal0, tmpVal1; + float alpha = input_fl_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); + + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4); + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4); + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4); + + norm = alpha * tmpData0 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData1 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord.x += 8; + norm = alpha * tmpData2 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData3 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8F32toF16_2D( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) +{ + int gidz = get_global_id(1); + int2 coord = (int2)(get_global_id(0), gidz); + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); + vxc_char16 src0; + vxc_short8 outval; + vxc_half8 dst; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + bias_f = read_imagef(bias, coord_para.xy); + scale_f = read_imagef(scale, coord_para.xy); + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; + half4 tmpVal0, tmpVal1; + float alpha = input_fl_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); + + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4); + 
VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4); + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4); + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4); + norm = alpha * tmpData0 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData1 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.x += 8; + norm = alpha * tmpData2 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData3 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8F32toI8( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) +{ + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); + vxc_char16 src0, src2; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + bias_f = read_imagef(bias, coord_para.xy); + scale_f = read_imagef(scale, coord_para.xy); + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_int4 tmpVal0, tmpVal1; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; + float alpha = inOut_fl_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale; + + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4); + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4); + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4); + norm = tmpData0 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData1 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + norm = tmpData2 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData3 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8F32toI8_2D( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) +{ + int gidz = get_global_id(1); + int2 coord = (int2)(get_global_id(0), gidz); + int4 coord_para = 
(int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); + vxc_char16 src0, src2; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + bias_f = read_imagef(bias, coord_para.xy); + scale_f = read_imagef(scale, coord_para.xy); + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + scale_vari = scale_f.s0 * mean_vari.s1; + vxc_int4 tmpVal0, tmpVal1; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; + float alpha = inOut_fl_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale; + + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4); + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4); + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4); + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4); + norm = tmpData0 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData1 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + norm = tmpData2 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData3 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_u8.vx index af20584..c08a996 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_u8.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_u8.vx @@ -243,6 +243,87 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8toU8_ float alpha = scale_inOut * scale_vari; bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP; + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4); + norm = tmpData0 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData1 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + norm = tmpData2 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData3 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8F32toU8( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) +{ + int 
gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); + vxc_uchar16 src0, src2; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + bias_f = read_imagef(bias, coord_para.xy); + scale_f = read_imagef(scale, coord_para.xy); + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + scale_vari = scale_f.s0 * mean_vari.s1; + short zp = inputZP; + vxc_int4 tmpVal0, tmpVal1; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; + float alpha = scale_inOut * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP; + + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4); + norm = tmpData0 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData1 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + norm = tmpData2 * alpha + bias_val; + tmpVal0 = convert_int4_rte(norm); + norm = tmpData3 * alpha + bias_val; + tmpVal1 = convert_int4_rte(norm); + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8F32toU8_2D( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) +{ + int gidz = get_global_id(1); + int2 coord = (int2)(get_global_id(0), gidz); + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); + vxc_uchar16 src0, src2; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + bias_f = read_imagef(bias, coord_para.xy); + scale_f = read_imagef(scale, coord_para.xy); + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + scale_vari = scale_f.s0 * mean_vari.s1; + short zp = inputZP; + vxc_int4 tmpVal0, tmpVal1; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; + float alpha = scale_inOut * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP; + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4); VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4); VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4); diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_u8_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_u8_f16.vx index 3c1b892..a1f4ce0 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_u8_f16.vx +++ 
b/src/tim/vx/internal/src/libnnext/ops/vx/group_normalization_u8_f16.vx @@ -112,3 +112,96 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8toF16 _viv_asm(COPY, outval, dst, 16); VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); } + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8F32toF16( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) +{ + int gidy = get_global_id(1); + int gidz = get_global_id(2); + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0); + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0); + vxc_uchar16 src0; + vxc_short8 outval; + vxc_half8 dst; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + bias_f = read_imagef(bias, coord_para.xy); + scale_f = read_imagef(scale, coord_para.xy); + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + scale_vari = scale_f.s0 * mean_vari.s1; + short zp = inputZP; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; + half4 tmpVal0, tmpVal1; + float alpha = input_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); + + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4); + norm = alpha * tmpData0 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData1 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); + coord.x += 8; + norm = alpha * tmpData2 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData3 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0)); +} + +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8F32toF16_2D( + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride) +{ + int gidz = get_global_id(1); + int2 coord = (int2)(get_global_id(0), gidz); + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0); + vxc_uchar16 src0; + vxc_short8 outval; + vxc_half8 dst; + float scale_vari, bias_val; + vxc_float4 bias_f, scale_f; + + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy); + bias_f = read_imagef(bias, coord_para.xy); + scale_f = read_imagef(scale, coord_para.xy); + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + scale_vari = scale_f.s0 * mean_vari.s1; + short zp = inputZP; + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm; + half4 
tmpVal0, tmpVal1; + float alpha = input_scale * scale_vari; + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0); + + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4); + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4); + norm = alpha * tmpData0 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData1 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + coord.x += 8; + norm = alpha * tmpData2 + bias_val; + _viv_asm(CONV, tmpVal0, norm); + norm = alpha * tmpData3 + bias_val; + _viv_asm(CONV, tmpVal1, norm); + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8); + _viv_asm(COPY, outval, dst, 16); + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/grucell_activation_z_h.vx b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_activation_z_h.vx new file mode 100644 index 0000000..c1266fc --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_activation_z_h.vx @@ -0,0 +1,126 @@ +#include "cl_viv_vx_ext.h" + +#define logE (1.44269502f) +#define twoLogE (logE * 2.0f) + +float4 sigmoid_func(float4 x) +{ + x *= -logE; + x = 1.0f + exp2(x); + return 1.0f / x; +} +float4 hard_sigmoid(float4 x) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tanh_func(float4 x) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1.0f / x; + return 2 * x - 1; +} + +_viv_uniform VXC_512Bits uniF16PlusF16_0_4x4; +_viv_uniform VXC_512Bits uniF16PlusF16_1_4x4; +_viv_uniform VXC_512Bits uniConvertF16_0_4x4; +_viv_uniform VXC_512Bits uniConvertF16_1_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; + +#define GRUCELL_F16_F16TOF16(act_name, act_func) \ +__kernel void grucell_activation_z_h_F16_F16toF16_##act_name( \ + __read_only image2d_t hstate_in, \ + __read_only image2d_t input_z_conv, \ + __read_only image2d_t input_h_conv, \ + __read_only image2d_t hstate_z_conv, \ + __read_only image2d_t hstate_h_conv, \ + __write_only image2d_t output, \ + __write_only image2d_t hstate_out \ + ) \ +{ \ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \ + vxc_short8 v0, v1, v2, v3, v4, v5, v6; \ + vxc_half8 src0, src1, src2, src3, src4, src5, src6; \ + VXC_ReadImage(v2, hstate_h_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src2, v2, 16); \ + VXC_ReadImage(v4, input_h_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src4, v4, 16); \ + VXC_ReadImage(v5, input_z_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src5, v5, 16); \ + VXC_ReadImage(v6, hstate_z_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src6, v6, 16); \ + VXC_ReadImage(v3, hstate_in, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src3, v3, 16); \ + \ + float4 h; \ + VXC_DP4x4(h, src2, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniF16PlusF16_0_4x4); \ + float4 z; \ + VXC_DP4x4(z, src5, src6, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \ + z = act_func(z); \ + h = tanh_func(h); \ + float4 h_tm; \ + VXC_DP4x4(h_tm, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \ + float4 result = (1 - z) * h + z * h_tm; \ + half4 dst0; \ + _viv_asm(CONV_RTE, dst0, result); \ + vxc_half4 dst1; \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + vxc_short4 dst; \ + _viv_asm(COPY, dst, dst1, 8); \ + VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(hstate_out, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +GRUCELL_F16_F16TOF16(SIGMOID, sigmoid_func) + +_viv_uniform float hstate_in_scale; +_viv_uniform float hstate_in_tail; +_viv_uniform float output_scale; +_viv_uniform float output_zp; +#define GRUCELL_QNT_F16TO_QNT(name0, name1, act_name, act_func, src0_type, dst_type) \ +__kernel void grucell_activation_z_h_##name0##_F16to##name1##_##act_name( \ + __read_only image2d_t hstate_in, \ + __read_only image2d_t input_z_conv, \ + __read_only image2d_t input_h_conv, \ + __read_only image2d_t hstate_z_conv, \ + __read_only image2d_t hstate_h_conv, \ + __write_only image2d_t output, \ + __write_only image2d_t hstate_out \ + ) \ +{ \ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \ + src0_type src3; \ + vxc_short8 v0, v1, v2, v3, v4, v5, v6; \ + vxc_half8 src0, src1, src2, src4, src5, src6; \ + VXC_ReadImage(v2, hstate_h_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src2, v2, 16); \ + VXC_ReadImage(v4, input_h_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src4, v4, 16); \ + VXC_ReadImage(v5, input_z_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src5, v5, 16); \ + VXC_ReadImage(v6, hstate_z_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src6, v6, 16); \ + VXC_ReadImage(src3, hstate_in, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + float4 h; \ + VXC_DP4x4(h, src2, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \ + float4 z; \ + VXC_DP4x4(z, src5, src6, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \ + z = act_func(z); \ + h = tanh_func(h); \ + float4 h_tm; \ + VXC_DP4x4(h_tm, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \ + h_tm = h_tm * hstate_in_scale + hstate_in_tail; \ + float4 result = (1 - z) * h + z * h_tm; \ + result = result * output_scale + output_zp; \ + int4 dst0; \ + _viv_asm(CONV_RTE, dst0, result); \ + dst_type dst; \ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(hstate_out, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +GRUCELL_QNT_F16TO_QNT(U8, U8, SIGMOID, sigmoid_func, vxc_uchar8, vxc_uchar8) +GRUCELL_QNT_F16TO_QNT(I8, I8, SIGMOID, sigmoid_func, vxc_char8, vxc_char8) +GRUCELL_QNT_F16TO_QNT(I16, I16, SIGMOID, sigmoid_func, vxc_short8, vxc_short8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/grucell_h_times_activation_r.vx b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_h_times_activation_r.vx new file mode 100644 index 0000000..a9c8d44 --- /dev/null +++ 
b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_h_times_activation_r.vx @@ -0,0 +1,96 @@ +#include "cl_viv_vx_ext.h" + +#define logE (1.44269502f) +#define twoLogE (logE * 2.0f) + +float4 sigmoid_func(float4 x) +{ + x *= -logE; + x = 1.0f + exp2(x); + return 1.0f / x; +} +float4 hard_sigmoid(float4 x) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} + +_viv_uniform VXC_512Bits uniF16PlusF16_0_4x4; +_viv_uniform VXC_512Bits uniF16PlusF16_1_4x4; +_viv_uniform VXC_512Bits uniConvertF16_0_4x4; +_viv_uniform VXC_512Bits uniConvertF16_1_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; + +#define GRUCELL_F16_F16TOF16(act_name, act_func) \ +__kernel void grucell_h_times_activation_r_F16_F16toF16_##act_name( \ + __read_only image2d_t hstate_in, \ + __read_only image2d_t input_r_conv, \ + __read_only image2d_t hstate_r_conv, \ + __write_only image2d_t output \ + ) \ +{ \ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \ + vxc_short8 v0, v1, v2, v3, v4, v5, v6; \ + vxc_half8 src0, src1, src2, src3, src4, src5, src6; \ + VXC_ReadImage(v0, input_r_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src0, v0, 16); \ + VXC_ReadImage(v1, hstate_r_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src1, v1, 16); \ + VXC_ReadImage(v3, hstate_in, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src3, v3, 16); \ + \ + float4 r; \ + VXC_DP4x4(r, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \ + r = act_func(r); \ + float4 h_tm; \ + VXC_DP4x4(h_tm, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \ + float4 result = r * h_tm; \ + half4 dst0; \ + _viv_asm(CONV_RTE, dst0, result); \ + vxc_half4 dst1; \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + vxc_short4 dst; \ + _viv_asm(COPY, dst, dst1, 8); \ + VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +GRUCELL_F16_F16TOF16(SIGMOID, sigmoid_func) + +_viv_uniform float hstate_in_scale; +_viv_uniform float hstate_in_tail; +#define GRUCELL_QNT_F16TO_F16(name0, act_name, act_func, src0_type) \ +__kernel void grucell_h_times_activation_r_##name0##_F16toF16_##act_name( \ + __read_only image2d_t hstate_in, \ + __read_only image2d_t input_r_conv, \ + __read_only image2d_t hstate_r_conv, \ + __write_only image2d_t output \ + ) \ +{ \ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \ + src0_type src3; \ + vxc_short8 v0, v1, v2, v3, v4, v5, v6; \ + vxc_half8 src0, src1, src2, src4, src5, src6; \ + VXC_ReadImage(v0, input_r_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src0, v0, 16); \ + VXC_ReadImage(v1, hstate_r_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src1, v1, 16); \ + VXC_ReadImage(src3, hstate_in, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + float4 r; \ + VXC_DP4x4(r, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \ + r = act_func(r); \ + float4 h_tm; \ + VXC_DP4x4(h_tm, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \ + h_tm = h_tm * hstate_in_scale + hstate_in_tail; \ + float4 result = r * h_tm; \ + half4 dst0; \ + _viv_asm(CONV_RTE, dst0, result); \ + vxc_half8 dst1; \ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + vxc_short4 dst; \ + _viv_asm(COPY, dst, dst1, 
8); \ + VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +GRUCELL_QNT_F16TO_F16(U8, SIGMOID, sigmoid_func, vxc_uchar8) +GRUCELL_QNT_F16TO_F16(I8, SIGMOID, sigmoid_func, vxc_char8) +GRUCELL_QNT_F16TO_F16(I16, SIGMOID, sigmoid_func, vxc_short8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/grucell_reset_after_activation.vx b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_reset_after_activation.vx new file mode 100644 index 0000000..77fdcc9 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/grucell_reset_after_activation.vx @@ -0,0 +1,148 @@ +#include "cl_viv_vx_ext.h" + +#define logE (1.44269502f) +#define twoLogE (logE * 2.0f) + +float4 sigmoid_func(float4 x) +{ + x *= -logE; + x = 1.0f + exp2(x); + return 1.0f / x; +} +float4 hard_sigmoid(float4 x) +{ + x = 0.2 * x + 0.5; + x = clamp(x, 0, 1); + return x; +} +float4 tanh_func(float4 x) +{ + x *= -twoLogE; + x = 1 + exp2(x); + x = 1.0f / x; + return 2 * x - 1; +} + +_viv_uniform VXC_512Bits uniF16PlusF16_0_4x4; +_viv_uniform VXC_512Bits uniF16PlusF16_1_4x4; +_viv_uniform VXC_512Bits uniConvertF16_0_4x4; +_viv_uniform VXC_512Bits uniConvertF16_1_4x4; +_viv_uniform VXC_512Bits uniExtract8Data_2x8; + +#define GRUCELL_F16_F16TOF16(act_name, act_func) \ +__kernel void grucell_reset_after_activation_F16_F16toF16_##act_name( \ + __read_only image2d_t hstate_in, \ + __read_only image2d_t input_z_conv, \ + __read_only image2d_t input_r_conv, \ + __read_only image2d_t input_h_conv, \ + __read_only image2d_t hstate_z_conv, \ + __read_only image2d_t hstate_r_conv, \ + __read_only image2d_t hstate_h_conv, \ + __write_only image2d_t output, \ + __write_only image2d_t hstate_out \ + ) \ +{ \ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \ + vxc_short8 v0, v1, v2, v3, v4, v5, v6; \ + vxc_half8 src0, src1, src2, src3, src4, src5, src6; \ + VXC_ReadImage(v0, input_r_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src0, v0, 16); \ + VXC_ReadImage(v1, hstate_r_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src1, v1, 16); \ + VXC_ReadImage(v2, hstate_h_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src2, v2, 16); \ + VXC_ReadImage(v4, input_h_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src4, v4, 16); \ + VXC_ReadImage(v5, input_z_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src5, v5, 16); \ + VXC_ReadImage(v6, hstate_z_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src6, v6, 16); \ + VXC_ReadImage(v3, hstate_in, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src3, v3, 16); \ + \ + float4 r; \ + VXC_DP4x4(r, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \ + r = act_func(r); \ + float4 h0, h1; \ + VXC_DP4x4(h1, src2, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \ + VXC_DP4x4(h0, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \ + float4 h = h0 + r * h1; \ + float4 z; \ + VXC_DP4x4(z, src5, src6, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \ + z = act_func(z); \ + h = tanh_func(h); \ + float4 h_tm; \ + VXC_DP4x4(h_tm, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \ + float4 result = (1 - z) * h + z * h_tm; \ + half4 dst0; \ + _viv_asm(CONV_RTE, dst0, result); \ + vxc_half4 dst1; \ + 
VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + vxc_short4 dst; \ + _viv_asm(COPY, dst, dst1, 8); \ + VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(hstate_out, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +GRUCELL_F16_F16TOF16(SIGMOID, sigmoid_func) + +_viv_uniform float hstate_in_scale; +_viv_uniform float hstate_in_tail; +_viv_uniform float output_scale; +_viv_uniform float output_zp; +#define GRUCELL_QNT_F16TO_QNT(name0, name1, act_name, act_func, src0_type, dst_type) \ +__kernel void grucell_reset_after_activation_##name0##_F16to##name1##_##act_name( \ + __read_only image2d_t hstate_in, \ + __read_only image2d_t input_z_conv, \ + __read_only image2d_t input_r_conv, \ + __read_only image2d_t input_h_conv, \ + __read_only image2d_t hstate_z_conv, \ + __read_only image2d_t hstate_r_conv, \ + __read_only image2d_t hstate_h_conv, \ + __write_only image2d_t output, \ + __write_only image2d_t hstate_out \ + ) \ +{ \ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \ + src0_type src3; \ + vxc_short8 v0, v1, v2, v3, v4, v5, v6; \ + vxc_half8 src0, src1, src2, src4, src5, src6; \ + VXC_ReadImage(v0, input_r_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src0, v0, 16); \ + VXC_ReadImage(v1, hstate_r_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src1, v1, 16); \ + VXC_ReadImage(v2, hstate_h_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src2, v2, 16); \ + VXC_ReadImage(v4, input_h_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src4, v4, 16); \ + VXC_ReadImage(v5, input_z_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src5, v5, 16); \ + VXC_ReadImage(v6, hstate_z_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + _viv_asm(COPY, src6, v6, 16); \ + VXC_ReadImage(src3, hstate_in, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + \ + float4 r; \ + VXC_DP4x4(r, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \ + r = act_func(r); \ + float4 h0, h1; \ + VXC_DP4x4(h1, src2, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \ + VXC_DP4x4(h0, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \ + float4 h = h0 + r * h1; \ + float4 z; \ + VXC_DP4x4(z, src5, src6, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \ + z = act_func(z); \ + h = tanh_func(h); \ + float4 h_tm; \ + VXC_DP4x4(h_tm, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \ + h_tm = h_tm * hstate_in_scale + hstate_in_tail; \ + float4 result = (1 - z) * h + z * h_tm; \ + result = result * output_scale + output_zp; \ + int4 dst0; \ + _viv_asm(CONV_RTE, dst0, result); \ + dst_type dst; \ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \ + VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ + VXC_WriteImage(hstate_out, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \ +} +GRUCELL_QNT_F16TO_QNT(U8, U8, SIGMOID, sigmoid_func, vxc_uchar8, vxc_uchar8) +GRUCELL_QNT_F16TO_QNT(I8, I8, SIGMOID, sigmoid_func, vxc_char8, vxc_char8) +GRUCELL_QNT_F16TO_QNT(I16, I16, SIGMOID, sigmoid_func, vxc_short8, vxc_short8) diff --git 
a/src/tim/vx/internal/src/libnnext/ops/vx/l2normalizescale_axis0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/l2normalizescale_axis0.vx index a90f1ff..c358585 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/l2normalizescale_axis0.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/l2normalizescale_axis0.vx @@ -254,9 +254,9 @@ L2NORMSCALE_AXIS0_2D(I8, F16, I8, char, vxc_char8, vxc_char8, r_inputScale, __kernel __attribute__((reqd_work_group_size(16, 1, 1))) \ void l2normalizescale_axis0_U8_##in1_name##to##out_name##_2D \ (\ - __read_only image2d_array_t input,\ - __read_only image2d_array_t scale,\ - __write_only image2d_array_t output,\ + __read_only image2d_t input,\ + __read_only image2d_t scale,\ + __write_only image2d_t output,\ int axis\ )\ { \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_gray_2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_gray_2.vx new file mode 100644 index 0000000..3396163 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_gray_2.vx @@ -0,0 +1,69 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniBilinear_4over3_l00_2x8; +_viv_uniform VXC_512Bits uniBilinear_4over3_l10_2x8; +_viv_uniform VXC_512Bits uniBilinear_4over3_l01_4x4; +_viv_uniform VXC_512Bits uniBilinear_4over3_l11_4x4; +_viv_uniform VXC_512Bits uniBilinear_4over3_l21_4x4; + +__kernel void pre_process_gray_4over3_U8toU8 + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + global int *xRatio, + global int *yRatio, + global int *xOffset, + global int *yOffset, + float mean, + float f32Var + ) +{ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1)); + + vxc_uchar16 src0, src1, src2, src3; + + VXC_ReadImage(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + coord_in.xy = (coord_in.xy >> 2) * 3; + coord_in.zw = coord_in.yy + (int2)(1, 2); + + vxc_uchar16 dst0, dst1, dst2; + VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l00_2x8); + VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l10_2x8); + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4); + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4); + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4); + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4); + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4); + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4); + + VXC_WriteImage(output, coord_in.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_in.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_in.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); +} + +__kernel void pre_process_gray_half_U8toU8 + ( + __read_only image2d_array_t input, + __write_only 
image2d_array_t output, + global int *xRatio, + global int *yRatio, + global int *xOffset, + global int *yOffset, + float mean, + float f32Var + ) +{ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1)); + + vxc_uchar16 src0; + + VXC_ReadImage(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + coord_in.xy = coord_in.xy >> 1; + + VXC_WriteImage(output, coord_in.xy, src0.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_nhwc.vx b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_nhwc.vx new file mode 100644 index 0000000..f0b3417 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/resize_bilinear_nhwc.vx @@ -0,0 +1,204 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniResize_x2_nhwc2_0_4x8; +_viv_uniform VXC_512Bits uniResize_x2_nhwc2_1_4x8; +_viv_uniform int out_height; + +__kernel void resize_bilinear_nhwc_U8toU8_2x_upsample_half_pixel_centers + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), 0); + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), 0); + coord_in.x = ((coord_out.x * 2 - 1) >> 2) - 1; + coord_in.y = ((coord_out.y * 2 - 1) >> 2); + coord_in.x = coord_out.x == 0 ? -2 : coord_in.x; + coord_in.y = coord_out.y == 0 ? -1 : coord_in.y; + + vxc_uchar16 in0, in1, in2, in3, result; + + VXC_ReadImage(in0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(in1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(in2, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(in3, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_DP4x8(result, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x2_nhwc2_0_4x8); + VXC_DP4x8(result, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x2_nhwc2_1_4x8); + VXC_WriteImage(output, coord_out, result, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_DP4x8(result, in2, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x2_nhwc2_0_4x8); + VXC_DP4x8(result, in2, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x2_nhwc2_1_4x8); + VXC_WriteImage(output, coord_out.xy, result, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_DP4x8(result, in1, in2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x2_nhwc2_0_4x8); + VXC_DP4x8(result, in1, in2, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x2_nhwc2_1_4x8); + VXC_WriteImage(output, coord_out.xy, result, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + coord_out.y++; + VXC_DP4x8(result, in3, in2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x2_nhwc2_0_4x8); + VXC_DP4x8(result, in3, in2, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x2_nhwc2_1_4x8); + VXC_WriteImage(output, coord_out.xy, result, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); +} + +_viv_uniform VXC_512Bits uniResize_x3_nhwc2_l00_2x8; +_viv_uniform VXC_512Bits uniResize_x3_nhwc2_l01_2x8; +_viv_uniform VXC_512Bits uniResize_x3_nhwc2_l02_2x8; +_viv_uniform VXC_512Bits uniResize_x3_nhwc2_l03_2x8; +_viv_uniform 
VXC_512Bits uniResize_x3_nhwc2_l10_4x4; +_viv_uniform VXC_512Bits uniResize_x3_nhwc2_l11_4x4; +_viv_uniform VXC_512Bits uniResize_x3_nhwc2_l12_4x4; +_viv_uniform VXC_512Bits uniResize_x3_nhwc2_l13_4x4; +_viv_uniform VXC_512Bits uniResize_x3_nhwc2_l14_4x4; +_viv_uniform VXC_512Bits uniResize_x3_nhwc2_l15_4x4; +_viv_uniform VXC_512Bits uniResize_x3_nhwc2_l16_4x4; +_viv_uniform VXC_512Bits uniResize_x3_nhwc2_l17_4x4; +__kernel void resize_bilinear_nhwc_U8toU8_3x_upsample_half_pixel_centers + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), 0, 0); + coord_in.x = (short)(coord_out.x - 1) / (short)6 * 2; + coord_in.x = coord_out.x == 0 ? -2 : coord_in.x; + coord_in.y = (short)(coord_out.y * 2 - 1) / (short)6; + coord_in.y = coord_out.y == 0 ? -1 : coord_in.y; + + vxc_uchar16 in0, in1, in2, in3, dst0, dst1, dst2; + + VXC_ReadImage(in0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(in1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(in2, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(in3, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + coord_out.zw = coord_out.xy + (int2)(16, 1); + + VXC_DP4x4(dst0, in1, in0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l10_4x4); + VXC_DP4x4(dst0, in1, in0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l11_4x4); + VXC_DP4x4(dst0, in1, in0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l12_4x4); + VXC_DP4x4(dst0, in1, in0, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l13_4x4); + VXC_DP4x4(dst1, in1, in0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l14_4x4); + VXC_DP4x4(dst1, in1, in0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l15_4x4); + VXC_DP4x4(dst1, in1, in0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l16_4x4); + VXC_DP4x4(dst1, in1, in0, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l17_4x4); + + VXC_WriteImage(output, coord_out.xy, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_out.zy, dst1, VXC_MODIFIER(0, 13, 0,VXC_RM_TowardZero, 0)); + + VXC_DP2x8(dst0, in1, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l00_2x8); + VXC_DP2x8(dst0, in1, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l01_2x8); + VXC_DP2x8(dst1, in1, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l02_2x8); + VXC_DP2x8(dst1, in1, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l03_2x8); + VXC_WriteImage(output, coord_out.xw, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_out.zw, dst1, VXC_MODIFIER(0, 13, 0,VXC_RM_TowardZero, 0)); + coord_out.yw += 2; + + VXC_DP4x4(dst0, in1, in2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l10_4x4); + VXC_DP4x4(dst0, in1, in2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l11_4x4); + VXC_DP4x4(dst0, in1, in2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l12_4x4); + 
VXC_DP4x4(dst0, in1, in2, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l13_4x4); + VXC_DP4x4(dst1, in1, in2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l14_4x4); + VXC_DP4x4(dst1, in1, in2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l15_4x4); + VXC_DP4x4(dst1, in1, in2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l16_4x4); + VXC_DP4x4(dst1, in1, in2, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l17_4x4); + VXC_WriteImage(output, coord_out.xy, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_out.zy, dst1, VXC_MODIFIER(0, 13, 0,VXC_RM_TowardZero, 0)); + + VXC_DP4x4(dst0, in2, in1, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l10_4x4); + VXC_DP4x4(dst0, in2, in1, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l11_4x4); + VXC_DP4x4(dst0, in2, in1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l12_4x4); + VXC_DP4x4(dst0, in2, in1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l13_4x4); + VXC_DP4x4(dst1, in2, in1, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l14_4x4); + VXC_DP4x4(dst1, in2, in1, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l15_4x4); + VXC_DP4x4(dst1, in2, in1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l16_4x4); + VXC_DP4x4(dst1, in2, in1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l17_4x4); + VXC_WriteImage(output, coord_out.xw, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_out.zw, dst1, VXC_MODIFIER(0, 13, 0,VXC_RM_TowardZero, 0)); + coord_out.yw += 2; + + VXC_DP2x8(dst0, in2, in2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l00_2x8); + VXC_DP2x8(dst0, in2, in2, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l01_2x8); + VXC_DP2x8(dst1, in2, in2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l02_2x8); + VXC_DP2x8(dst1, in2, in2, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l03_2x8); + VXC_WriteImage(output, coord_out.xy, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_out.zy, dst1, VXC_MODIFIER(0, 13, 0,VXC_RM_TowardZero, 0)); + + VXC_DP4x4(dst0, in2, in3, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l10_4x4); + VXC_DP4x4(dst0, in2, in3, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l11_4x4); + VXC_DP4x4(dst0, in2, in3, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l12_4x4); + VXC_DP4x4(dst0, in2, in3, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l13_4x4); + VXC_DP4x4(dst1, in2, in3, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l14_4x4); + VXC_DP4x4(dst1, in2, in3, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l15_4x4); + VXC_DP4x4(dst1, in2, in3, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l16_4x4); + VXC_DP4x4(dst1, in2, in3, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l17_4x4); + VXC_WriteImage(output, coord_out.xw, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_out.zw, dst1, VXC_MODIFIER(0, 13, 0,VXC_RM_TowardZero, 0)); +} + +_viv_uniform VXC_512Bits uniResize_x4_nhwc2_l00_4x8; +_viv_uniform VXC_512Bits uniResize_x4_nhwc2_l01_4x8; +_viv_uniform VXC_512Bits 
uniResize_x4_nhwc2_l10_4x8; +_viv_uniform VXC_512Bits uniResize_x4_nhwc2_l11_4x8; +__kernel void resize_bilinear_nhwc_U8toU8_4x_upsample_half_pixel_centers + ( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int align_corners, + int half_pixel_centers + ) +{ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1)); + int4 coord_in = (int4)(get_global_id(0), -1, 0, 0); + coord_in.x = ((coord_out.x - 3) >> 3) * 2; + coord_in.y = (coord_out.y * 2 - 3) >> 3; + coord_in.x = coord_out.x == 0 ? -2 : coord_in.x; + coord_in.y = coord_out.y == 0 ? -1 : coord_in.y; + + vxc_uchar16 in0, in1, in2, in3, dst0, dst1; + + VXC_ReadImage(in0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(in1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(in2, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + VXC_ReadImage(in3, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); + + coord_out.zw = coord_out.yy + (int2)(1, 2); + + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l00_4x8); + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l01_4x8); + VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l10_4x8); + VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l11_4x8); + VXC_WriteImage(output, coord_out.xy, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_out.xz, dst1, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + + coord_out.yz = coord_out.yz + (int2)(3, 3); + + VXC_DP4x8(dst0, in2, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l10_4x8); + VXC_DP4x8(dst0, in2, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l11_4x8); + VXC_DP4x8(dst1, in2, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l00_4x8); + VXC_DP4x8(dst1, in2, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l01_4x8); + VXC_WriteImage(output, coord_out.xw, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_out.xy, dst1, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + + coord_out.yw = coord_out.yw + (int2)(3, 3); + + VXC_DP4x8(dst0, in1, in2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l00_4x8); + VXC_DP4x8(dst0, in1, in2, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l01_4x8); + VXC_DP4x8(dst1, in1, in2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l10_4x8); + VXC_DP4x8(dst1, in1, in2, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l11_4x8); + VXC_WriteImage(output, coord_out.xz, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_out.xw, dst1, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + + coord_out.zw = coord_out.zw + (int2)(3, 3); + + VXC_DP4x8(dst0, in3, in2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l10_4x8); + VXC_DP4x8(dst0, in3, in2, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l11_4x8); + VXC_DP4x8(dst1, in3, in2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l00_4x8); + VXC_DP4x8(dst1, in3, in2, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), 
uniResize_x4_nhwc2_l01_4x8); + VXC_WriteImage(output, coord_out.xy, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_out.xz, dst1, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0)); +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update.vx b/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update.vx index 8532ae0..13cee71 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update.vx @@ -60,6 +60,63 @@ __kernel void scatter_nd_update_F16F16toF16( VXC_WriteImage(output, coord, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); } +__kernel void scatter_nd_update_F16F16toU8( + __read_only image2d_t input0, + __read_only image2d_t input1, + __read_only image2d_t input2, + image2d_array_t output, + int width, + int area, + int vol, + int coord_dim + ) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int cnt = 0; + + vxc_short8 tmpVal = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + vxc_half8 sum; + _viv_asm(COPY, sum, tmpVal, 16); + Image img1 = create_image_from_image2d(input1, 4); + __global int* index_ptr = (__global int*)img1.ptr; + for(int i = 0; i < index_num; i++) + { + //int4 indice = read_imagei(input1, (int2)(0, i)); + int4 indice = vload4(0, index_ptr + offset_idx); + index_ptr += coord_dim; + int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ + indice.w * offsetW; + if(gidy == idx) + { + vxc_half8 src; + VXC_ReadImage(tmpVal, input2, (int2)(gidx, i), 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + cnt++; + _viv_asm(COPY, src, tmpVal, 16); + VXC_DP2x8(sum, sum, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccumulateSum_2x8); + } + } + int2 coord = (int2)(gidx, gidy); + vxc_ushort8 ms0; + vxc_uchar8 dst; + if(cnt == 0) + { + vxc_half8 src; + VXC_ReadImage(tmpVal, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, ms0, multAndoutZP0, 16); + _viv_asm(COPY, src, tmpVal, 16); + VXC_DP2x8(dst, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), + uniU8MulAndPostShift_0_Lo_2x8); + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } + else + { + _viv_asm(COPY, ms0, multAndoutZP1, 16); + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), + uniU8MulAndPostShift_1_Lo_2x8); + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + } +} + #define SCATTER_ND_UPDATE_QINT(src0_type_name, src2_type_name, out_type_name, data_type) \ __kernel void scatter_nd_update_##src0_type_name##src2_type_name##to##out_type_name##( \ __read_only image2d_t input0, \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_big.vx b/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_big.vx index 122fddb..5a0c5ce 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_big.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/scatter_nd_update_big.vx @@ -11,6 +11,11 @@ _viv_uniform int offsetZ; _viv_uniform int offsetW; _viv_uniform int offset_idx; +_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8; +_viv_uniform VXC_512Bits uniU8MulAndPostShift_1_Lo_2x8; +_viv_uniform int2 multAndoutZP0; +_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp + __kernel void scatter_nd_update_F16F16toF16_big( __read_only image2d_t input0, __read_only image2d_t input1, @@ -62,3 +67,67 @@ __kernel void scatter_nd_update_F16F16toF16_big( } output_ptr[loc] = dst; } + +__kernel void 
scatter_nd_update_F16F16toU8_big( + __read_only image2d_t input0, + __read_only image2d_t input1, + __read_only image2d_t input2, + image2d_t output, + int width, + int area, + int vol, + int coord_dim + ) +{ + int gidx = get_global_id(0); + int gidy = get_global_id(1); + int cnt = 0; + + vxc_short8 tmpVal = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0); + vxc_half8 sum; + _viv_asm(COPY, sum, tmpVal, 16); + Image img1 = create_image_from_image2d(input1, 4); + Image img2 = create_image_from_image2d(input2, 2); + Image img3 = create_image_from_image2d(output, 1); + + __global int* index_ptr = (__global int*)img1.ptr; + __global short* update_ptr = (__global short*)img2.ptr; + __global uchar* output_ptr = (__global uchar*)img3.ptr; + for(int i = 0; i < index_num; i++) + { + int4 indice = vload4(0, index_ptr + offset_idx); + index_ptr += coord_dim; + + int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ + indice.w * offsetW; + if(gidy == idx) + { + vxc_half8 src; + short tmpData = update_ptr[i * update_width + gidx]; + cnt++; + _viv_asm(COPY, src, tmpData, 4); + VXC_DP2x8(sum, sum, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccumulateSum_2x8); + } + } + short dst; + vxc_ushort8 ms0; + int loc = gidy * output_width+ gidx; + if(cnt == 0) + { + vxc_half8 src; + Image img0 = create_image_from_image2d(input0, 2); + __global short* ref_ptr = (__global short*)img0.ptr; + short tmpData = ref_ptr[loc]; + _viv_asm(COPY, ms0, multAndoutZP0, 16); + _viv_asm(COPY, src, tmpData, 4); + VXC_DP2x8(dst, src, ms0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), + uniU8MulAndPostShift_0_Lo_2x8); + output_ptr[loc] = dst; + } + else + { + _viv_asm(COPY, ms0, multAndoutZP1, 16); + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), + uniU8MulAndPostShift_1_Lo_2x8); + output_ptr[loc] = dst; + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/tile.vx b/src/tim/vx/internal/src/libnnext/ops/vx/tile.vx index 7fd4c58..70c303b 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/tile.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/tile.vx @@ -46,11 +46,15 @@ __kernel void tile_remain##name2##_##name0##to##name1( \ { \ coord_out.x = coord.x + x * width; \ if (isLastItem) \ + { \ VXC_WriteImage2DArray(output, coord_out, src, \ VXC_MODIFIER(0, remainder, 0, VXC_RM_TowardZero, 0)); \ + } \ else \ + { \ VXC_WriteImage2DArray(output, coord_out, src, \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ } \ } \ } \ @@ -103,9 +107,13 @@ __kernel void tile_remain##name2##_##name0##to##name1##_2D( \ do \ { \ if (isLastItem) \ + { \ VXC_WriteImage(output, coord, src, VXC_MODIFIER(0, remainder, 0, VXC_RM_TowardZero, 0)); \ + } \ else \ + { \ VXC_WriteImage(output, coord, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ coord.x += width; \ } while (coord.x < output_width); \ coord.x = get_global_id(0); \ @@ -165,6 +173,3 @@ __kernel void tile_1toN_##name0##to##name1##_2D( \ } TILE_2D_1TON(U8, U8, vxc_uchar8) TILE_2D_1TON(I16, I16, vxc_short8) - - - diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/tile_mix.vx b/src/tim/vx/internal/src/libnnext/ops/vx/tile_mix.vx index b23c1cd..dba960c 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/tile_mix.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/tile_mix.vx @@ -57,11 +57,15 @@ __kernel void tile_remain##name2##_##name0##to##name1( \ { \ coord_out.x = coord.x + x * width; \ if (isLastItem) \ + { \ VXC_WriteImage2DArray(output, coord_out, dst, \ VXC_MODIFIER(0, remainder, 0, VXC_RM_TowardZero, 0)); \ + } \ else \ + { \ 
VXC_WriteImage2DArray(output, coord_out, dst, \ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ } \ } \ } \ @@ -114,9 +118,13 @@ __kernel void tile_remain##name2##_##name0##to##name1##_2D( \ do \ { \ if (isLastItem) \ + { \ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, remainder, 0, VXC_RM_TowardZero, 0)); \ + } \ else \ + { \ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + } \ coord.x += width; \ } while (coord.x < output_width); \ coord.x = get_global_id(0); \ diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_header.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_header.vx index 99ac9fb..f6ccacc 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_header.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_header.vx @@ -64,13 +64,13 @@ inline Tensor create_tensor_from_image2d_array(image2d_array_t input, int stride { int8 desc; int2 strides; + _viv_asm(COPY, desc, input, sizeof(desc)); #if (USE_40BITS_VA==0) strides.x = desc.s1; strides.y = desc.s4; #else _viv_asm(GET_IMAGE_STRIDE, strides, input); #endif - _viv_asm(COPY, desc, input, sizeof(desc)); uint address = as_uint(desc.s0); Tensor t = diff --git a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c index 06624a5..324dade 100644 --- a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c +++ b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c @@ -4040,6 +4040,7 @@ __kernel void detect_post_box_U8_U8toF32(\n\ static const char eltwise_unary_2d_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform float alpha;\n\ +_viv_uniform float beta;\n\ \n\ float4 eltwise_unary_sin(float4 x)\n\ {\n\ @@ -4077,7 +4078,7 @@ float4 eltwise_unary_neg(float4 x)\n\ \n\ float4 eltwise_unary_hard_sigmoid(float4 x)\n\ {\n\ - x = 0.2 * x + 0.5;\n\ + x = alpha * x + beta;\n\ x = clamp(x, 0, 1);\n\ return x;\n\ }\n\ @@ -4175,7 +4176,8 @@ _viv_uniform VXC_512Bits uniDatatoFp32Part1_4x4;\n\ __read_only image2d_array_t input, \\\n\ __write_only image2d_array_t output, \\\n\ int type, \\\n\ - float _alpha \\\n\ + float _alpha, \\\n\ + float _beta \\\n\ ) \\\n\ { \\\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ @@ -4324,7 +4326,8 @@ _viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ __read_only image2d_array_t input, \\\n\ __write_only image2d_array_t output, \\\n\ int type, \\\n\ - float _alpha \\\n\ + float _alpha, \\\n\ + float _beta \\\n\ ) \\\n\ { \\\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ @@ -4372,6 +4375,7 @@ ELTSISE_UNARY_BF16_2D(hard_gelu)\n\ static const char eltwise_unary_3d_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform float alpha;\n\ +_viv_uniform float beta;\n\ \n\ float4 eltwise_unary_sin(float4 x)\n\ {\n\ @@ -4409,7 +4413,7 @@ float4 eltwise_unary_neg(float4 x)\n\ \n\ float4 eltwise_unary_hard_sigmoid(float4 x)\n\ {\n\ - x = 0.2 * x + 0.5;\n\ + x = alpha * x + beta;\n\ x = clamp(x, 0, 1);\n\ return x;\n\ }\n\ @@ -4507,7 +4511,8 @@ __kernel void func_name##_##src_type_name##to##dst_type_name( \\\n\ __read_only image2d_array_t input, \\\n\ __write_only image2d_array_t output, \\\n\ int type, \\\n\ - float _alpha \\\n\ + float _alpha, \\\n\ + float _beta \\\n\ ) \\\n\ { \\\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ @@ -4655,7 +4660,8 @@ _viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ __read_only image2d_array_t input, \\\n\ __write_only image2d_array_t output, 
\\\n\ int type, \\\n\ - float _alpha \\\n\ + float _alpha, \\\n\ + float _beta \\\n\ ) \\\n\ { \\\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ @@ -6733,6 +6739,182 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16toU8 VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ }"; /* end of group_normalization_f16_vx*/ +static const char group_normalization_f16_scale_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertEndInt16Fp32_4x4;\n\ +\n\ +_viv_uniform float outputScale;\n\ +_viv_uniform int output_ZP;\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16F32toF16(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output,\n\ + float eps, int is2D, float rSpaceOrg, int pStride)\n\ +{\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ + vxc_short8 src0;\n\ + vxc_half8 in_h;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + scale_f = read_imagef(scale, coord_para.xy);\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + vxc_short8 outval;\n\ + half4 tmpVal0, tmpVal1;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ + vxc_half8 dst;\n\ +\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertEndInt16Fp32_4x4);\n\ +\n\ + vxc_float4 norm;\n\ + norm = scale_vari * tmpData0 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = scale_vari * tmpData1 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16F32toF16_2D(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output,\n\ + float eps, int is2D, float rSpaceOrg, int pStride)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int2 coord = (int2)(get_global_id(0), gidz);\n\ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ + vxc_short8 src0;\n\ + vxc_half8 in_h;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + scale_f = read_imagef(scale, coord_para.xy);\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_float4 tmpData0, 
tmpData1;\n\ + vxc_short8 outval;\n\ + half4 tmpVal0, tmpVal1;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ + vxc_half8 dst;\n\ +\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertEndInt16Fp32_4x4);\n\ + vxc_float4 norm;\n\ + norm = scale_vari * tmpData0 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = scale_vari * tmpData1 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16F32toU8(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output,\n\ + float eps, int is2D, float rSpaceOrg, int pStride)\n\ +{\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ + vxc_short8 src0;\n\ + vxc_half8 in_h;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + scale_f = read_imagef(scale, coord_para.xy);\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + vxc_uchar16 outval;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ + float alpha = outputScale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP;\n\ +\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertEndInt16Fp32_4x4);\n\ +\n\ + vxc_float4 norm;\n\ + norm = alpha * tmpData0 + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = alpha * tmpData1 + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_F16F32toU8_2D(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari, image2d_array_t output,\n\ + float eps, int is2D, float rSpaceOrg, int pStride)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int2 coord = (int2)(get_global_id(0), gidz);\n\ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ + vxc_short8 src0;\n\ + vxc_half8 in_h;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + scale_f = read_imagef(scale, coord_para.xy);\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + scale_vari = 
scale_f.s0 * mean_vari.s1;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + vxc_uchar16 outval;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ + float alpha = outputScale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP;\n\ +\n\ + _viv_asm(COPY, in_h, src0, 16);\n\ + VXC_DP4x4(tmpData0, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + UniFP16toFP32Lo4_dp4x4);\n\ + VXC_DP4x4(tmpData1, in_h, in_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertEndInt16Fp32_4x4);\n\ + vxc_float4 norm;\n\ + norm = alpha * tmpData0 + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = alpha * tmpData1 + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(outval, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of group_normalization_f16_scale_vx*/ + static const char group_normalization_i16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform int width;\n\ @@ -7073,6 +7255,198 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16toI1 VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ }"; /* end of group_normalization_i16_vx*/ +static const char group_normalization_i16_scale_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ +\n\ +_viv_uniform float input_fl_scale;\n\ +_viv_uniform float inOut_fl_scale;\n\ +_viv_uniform float output_fl_scale;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertInt16Fp32Fst_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt16Fp32Secd_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toInt16_2x8;\n\ +\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16F32toF16(\n\ + image2d_array_t input,\n\ + image2d_t bias,\n\ + image2d_t scale,\n\ + image2d_t meanVari,\n\ + image2d_array_t output,\n\ + float eps,\n\ + int is2D,\n\ + float rSpaceOrg, int pStride)\n\ +{\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ + vxc_short8 src0;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + scale_f = read_imagef(scale, coord_para.xy);\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + vxc_short8 outval;\n\ + half4 tmpVal0, tmpVal1;\n\ + float alpha = input_fl_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ + vxc_half8 dst;\n\ +\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt16Fp32Fst_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt16Fp32Secd_4x4);\n\ +\n\ + vxc_float4 norm;\n\ + norm = alpha * tmpData0 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData1 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage2DArray(output, 
coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16F32toF16_2D(\n\ + image2d_array_t input,\n\ + image2d_t bias,\n\ + image2d_t scale,\n\ + image2d_t meanVari,\n\ + image2d_array_t output,\n\ + float eps,\n\ + int is2D,\n\ + float rSpaceOrg, int pStride)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int2 coord = (int2)(get_global_id(0), gidz);\n\ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ + vxc_short8 src0;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + scale_f = read_imagef(scale, coord_para.xy);\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + vxc_short8 outval;\n\ + half4 tmpVal0, tmpVal1;\n\ + float alpha = input_fl_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ + vxc_half8 dst;\n\ +\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt16Fp32Fst_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt16Fp32Secd_4x4);\n\ + vxc_float4 norm;\n\ + norm = alpha * tmpData0 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData1 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16F32toI16(\n\ + image2d_array_t input,\n\ + image2d_t bias,\n\ + image2d_t scale,\n\ + image2d_t meanVari,\n\ + image2d_array_t output,\n\ + float eps,\n\ + int is2D,\n\ + float rSpaceOrg, int pStride)\n\ +{\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ + vxc_short8 src0, src2;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + scale_f = read_imagef(scale, coord_para.xy);\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + float alpha = inOut_fl_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale;\n\ +\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt16Fp32Fst_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt16Fp32Secd_4x4);\n\ + vxc_float4 norm;\n\ + norm = tmpData0 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData1 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toInt16_2x8);\n\ + 
VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I16F32toI16_2D(\n\ + image2d_array_t input,\n\ + image2d_t bias,\n\ + image2d_t scale,\n\ + image2d_t meanVari,\n\ + image2d_array_t output,\n\ + float eps,\n\ + int is2D,\n\ + float rSpaceOrg, int pStride)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int2 coord = (int2)(get_global_id(0), gidz);\n\ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ + vxc_short8 src0, src2;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + scale_f = read_imagef(scale, coord_para.xy);\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ + vxc_float4 tmpData0, tmpData1;\n\ + float alpha = inOut_fl_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale;\n\ +\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt16Fp32Fst_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniConvertInt16Fp32Secd_4x4);\n\ + vxc_float4 norm;\n\ + norm = tmpData0 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData1 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniConvertInt32toInt16_2x8);\n\ + VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}"; /* end of group_normalization_i16_scale_vx*/ + static const char group_normalization_i8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform int width;\n\ @@ -7392,6 +7766,194 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8toI8_ }\n\ "; /* end of group_normalization_i8_vx*/ +static const char group_normalization_i8_scale_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertHalfToFp16_2x8;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform float input_fl_scale;\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertDirInt8Fp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertEndInt8Fp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertTrdInt8Fp32_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertFthInt8Fp32_4x4;\n\ +\n\ +_viv_uniform float inOut_fl_scale;\n\ +_viv_uniform float output_fl_scale;\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8F32toF16(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ +{\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ + vxc_char16 src0;\n\ + vxc_short8 outval;\n\ + vxc_half8 dst;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + scale_f = read_imagef(scale, coord_para.xy);\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 
0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ + half4 tmpVal0, tmpVal1;\n\ + float alpha = input_fl_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ +\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4);\n\ +\n\ + norm = alpha * tmpData0 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData1 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord.x += 8;\n\ + norm = alpha * tmpData2 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData3 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8F32toF16_2D(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int2 coord = (int2)(get_global_id(0), gidz);\n\ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ + vxc_char16 src0;\n\ + vxc_short8 outval;\n\ + vxc_half8 dst;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + scale_f = read_imagef(scale, coord_para.xy);\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ + VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ + half4 tmpVal0, tmpVal1;\n\ + float alpha = input_fl_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ +\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4);\n\ + norm = alpha * tmpData0 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData1 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.x += 8;\n\ + norm = alpha * tmpData2 
+ bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData3 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8F32toI8(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ +{\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ + vxc_char16 src0, src2;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + scale_f = read_imagef(scale, coord_para.xy);\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ + float alpha = inOut_fl_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale;\n\ +\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4);\n\ + norm = tmpData0 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData1 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + norm = tmpData2 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData3 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_I8F32toI8_2D(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int2 coord = (int2)(get_global_id(0), gidz);\n\ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ + vxc_char16 src0, src2;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + scale_f = read_imagef(scale, coord_para.xy);\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, 
norm;\n\ + float alpha = inOut_fl_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * output_fl_scale;\n\ +\n\ + VXC_DP4x4(tmpData0, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertDirInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertEndInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertTrdInt8Fp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, src0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertFthInt8Fp32_4x4);\n\ + norm = tmpData0 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData1 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + norm = tmpData2 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData3 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of group_normalization_i8_scale_vx*/ + static const char group_normalization_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform int width;\n\ @@ -7652,6 +8214,87 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8toU8_ tmpVal1 = convert_int4_rte(norm);\n\ VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8F32toU8(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ +{\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ + vxc_uchar16 src0, src2;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + scale_f = read_imagef(scale, coord_para.xy);\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + short zp = inputZP;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ + float alpha = scale_inOut * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP;\n\ +\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4);\n\ + norm = tmpData0 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData1 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, 
tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + norm = tmpData2 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData3 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord, src2, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8F32toU8_2D(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int2 coord = (int2)(get_global_id(0), gidz);\n\ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ + vxc_uchar16 src0, src2;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + scale_f = read_imagef(scale, coord_para.xy);\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + short zp = inputZP;\n\ + vxc_int4 tmpVal0, tmpVal1;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ + float alpha = scale_inOut * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0) * outputScale + output_ZP;\n\ +\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4);\n\ + norm = tmpData0 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData1 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + norm = tmpData2 * alpha + bias_val;\n\ + tmpVal0 = convert_int4_rte(norm);\n\ + norm = tmpData3 * alpha + bias_val;\n\ + tmpVal1 = convert_int4_rte(norm);\n\ + VXC_DP2x8(src2, tmpVal0, tmpVal1, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage(output, coord.xy, src2, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ }"; /* end of group_normalization_u8_vx*/ static const char group_normalization_u8_f16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -7768,6 +8411,99 @@ __kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8toF16 _viv_asm(COPY, outval, dst, 16);\n\ VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ }\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8F32toF16(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ +{\n\ + int gidy = get_global_id(1);\n\ + int gidz = get_global_id(2);\n\ + int4 coord = (int4)(get_global_id(0), gidy, gidz, 0);\n\ + int4 coord_para = (int4)((convert_int(get_global_id(0) * rSpaceOrg) + gidy * pStride), gidz, 0, 0);\n\ + vxc_uchar16 
src0;\n\ + vxc_short8 outval;\n\ + vxc_half8 dst;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + scale_f = read_imagef(scale, coord_para.xy);\n\ + VXC_ReadImage2DArray(src0, input, coord, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + short zp = inputZP;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ + half4 tmpVal0, tmpVal1;\n\ + float alpha = input_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ +\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4);\n\ + norm = alpha * tmpData0 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData1 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ + coord.x += 8;\n\ + norm = alpha * tmpData2 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData3 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage2DArray(output, coord, outval, VXC_MODIFIER(0, 7, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(16, 1, 1))) void group_norm_U8F32toF16_2D(\n\ + image2d_array_t input, image2d_t bias, image2d_t scale, image2d_t meanVari,\n\ + image2d_array_t output, float eps, int is2D, float rSpaceOrg, int pStride)\n\ +{\n\ + int gidz = get_global_id(1);\n\ + int2 coord = (int2)(get_global_id(0), gidz);\n\ + int4 coord_para = (int4)(convert_int(get_global_id(0) * rSpaceOrg), gidz, 0, 0);\n\ + vxc_uchar16 src0;\n\ + vxc_short8 outval;\n\ + vxc_half8 dst;\n\ + float scale_vari, bias_val;\n\ + vxc_float4 bias_f, scale_f;\n\ +\n\ + vxc_float4 mean_vari = read_imagef(meanVari, coord_para.zy);\n\ + bias_f = read_imagef(bias, coord_para.xy);\n\ + scale_f = read_imagef(scale, coord_para.xy);\n\ + VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + scale_vari = scale_f.s0 * mean_vari.s1;\n\ + short zp = inputZP;\n\ + vxc_float4 tmpData0, tmpData1, tmpData2, tmpData3, norm;\n\ + half4 tmpVal0, tmpVal1;\n\ + float alpha = input_scale * scale_vari;\n\ + bias_val = (bias_f.s0 - scale_vari * mean_vari.s0);\n\ +\n\ + VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert1stUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert2ndUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData2, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert3rdUint8SubZpToFp32_4x4);\n\ + VXC_DP4x4(tmpData3, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvert4thUint8SubZpToFp32_4x4);\n\ + 
norm = alpha * tmpData0 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData1 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.x += 8;\n\ + norm = alpha * tmpData2 + bias_val;\n\ + _viv_asm(CONV, tmpVal0, norm);\n\ + norm = alpha * tmpData3 + bias_val;\n\ + _viv_asm(CONV, tmpVal1, norm);\n\ + VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvertHalfToFp16_2x8);\n\ + _viv_asm(COPY, outval, dst, 16);\n\ + VXC_WriteImage(output, coord.xy, outval, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ "; /* end of group_normalization_u8_f16_vx*/ static const char grucell_activation_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -7943,6 +8679,134 @@ __kernel void grucell_activation_sma_F16_F16_F16toF16_2D\n\ \n\ "; /* end of grucell_activation_sma_vx*/ +static const char grucell_activation_z_h_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +#define logE (1.44269502f)\n\ +#define twoLogE (logE * 2.0f)\n\ +\n\ +float4 sigmoid_func(float4 x)\n\ +{\n\ + x *= -logE;\n\ + x = 1.0f + exp2(x);\n\ + return 1.0f / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tanh_func(float4 x)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1.0f / x;\n\ + return 2 * x - 1;\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniF16PlusF16_0_4x4;\n\ +_viv_uniform VXC_512Bits uniF16PlusF16_1_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertF16_0_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertF16_1_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +\n\ +#define GRUCELL_F16_F16TOF16(act_name, act_func) \\\n\ +__kernel void grucell_activation_z_h_F16_F16toF16_##act_name( \\\n\ + __read_only image2d_t hstate_in, \\\n\ + __read_only image2d_t input_z_conv, \\\n\ + __read_only image2d_t input_h_conv, \\\n\ + __read_only image2d_t hstate_z_conv, \\\n\ + __read_only image2d_t hstate_h_conv, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t hstate_out \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + vxc_short8 v0, v1, v2, v3, v4, v5, v6; \\\n\ + vxc_half8 src0, src1, src2, src3, src4, src5, src6; \\\n\ + VXC_ReadImage(v2, hstate_h_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src2, v2, 16); \\\n\ + VXC_ReadImage(v4, input_h_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src4, v4, 16); \\\n\ + VXC_ReadImage(v5, input_z_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src5, v5, 16); \\\n\ + VXC_ReadImage(v6, hstate_z_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src6, v6, 16); \\\n\ + VXC_ReadImage(v3, hstate_in, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src3, v3, 16); \\\n\ + \\\n\ + float4 h; \\\n\ + VXC_DP4x4(h, src2, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \\\n\ + float4 z; \\\n\ + VXC_DP4x4(z, src5, src6, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \\\n\ + z = act_func(z); \\\n\ + h = tanh_func(h); \\\n\ + float4 h_tm; \\\n\ + VXC_DP4x4(h_tm, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \\\n\ + 
float4 result = (1 - z) * h + z * h_tm; \\\n\ + half4 dst0; \\\n\ + _viv_asm(CONV_RTE, dst0, result); \\\n\ + vxc_half4 dst1; \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + vxc_short4 dst; \\\n\ + _viv_asm(COPY, dst, dst1, 8); \\\n\ + VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(hstate_out, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GRUCELL_F16_F16TOF16(SIGMOID, sigmoid_func)\n\ +\n\ +_viv_uniform float hstate_in_scale;\n\ +_viv_uniform float hstate_in_tail;\n\ +_viv_uniform float output_scale;\n\ +_viv_uniform float output_zp;\n\ +#define GRUCELL_QNT_F16TO_QNT(name0, name1, act_name, act_func, src0_type, dst_type) \\\n\ +__kernel void grucell_activation_z_h_##name0##_F16to##name1##_##act_name( \\\n\ + __read_only image2d_t hstate_in, \\\n\ + __read_only image2d_t input_z_conv, \\\n\ + __read_only image2d_t input_h_conv, \\\n\ + __read_only image2d_t hstate_z_conv, \\\n\ + __read_only image2d_t hstate_h_conv, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t hstate_out \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + src0_type src3; \\\n\ + vxc_short8 v0, v1, v2, v3, v4, v5, v6; \\\n\ + vxc_half8 src0, src1, src2, src4, src5, src6; \\\n\ + VXC_ReadImage(v2, hstate_h_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src2, v2, 16); \\\n\ + VXC_ReadImage(v4, input_h_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src4, v4, 16); \\\n\ + VXC_ReadImage(v5, input_z_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src5, v5, 16); \\\n\ + VXC_ReadImage(v6, hstate_z_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src6, v6, 16); \\\n\ + VXC_ReadImage(src3, hstate_in, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + float4 h; \\\n\ + VXC_DP4x4(h, src2, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \\\n\ + float4 z; \\\n\ + VXC_DP4x4(z, src5, src6, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \\\n\ + z = act_func(z); \\\n\ + h = tanh_func(h); \\\n\ + float4 h_tm; \\\n\ + VXC_DP4x4(h_tm, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \\\n\ + h_tm = h_tm * hstate_in_scale + hstate_in_tail; \\\n\ + float4 result = (1 - z) * h + z * h_tm; \\\n\ + result = result * output_scale + output_zp; \\\n\ + int4 dst0; \\\n\ + _viv_asm(CONV_RTE, dst0, result); \\\n\ + dst_type dst; \\\n\ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(hstate_out, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GRUCELL_QNT_F16TO_QNT(U8, U8, SIGMOID, sigmoid_func, vxc_uchar8, vxc_uchar8)\n\ +GRUCELL_QNT_F16TO_QNT(I8, I8, SIGMOID, sigmoid_func, vxc_char8, vxc_char8)\n\ +GRUCELL_QNT_F16TO_QNT(I16, I16, SIGMOID, sigmoid_func, vxc_short8, vxc_short8)\n\ +"; /* end of grucell_activation_z_h_vx*/ + static const char grucell_cdnn_activation_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ #define logE (1.44269502f)\n\ @@ -8733,6 +9597,254 @@ __kernel void grucell_activation_cdnn_U8_U8_U8_to_U8\n\ \n\ "; /* end of grucell_cdnn_activation_u8_vx*/ +static const char 
grucell_h_times_activation_r_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +#define logE (1.44269502f)\n\ +#define twoLogE (logE * 2.0f)\n\ +\n\ +float4 sigmoid_func(float4 x)\n\ +{\n\ + x *= -logE;\n\ + x = 1.0f + exp2(x);\n\ + return 1.0f / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniF16PlusF16_0_4x4;\n\ +_viv_uniform VXC_512Bits uniF16PlusF16_1_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertF16_0_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertF16_1_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +\n\ +#define GRUCELL_F16_F16TOF16(act_name, act_func) \\\n\ +__kernel void grucell_h_times_activation_r_F16_F16toF16_##act_name( \\\n\ + __read_only image2d_t hstate_in, \\\n\ + __read_only image2d_t input_r_conv, \\\n\ + __read_only image2d_t hstate_r_conv, \\\n\ + __write_only image2d_t output \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + vxc_short8 v0, v1, v2, v3, v4, v5, v6; \\\n\ + vxc_half8 src0, src1, src2, src3, src4, src5, src6; \\\n\ + VXC_ReadImage(v0, input_r_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src0, v0, 16); \\\n\ + VXC_ReadImage(v1, hstate_r_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src1, v1, 16); \\\n\ + VXC_ReadImage(v3, hstate_in, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src3, v3, 16); \\\n\ + \\\n\ + float4 r; \\\n\ + VXC_DP4x4(r, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \\\n\ + r = act_func(r); \\\n\ + float4 h_tm; \\\n\ + VXC_DP4x4(h_tm, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \\\n\ + float4 result = r * h_tm; \\\n\ + half4 dst0; \\\n\ + _viv_asm(CONV_RTE, dst0, result); \\\n\ + vxc_half4 dst1; \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + vxc_short4 dst; \\\n\ + _viv_asm(COPY, dst, dst1, 8); \\\n\ + VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GRUCELL_F16_F16TOF16(SIGMOID, sigmoid_func)\n\ +\n\ +_viv_uniform float hstate_in_scale;\n\ +_viv_uniform float hstate_in_tail;\n\ +#define GRUCELL_QNT_F16TO_F16(name0, act_name, act_func, src0_type) \\\n\ +__kernel void grucell_h_times_activation_r_##name0##_F16toF16_##act_name( \\\n\ + __read_only image2d_t hstate_in, \\\n\ + __read_only image2d_t input_r_conv, \\\n\ + __read_only image2d_t hstate_r_conv, \\\n\ + __write_only image2d_t output \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + src0_type src3; \\\n\ + vxc_short8 v0, v1, v2, v3, v4, v5, v6; \\\n\ + vxc_half8 src0, src1, src2, src4, src5, src6; \\\n\ + VXC_ReadImage(v0, input_r_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src0, v0, 16); \\\n\ + VXC_ReadImage(v1, hstate_r_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src1, v1, 16); \\\n\ + VXC_ReadImage(src3, hstate_in, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + float4 r; \\\n\ + VXC_DP4x4(r, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \\\n\ + r = act_func(r); \\\n\ + float4 h_tm; \\\n\ + VXC_DP4x4(h_tm, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \\\n\ + h_tm = h_tm * hstate_in_scale + hstate_in_tail; 
\\\n\ + float4 result = r * h_tm; \\\n\ + half4 dst0; \\\n\ + _viv_asm(CONV_RTE, dst0, result); \\\n\ + vxc_half8 dst1; \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + vxc_short4 dst; \\\n\ + _viv_asm(COPY, dst, dst1, 8); \\\n\ + VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GRUCELL_QNT_F16TO_F16(U8, SIGMOID, sigmoid_func, vxc_uchar8)\n\ +GRUCELL_QNT_F16TO_F16(I8, SIGMOID, sigmoid_func, vxc_char8)\n\ +GRUCELL_QNT_F16TO_F16(I16, SIGMOID, sigmoid_func, vxc_short8)\n\ +"; /* end of grucell_h_times_activation_r_vx*/ + +static const char grucell_reset_after_activation_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +#define logE (1.44269502f)\n\ +#define twoLogE (logE * 2.0f)\n\ +\n\ +float4 sigmoid_func(float4 x)\n\ +{\n\ + x *= -logE;\n\ + x = 1.0f + exp2(x);\n\ + return 1.0f / x;\n\ +}\n\ +float4 hard_sigmoid(float4 x)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float4 tanh_func(float4 x)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1.0f / x;\n\ + return 2 * x - 1;\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniF16PlusF16_0_4x4;\n\ +_viv_uniform VXC_512Bits uniF16PlusF16_1_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertF16_0_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertF16_1_4x4;\n\ +_viv_uniform VXC_512Bits uniExtract8Data_2x8;\n\ +\n\ +#define GRUCELL_F16_F16TOF16(act_name, act_func) \\\n\ +__kernel void grucell_reset_after_activation_F16_F16toF16_##act_name( \\\n\ + __read_only image2d_t hstate_in, \\\n\ + __read_only image2d_t input_z_conv, \\\n\ + __read_only image2d_t input_r_conv, \\\n\ + __read_only image2d_t input_h_conv, \\\n\ + __read_only image2d_t hstate_z_conv, \\\n\ + __read_only image2d_t hstate_r_conv, \\\n\ + __read_only image2d_t hstate_h_conv, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t hstate_out \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + vxc_short8 v0, v1, v2, v3, v4, v5, v6; \\\n\ + vxc_half8 src0, src1, src2, src3, src4, src5, src6; \\\n\ + VXC_ReadImage(v0, input_r_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src0, v0, 16); \\\n\ + VXC_ReadImage(v1, hstate_r_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src1, v1, 16); \\\n\ + VXC_ReadImage(v2, hstate_h_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src2, v2, 16); \\\n\ + VXC_ReadImage(v4, input_h_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src4, v4, 16); \\\n\ + VXC_ReadImage(v5, input_z_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src5, v5, 16); \\\n\ + VXC_ReadImage(v6, hstate_z_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src6, v6, 16); \\\n\ + VXC_ReadImage(v3, hstate_in, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src3, v3, 16); \\\n\ + \\\n\ + float4 r; \\\n\ + VXC_DP4x4(r, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \\\n\ + r = act_func(r); \\\n\ + float4 h0, h1; \\\n\ + VXC_DP4x4(h1, src2, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \\\n\ + VXC_DP4x4(h0, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \\\n\ + float4 h = h0 + r * h1; \\\n\ + float4 z; \\\n\ + VXC_DP4x4(z, src5, src6, VXC_MODIFIER(0, 3, 
0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \\\n\ + z = act_func(z); \\\n\ + h = tanh_func(h); \\\n\ + float4 h_tm; \\\n\ + VXC_DP4x4(h_tm, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \\\n\ + float4 result = (1 - z) * h + z * h_tm; \\\n\ + half4 dst0; \\\n\ + _viv_asm(CONV_RTE, dst0, result); \\\n\ + vxc_half4 dst1; \\\n\ + VXC_DP2x8(dst1, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + vxc_short4 dst; \\\n\ + _viv_asm(COPY, dst, dst1, 8); \\\n\ + VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(hstate_out, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GRUCELL_F16_F16TOF16(SIGMOID, sigmoid_func)\n\ +\n\ +_viv_uniform float hstate_in_scale;\n\ +_viv_uniform float hstate_in_tail;\n\ +_viv_uniform float output_scale;\n\ +_viv_uniform float output_zp;\n\ +#define GRUCELL_QNT_F16TO_QNT(name0, name1, act_name, act_func, src0_type, dst_type) \\\n\ +__kernel void grucell_reset_after_activation_##name0##_F16to##name1##_##act_name( \\\n\ + __read_only image2d_t hstate_in, \\\n\ + __read_only image2d_t input_z_conv, \\\n\ + __read_only image2d_t input_r_conv, \\\n\ + __read_only image2d_t input_h_conv, \\\n\ + __read_only image2d_t hstate_z_conv, \\\n\ + __read_only image2d_t hstate_r_conv, \\\n\ + __read_only image2d_t hstate_h_conv, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t hstate_out \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + src0_type src3; \\\n\ + vxc_short8 v0, v1, v2, v3, v4, v5, v6; \\\n\ + vxc_half8 src0, src1, src2, src4, src5, src6; \\\n\ + VXC_ReadImage(v0, input_r_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src0, v0, 16); \\\n\ + VXC_ReadImage(v1, hstate_r_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src1, v1, 16); \\\n\ + VXC_ReadImage(v2, hstate_h_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src2, v2, 16); \\\n\ + VXC_ReadImage(v4, input_h_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src4, v4, 16); \\\n\ + VXC_ReadImage(v5, input_z_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src5, v5, 16); \\\n\ + VXC_ReadImage(v6, hstate_z_conv, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, src6, v6, 16); \\\n\ + VXC_ReadImage(src3, hstate_in, coord_in, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + \\\n\ + float4 r; \\\n\ + VXC_DP4x4(r, src0, src1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \\\n\ + r = act_func(r); \\\n\ + float4 h0, h1; \\\n\ + VXC_DP4x4(h1, src2, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \\\n\ + VXC_DP4x4(h0, src4, src4, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \\\n\ + float4 h = h0 + r * h1; \\\n\ + float4 z; \\\n\ + VXC_DP4x4(z, src5, src6, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniF16PlusF16_0_4x4); \\\n\ + z = act_func(z); \\\n\ + h = tanh_func(h); \\\n\ + float4 h_tm; \\\n\ + VXC_DP4x4(h_tm, src3, src3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniConvertF16_0_4x4); \\\n\ + h_tm = h_tm * hstate_in_scale + hstate_in_tail; \\\n\ + float4 result = (1 - z) * h + z * h_tm; \\\n\ + result = result * output_scale + output_zp; \\\n\ + int4 dst0; \\\n\ + _viv_asm(CONV_RTE, dst0, result); \\\n\ + 
dst_type dst; \\\n\ + VXC_DP2x8(dst, dst0, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 1), uniExtract8Data_2x8); \\\n\ + VXC_WriteImage(output, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_WriteImage(hstate_out, coord_in, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0)); \\\n\ +}\n\ +GRUCELL_QNT_F16TO_QNT(U8, U8, SIGMOID, sigmoid_func, vxc_uchar8, vxc_uchar8)\n\ +GRUCELL_QNT_F16TO_QNT(I8, I8, SIGMOID, sigmoid_func, vxc_char8, vxc_char8)\n\ +GRUCELL_QNT_F16TO_QNT(I16, I16, SIGMOID, sigmoid_func, vxc_short8, vxc_short8)\n\ +"; /* end of grucell_reset_after_activation_vx*/ + static const char hswish_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform float inputScale;\n\ @@ -11305,9 +12417,9 @@ L2NORMSCALE_AXIS0_2D(I8, F16, I8, char, vxc_char8, vxc_char8, r_inputScale, __kernel __attribute__((reqd_work_group_size(16, 1, 1))) \\\n\ void l2normalizescale_axis0_U8_##in1_name##to##out_name##_2D \\\n\ (\\\n\ - __read_only image2d_array_t input,\\\n\ - __read_only image2d_array_t scale,\\\n\ - __write_only image2d_array_t output,\\\n\ + __read_only image2d_t input,\\\n\ + __read_only image2d_t scale,\\\n\ + __write_only image2d_t output,\\\n\ int axis\\\n\ )\\\n\ { \\\n\ @@ -28714,6 +29826,77 @@ __kernel void pre_process_gray_scale_U8to##dst_type_name \\\n\ PRE_PROCESS_GRAY_SCALE_8BITS(U8, vxc_uchar16)\n\ PRE_PROCESS_GRAY_SCALE_8BITS(I8, vxc_char16)"; /* end of pre_process_gray_vx*/ +static const char pre_process_gray_2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniBilinear_4over3_l00_2x8;\n\ +_viv_uniform VXC_512Bits uniBilinear_4over3_l10_2x8;\n\ +_viv_uniform VXC_512Bits uniBilinear_4over3_l01_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinear_4over3_l11_4x4;\n\ +_viv_uniform VXC_512Bits uniBilinear_4over3_l21_4x4;\n\ +\n\ +__kernel void pre_process_gray_4over3_U8toU8\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + global int *xRatio,\n\ + global int *yRatio,\n\ + global int *xOffset,\n\ + global int *yOffset,\n\ + float mean,\n\ + float f32Var\n\ + )\n\ +{\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\ +\n\ + vxc_uchar16 src0, src1, src2, src3;\n\ +\n\ + VXC_ReadImage(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src1, input, coord_in, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src2, input, coord_in, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src3, input, coord_in, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord_in.xy = (coord_in.xy >> 2) * 3;\n\ + coord_in.zw = coord_in.yy + (int2)(1, 2);\n\ +\n\ + vxc_uchar16 dst0, dst1, dst2;\n\ + VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l00_2x8);\n\ + VXC_DP2x8(dst0, src0, src0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l10_2x8);\n\ + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\ + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\ + VXC_DP4x4(dst1, src1, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\ + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l01_4x4);\n\ + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(4, 7, 0, 
VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l11_4x4);\n\ + VXC_DP4x4(dst2, src3, src2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniBilinear_4over3_l21_4x4);\n\ +\n\ + VXC_WriteImage(output, coord_in.xy, dst0, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_in.xz, dst1, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_in.xw, dst2, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void pre_process_gray_half_U8toU8\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + global int *xRatio,\n\ + global int *yRatio,\n\ + global int *xOffset,\n\ + global int *yOffset,\n\ + float mean,\n\ + float f32Var\n\ + )\n\ +{\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\ +\n\ + vxc_uchar16 src0;\n\ +\n\ + VXC_ReadImage(src0, input, coord_in, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord_in.xy = coord_in.xy >> 1;\n\ +\n\ + VXC_WriteImage(output, coord_in.xy, src0.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of pre_process_gray_2_vx*/ + static const char pre_process_gray_copy_vx[] = "/*\n\ ============================================================================\n\ Name : GrayScale.vx\n\ @@ -37915,6 +39098,211 @@ __kernel void resize_bilinear_U8toU8_UP_opt\n\ \n\ #endif"; /* end of resize_bilinear_U8_opt_vx*/ +static const char resize_bilinear_nhwc_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniResize_x2_nhwc2_0_4x8;\n\ +_viv_uniform VXC_512Bits uniResize_x2_nhwc2_1_4x8;\n\ +_viv_uniform int out_height;\n\ +\n\ +__kernel void resize_bilinear_nhwc_U8toU8_2x_upsample_half_pixel_centers\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), 0);\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), 0);\n\ + coord_in.x = ((coord_out.x * 2 - 1) >> 2) - 1;\n\ + coord_in.y = ((coord_out.y * 2 - 1) >> 2);\n\ + coord_in.x = coord_out.x == 0 ? -2 : coord_in.x;\n\ + coord_in.y = coord_out.y == 0 ? 
-1 : coord_in.y;\n\ +\n\ + vxc_uchar16 in0, in1, in2, in3, result;\n\ +\n\ + VXC_ReadImage(in0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(in1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(in2, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(in3, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x8(result, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x2_nhwc2_0_4x8);\n\ + VXC_DP4x8(result, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x2_nhwc2_1_4x8);\n\ + VXC_WriteImage(output, coord_out, result, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_DP4x8(result, in2, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x2_nhwc2_0_4x8);\n\ + VXC_DP4x8(result, in2, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x2_nhwc2_1_4x8);\n\ + VXC_WriteImage(output, coord_out.xy, result, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_DP4x8(result, in1, in2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x2_nhwc2_0_4x8);\n\ + VXC_DP4x8(result, in1, in2, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x2_nhwc2_1_4x8);\n\ + VXC_WriteImage(output, coord_out.xy, result, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.y++;\n\ + VXC_DP4x8(result, in3, in2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x2_nhwc2_0_4x8);\n\ + VXC_DP4x8(result, in3, in2, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x2_nhwc2_1_4x8);\n\ + VXC_WriteImage(output, coord_out.xy, result, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniResize_x3_nhwc2_l00_2x8;\n\ +_viv_uniform VXC_512Bits uniResize_x3_nhwc2_l01_2x8;\n\ +_viv_uniform VXC_512Bits uniResize_x3_nhwc2_l02_2x8;\n\ +_viv_uniform VXC_512Bits uniResize_x3_nhwc2_l03_2x8;\n\ +_viv_uniform VXC_512Bits uniResize_x3_nhwc2_l10_4x4;\n\ +_viv_uniform VXC_512Bits uniResize_x3_nhwc2_l11_4x4;\n\ +_viv_uniform VXC_512Bits uniResize_x3_nhwc2_l12_4x4;\n\ +_viv_uniform VXC_512Bits uniResize_x3_nhwc2_l13_4x4;\n\ +_viv_uniform VXC_512Bits uniResize_x3_nhwc2_l14_4x4;\n\ +_viv_uniform VXC_512Bits uniResize_x3_nhwc2_l15_4x4;\n\ +_viv_uniform VXC_512Bits uniResize_x3_nhwc2_l16_4x4;\n\ +_viv_uniform VXC_512Bits uniResize_x3_nhwc2_l17_4x4;\n\ +__kernel void resize_bilinear_nhwc_U8toU8_3x_upsample_half_pixel_centers\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ + coord_in.x = (short)(coord_out.x - 1) / (short)6 * 2;\n\ + coord_in.x = coord_out.x == 0 ? -2 : coord_in.x;\n\ + coord_in.y = (short)(coord_out.y * 2 - 1) / (short)6;\n\ + coord_in.y = coord_out.y == 0 ? 
-1 : coord_in.y;\n\ +\n\ + vxc_uchar16 in0, in1, in2, in3, dst0, dst1, dst2;\n\ +\n\ + VXC_ReadImage(in0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(in1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(in2, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(in3, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord_out.zw = coord_out.xy + (int2)(16, 1);\n\ +\n\ + VXC_DP4x4(dst0, in1, in0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l10_4x4);\n\ + VXC_DP4x4(dst0, in1, in0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l11_4x4);\n\ + VXC_DP4x4(dst0, in1, in0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l12_4x4);\n\ + VXC_DP4x4(dst0, in1, in0, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l13_4x4);\n\ + VXC_DP4x4(dst1, in1, in0, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l14_4x4);\n\ + VXC_DP4x4(dst1, in1, in0, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l15_4x4);\n\ + VXC_DP4x4(dst1, in1, in0, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l16_4x4);\n\ + VXC_DP4x4(dst1, in1, in0, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l17_4x4);\n\ +\n\ + VXC_WriteImage(output, coord_out.xy, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_out.zy, dst1, VXC_MODIFIER(0, 13, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP2x8(dst0, in1, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l00_2x8);\n\ + VXC_DP2x8(dst0, in1, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l01_2x8);\n\ + VXC_DP2x8(dst1, in1, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l02_2x8);\n\ + VXC_DP2x8(dst1, in1, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l03_2x8);\n\ + VXC_WriteImage(output, coord_out.xw, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_out.zw, dst1, VXC_MODIFIER(0, 13, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.yw += 2;\n\ +\n\ + VXC_DP4x4(dst0, in1, in2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l10_4x4);\n\ + VXC_DP4x4(dst0, in1, in2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l11_4x4);\n\ + VXC_DP4x4(dst0, in1, in2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l12_4x4);\n\ + VXC_DP4x4(dst0, in1, in2, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l13_4x4);\n\ + VXC_DP4x4(dst1, in1, in2, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l14_4x4);\n\ + VXC_DP4x4(dst1, in1, in2, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l15_4x4);\n\ + VXC_DP4x4(dst1, in1, in2, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l16_4x4);\n\ + VXC_DP4x4(dst1, in1, in2, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l17_4x4);\n\ + VXC_WriteImage(output, coord_out.xy, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_out.zy, dst1, VXC_MODIFIER(0, 13, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP4x4(dst0, in2, in1, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l10_4x4);\n\ + VXC_DP4x4(dst0, in2, in1, VXC_MODIFIER(4, 7, 0, 
VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l11_4x4);\n\ + VXC_DP4x4(dst0, in2, in1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l12_4x4);\n\ + VXC_DP4x4(dst0, in2, in1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l13_4x4);\n\ + VXC_DP4x4(dst1, in2, in1, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l14_4x4);\n\ + VXC_DP4x4(dst1, in2, in1, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l15_4x4);\n\ + VXC_DP4x4(dst1, in2, in1, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l16_4x4);\n\ + VXC_DP4x4(dst1, in2, in1, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l17_4x4);\n\ + VXC_WriteImage(output, coord_out.xw, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_out.zw, dst1, VXC_MODIFIER(0, 13, 0,VXC_RM_TowardZero, 0));\n\ + coord_out.yw += 2;\n\ +\n\ + VXC_DP2x8(dst0, in2, in2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l00_2x8);\n\ + VXC_DP2x8(dst0, in2, in2, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l01_2x8);\n\ + VXC_DP2x8(dst1, in2, in2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l02_2x8);\n\ + VXC_DP2x8(dst1, in2, in2, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l03_2x8);\n\ + VXC_WriteImage(output, coord_out.xy, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_out.zy, dst1, VXC_MODIFIER(0, 13, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_DP4x4(dst0, in2, in3, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l10_4x4);\n\ + VXC_DP4x4(dst0, in2, in3, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l11_4x4);\n\ + VXC_DP4x4(dst0, in2, in3, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l12_4x4);\n\ + VXC_DP4x4(dst0, in2, in3, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l13_4x4);\n\ + VXC_DP4x4(dst1, in2, in3, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l14_4x4);\n\ + VXC_DP4x4(dst1, in2, in3, VXC_MODIFIER(4, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l15_4x4);\n\ + VXC_DP4x4(dst1, in2, in3, VXC_MODIFIER(8, 11, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l16_4x4);\n\ + VXC_DP4x4(dst1, in2, in3, VXC_MODIFIER(12, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x3_nhwc2_l17_4x4);\n\ + VXC_WriteImage(output, coord_out.xw, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_out.zw, dst1, VXC_MODIFIER(0, 13, 0,VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +_viv_uniform VXC_512Bits uniResize_x4_nhwc2_l00_4x8;\n\ +_viv_uniform VXC_512Bits uniResize_x4_nhwc2_l01_4x8;\n\ +_viv_uniform VXC_512Bits uniResize_x4_nhwc2_l10_4x8;\n\ +_viv_uniform VXC_512Bits uniResize_x4_nhwc2_l11_4x8;\n\ +__kernel void resize_bilinear_nhwc_U8toU8_4x_upsample_half_pixel_centers\n\ + (\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int align_corners,\n\ + int half_pixel_centers\n\ + )\n\ +{\n\ + int4 coord_out = (int4)(get_global_id(0), get_global_id(1), get_global_id(1), get_global_id(1));\n\ + int4 coord_in = (int4)(get_global_id(0), -1, 0, 0);\n\ + coord_in.x = ((coord_out.x - 3) >> 3) * 2;\n\ + coord_in.y = (coord_out.y * 2 - 3) >> 3;\n\ + coord_in.x = coord_out.x == 0 ? -2 : coord_in.x;\n\ + coord_in.y = coord_out.y == 0 ? 
-1 : coord_in.y;\n\ +\n\ + vxc_uchar16 in0, in1, in2, in3, dst0, dst1;\n\ +\n\ + VXC_ReadImage(in0, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(in1, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(in2, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 2), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(in3, input, coord_in.xy, VXC_5BITOFFSET_XY(0, 3), VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + coord_out.zw = coord_out.yy + (int2)(1, 2);\n\ +\n\ + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l00_4x8);\n\ + VXC_DP4x8(dst0, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l01_4x8);\n\ + VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l10_4x8);\n\ + VXC_DP4x8(dst1, in0, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l11_4x8);\n\ + VXC_WriteImage(output, coord_out.xy, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_out.xz, dst1, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + coord_out.yz = coord_out.yz + (int2)(3, 3);\n\ +\n\ + VXC_DP4x8(dst0, in2, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l10_4x8);\n\ + VXC_DP4x8(dst0, in2, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l11_4x8);\n\ + VXC_DP4x8(dst1, in2, in1, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l00_4x8);\n\ + VXC_DP4x8(dst1, in2, in1, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l01_4x8);\n\ + VXC_WriteImage(output, coord_out.xw, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_out.xy, dst1, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + coord_out.yw = coord_out.yw + (int2)(3, 3);\n\ +\n\ + VXC_DP4x8(dst0, in1, in2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l00_4x8);\n\ + VXC_DP4x8(dst0, in1, in2, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l01_4x8);\n\ + VXC_DP4x8(dst1, in1, in2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l10_4x8);\n\ + VXC_DP4x8(dst1, in1, in2, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l11_4x8);\n\ + VXC_WriteImage(output, coord_out.xz, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_out.xw, dst1, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ +\n\ + coord_out.zw = coord_out.zw + (int2)(3, 3);\n\ +\n\ + VXC_DP4x8(dst0, in3, in2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l10_4x8);\n\ + VXC_DP4x8(dst0, in3, in2, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l11_4x8);\n\ + VXC_DP4x8(dst1, in3, in2, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l00_4x8);\n\ + VXC_DP4x8(dst1, in3, in2, VXC_MODIFIER(8, 15, 0, VXC_RM_ToNearestEven, 1), uniResize_x4_nhwc2_l01_4x8);\n\ + VXC_WriteImage(output, coord_out.xy, dst0, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_out.xz, dst1, VXC_MODIFIER(0, 15, 0,VXC_RM_TowardZero, 0));\n\ +}"; /* end of resize_bilinear_nhwc_vx*/ + static const char resize_nearest_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniMultiplyAndPostShift_2x8;\n\ @@ -38502,6 +39890,63 @@ __kernel void scatter_nd_update_F16F16toF16(\n\ VXC_WriteImage(output, coord, tmpVal, 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ }\n\ \n\ +__kernel void scatter_nd_update_F16F16toU8(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __read_only image2d_t input2,\n\ + image2d_array_t output,\n\ + int width,\n\ + int area,\n\ + int vol,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int cnt = 0;\n\ +\n\ + vxc_short8 tmpVal = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + vxc_half8 sum;\n\ + _viv_asm(COPY, sum, tmpVal, 16);\n\ + Image img1 = create_image_from_image2d(input1, 4);\n\ + __global int* index_ptr = (__global int*)img1.ptr;\n\ + for(int i = 0; i < index_num; i++)\n\ + {\n\ + //int4 indice = read_imagei(input1, (int2)(0, i));\n\ + int4 indice = vload4(0, index_ptr + offset_idx);\n\ + index_ptr += coord_dim;\n\ + int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ + indice.w * offsetW;\n\ + if(gidy == idx)\n\ + {\n\ + vxc_half8 src;\n\ + VXC_ReadImage(tmpVal, input2, (int2)(gidx, i), 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + cnt++;\n\ + _viv_asm(COPY, src, tmpVal, 16);\n\ + VXC_DP2x8(sum, sum, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccumulateSum_2x8);\n\ + }\n\ + }\n\ + int2 coord = (int2)(gidx, gidy);\n\ + vxc_ushort8 ms0;\n\ + vxc_uchar8 dst;\n\ + if(cnt == 0)\n\ + {\n\ + vxc_half8 src;\n\ + VXC_ReadImage(tmpVal, input0, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16);\n\ + _viv_asm(COPY, src, tmpVal, 16);\n\ + VXC_DP2x8(dst, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\n\ + uniU8MulAndPostShift_0_Lo_2x8);\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + else\n\ + {\n\ + _viv_asm(COPY, ms0, multAndoutZP1, 16);\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\n\ + uniU8MulAndPostShift_1_Lo_2x8);\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ #define SCATTER_ND_UPDATE_QINT(src0_type_name, src2_type_name, out_type_name, data_type) \\\n\ __kernel void scatter_nd_update_##src0_type_name##src2_type_name##to##out_type_name##( \\\n\ __read_only image2d_t input0, \\\n\ @@ -38878,6 +40323,11 @@ _viv_uniform int offsetZ;\n\ _viv_uniform int offsetW;\n\ _viv_uniform int offset_idx;\n\ \n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8;\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift_1_Lo_2x8;\n\ +_viv_uniform int2 multAndoutZP0;\n\ +_viv_uniform int2 multAndoutZP1;//[0:15] multiplier, [31:63] output zp\n\ +\n\ __kernel void scatter_nd_update_F16F16toF16_big(\n\ __read_only image2d_t input0,\n\ __read_only image2d_t input1,\n\ @@ -38929,6 +40379,70 @@ __kernel void scatter_nd_update_F16F16toF16_big(\n\ }\n\ output_ptr[loc] = dst;\n\ }\n\ +\n\ +__kernel void scatter_nd_update_F16F16toU8_big(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __read_only image2d_t input2,\n\ + image2d_t output,\n\ + int width,\n\ + int area,\n\ + int vol,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0);\n\ + int gidy = get_global_id(1);\n\ + int cnt = 0;\n\ +\n\ + vxc_short8 tmpVal = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + vxc_half8 sum;\n\ + _viv_asm(COPY, sum, tmpVal, 16);\n\ + Image img1 = create_image_from_image2d(input1, 4);\n\ + Image img2 = create_image_from_image2d(input2, 2);\n\ + Image img3 = create_image_from_image2d(output, 1);\n\ +\n\ + __global int* index_ptr = (__global int*)img1.ptr;\n\ + __global short* 
update_ptr = (__global short*)img2.ptr;\n\ + __global uchar* output_ptr = (__global uchar*)img3.ptr;\n\ + for(int i = 0; i < index_num; i++)\n\ + {\n\ + int4 indice = vload4(0, index_ptr + offset_idx);\n\ + index_ptr += coord_dim;\n\ +\n\ + int idx = indice.x * offsetX + indice.y * offsetY + indice.z * offsetZ + indice.w * offsetW;\n\ + if(gidy == idx)\n\ + {\n\ + vxc_half8 src;\n\ + short tmpData = update_ptr[i * update_width + gidx];\n\ + cnt++;\n\ + _viv_asm(COPY, src, tmpData, 4);\n\ + VXC_DP2x8(sum, sum, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccumulateSum_2x8);\n\ + }\n\ + }\n\ + short dst;\n\ + vxc_ushort8 ms0;\n\ + int loc = gidy * output_width+ gidx;\n\ + if(cnt == 0)\n\ + {\n\ + vxc_half8 src;\n\ + Image img0 = create_image_from_image2d(input0, 2);\n\ + __global short* ref_ptr = (__global short*)img0.ptr;\n\ + short tmpData = ref_ptr[loc];\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16);\n\ + _viv_asm(COPY, src, tmpData, 4);\n\ + VXC_DP2x8(dst, src, ms0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1),\n\ + uniU8MulAndPostShift_0_Lo_2x8);\n\ + output_ptr[loc] = dst;\n\ + }\n\ + else\n\ + {\n\ + _viv_asm(COPY, ms0, multAndoutZP1, 16);\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1),\n\ + uniU8MulAndPostShift_1_Lo_2x8);\n\ + output_ptr[loc] = dst;\n\ + }\n\ +}\n\ "; /* end of scatter_nd_update_big_vx*/ static const char select_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -39898,11 +41412,15 @@ __kernel void tile_remain##name2##_##name0##to##name1( \\\n\ { \\\n\ coord_out.x = coord.x + x * width; \\\n\ if (isLastItem) \\\n\ + { \\\n\ VXC_WriteImage2DArray(output, coord_out, src, \\\n\ VXC_MODIFIER(0, remainder, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ else \\\n\ + { \\\n\ VXC_WriteImage2DArray(output, coord_out, src, \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ } \\\n\ } \\\n\ } \\\n\ @@ -39955,9 +41473,13 @@ __kernel void tile_remain##name2##_##name0##to##name1##_2D( \\\n\ do \\\n\ { \\\n\ if (isLastItem) \\\n\ + { \\\n\ VXC_WriteImage(output, coord, src, VXC_MODIFIER(0, remainder, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ else \\\n\ + { \\\n\ VXC_WriteImage(output, coord, src, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ coord.x += width; \\\n\ } while (coord.x < output_width); \\\n\ coord.x = get_global_id(0); \\\n\ @@ -40017,9 +41539,6 @@ __kernel void tile_1toN_##name0##to##name1##_2D( \\\n\ }\n\ TILE_2D_1TON(U8, U8, vxc_uchar8)\n\ TILE_2D_1TON(I16, I16, vxc_short8)\n\ -\n\ -\n\ -\n\ "; /* end of tile_vx*/ static const char tile_mix_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -40081,11 +41600,15 @@ __kernel void tile_remain##name2##_##name0##to##name1( \\\n\ { \\\n\ coord_out.x = coord.x + x * width; \\\n\ if (isLastItem) \\\n\ + { \\\n\ VXC_WriteImage2DArray(output, coord_out, dst, \\\n\ VXC_MODIFIER(0, remainder, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ else \\\n\ + { \\\n\ VXC_WriteImage2DArray(output, coord_out, dst, \\\n\ VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ } \\\n\ } \\\n\ } \\\n\ @@ -40138,9 +41661,13 @@ __kernel void tile_remain##name2##_##name0##to##name1##_2D( \\\n\ do \\\n\ { \\\n\ if (isLastItem) \\\n\ + { \\\n\ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, remainder, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ else \\\n\ + { \\\n\ VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ coord.x += width; \\\n\ } while (coord.x < output_width); \\\n\ coord.x = get_global_id(0); \\\n\ @@ -41444,13 +42971,13 @@ inline Tensor 
create_tensor_from_image2d_array(image2d_array_t input, int stride {\n\ int8 desc;\n\ int2 strides;\n\ + _viv_asm(COPY, desc, input, sizeof(desc));\n\ #if (USE_40BITS_VA==0)\n\ strides.x = desc.s1;\n\ strides.y = desc.s4;\n\ #else\n\ _viv_asm(GET_IMAGE_STRIDE, strides, input);\n\ #endif\n\ - _viv_asm(COPY, desc, input, sizeof(desc));\n\ uint address = as_uint(desc.s0);\n\ \n\ Tensor t =\n\ @@ -43341,13 +44868,14 @@ inline Tensor create_tensor_from_image2d_array(image2d_array_t input, int stride {\n\ int8 desc;\n\ int2 strides;\n\ + _viv_asm(COPY, desc, input, sizeof(desc));\n\ +\n\ #if (USE_40BITS_VA==0)\n\ strides.x = desc.s1;\n\ strides.y = desc.s4;\n\ #else\n\ _viv_asm(GET_IMAGE_STRIDE, strides, input);\n\ #endif\n\ - _viv_asm(COPY, desc, input, sizeof(desc));\n\ uint address = as_uint(desc.s0);\n\ \n\ Tensor t =\n\ @@ -43386,15 +44914,14 @@ inline Tensor create_tensor_from_image2d_array(image2d_array_t input, int stride } while(0)\n\ "; /* end of eltwise_ops_helper_cl*/ -static const char eltwise_unary_cl[] = "\n\ -float eltwise_unary_sin(float x, float alpha)\n\ +static const char eltwise_unary_cl[] = "float eltwise_unary_sin(float x, float alpha, float beta)\n\ {\n\ return native_sin(x);\n\ }\n\ \n\ #define logE (1.44269502f)\n\ #define twoLogE (logE * 2.0f)\n\ -float eltwise_unary_exp(float x, float alpha)\n\ +float eltwise_unary_exp(float x, float alpha, float beta)\n\ {\n\ x *= logE;\n\ x = exp2(x);\n\ @@ -43402,13 +44929,13 @@ float eltwise_unary_exp(float x, float alpha)\n\ }\n\ \n\ #define rlogE (0.693147182f)\n\ -float eltwise_unary_log(float x, float alpha)\n\ +float eltwise_unary_log(float x, float alpha, float beta)\n\ {\n\ x = log2(x);\n\ return x * rlogE;\n\ }\n\ \n\ -float eltwise_unary_elu(float val, float alpha)\n\ +float eltwise_unary_elu(float val, float alpha, float beta)\n\ {\n\ float x = val * logE;\n\ x = exp2(x) * alpha - alpha;\n\ @@ -43416,14 +44943,14 @@ float eltwise_unary_elu(float val, float alpha)\n\ return val < 0 ? 
x : val;\n\ }\n\ \n\ -float eltwise_unary_neg(float x, float alpha)\n\ +float eltwise_unary_neg(float x, float alpha, float beta)\n\ {\n\ return x * -1;\n\ }\n\ \n\ -float eltwise_unary_hard_sigmoid(float x, float alpha)\n\ +float eltwise_unary_hard_sigmoid(float x, float alpha, float beta)\n\ {\n\ - x = 0.2 * x + 0.5;\n\ + x = alpha * x + beta;\n\ x = clamp(x, 0, 1);\n\ return x;\n\ }\n\ @@ -43445,14 +44972,14 @@ float _tanh(float x, float alpha)\n\ return (2 * x - 1);\n\ }\n\ \n\ -float eltwise_unary_mish(float x, float alpha)\n\ +float eltwise_unary_mish(float x, float alpha, float beta)\n\ {\n\ float y = _softrelu(x, alpha);\n\ x = x * _tanh(y, alpha);\n\ return x;\n\ }\n\ \n\ -float eltwise_unary_round(float x, float alpha)\n\ +float eltwise_unary_round(float x, float alpha, float beta)\n\ {\n\ return convert_float(convert_int_rte(x));\n\ }\n\ @@ -43486,7 +45013,7 @@ float erf_eval(float x)\n\ return res * MUL2_RSQRTPI;\n\ }\n\ #define RSQRT2 (0.70710678118654752440084436210485f)\n\ -float eltwise_unary_gelu(float x, float alpha)\n\ +float eltwise_unary_gelu(float x, float alpha, float beta)\n\ {\n\ x = 0.5f * x * (1 + erf_eval(x * RSQRT2));\n\ \n\ @@ -43494,7 +45021,7 @@ float eltwise_unary_gelu(float x, float alpha)\n\ }\n\ \n\ #define SQRT_2_RCP_PI 0.7978845834732056f\n\ -float eltwise_unary_hard_gelu(float x, float alpha)\n\ +float eltwise_unary_hard_gelu(float x, float alpha, float beta)\n\ {\n\ float cdf = 0.5f + 0.5f * _tanh(SQRT_2_RCP_PI *\n\ (x + 0.044715f * x * x * x), 0);\n\ @@ -43510,7 +45037,8 @@ __kernel void func_name##_F32toF32 \\\n\ float inputTail, \\\n\ float outputScale, \\\n\ float outputZP, \\\n\ - float alpha \\\n\ + float alpha, \\\n\ + float beta \\\n\ ) \\\n\ { \\\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ @@ -43518,7 +45046,7 @@ __kernel void func_name##_F32toF32 \\\n\ float4 src = read_imagef(input, coord); \\\n\ \\\n\ float4 dst = 0; \\\n\ - dst.x = eltwise_unary_##func_name(src.x, alpha); \\\n\ + dst.x = eltwise_unary_##func_name(src.x, alpha, beta); \\\n\ \\\n\ write_imagef(output, coord, dst.xxxx); \\\n\ }\n\ @@ -43542,7 +45070,8 @@ __kernel void func_name##_F32toF32_2D \\\n\ float inputTail, \\\n\ float outputScale, \\\n\ float outputZP, \\\n\ - float alpha \\\n\ + float alpha, \\\n\ + float beta \\\n\ ) \\\n\ { \\\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ @@ -43550,7 +45079,7 @@ __kernel void func_name##_F32toF32_2D \\\n\ float4 src = read_imagef(input, coord); \\\n\ \\\n\ float4 dst = 0; \\\n\ - dst.x = eltwise_unary_##func_name(src.x, alpha); \\\n\ + dst.x = eltwise_unary_##func_name(src.x, alpha, beta); \\\n\ \\\n\ write_imagef(output, coord, dst.xxxx); \\\n\ }\n\ @@ -43574,7 +45103,8 @@ __kernel void func_name##_U8toU8 \\\n\ float inputTail, \\\n\ float outputScale, \\\n\ float outputZP, \\\n\ - float alpha \\\n\ + float alpha, \\\n\ + float beta \\\n\ ) \\\n\ { \\\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ @@ -43582,7 +45112,7 @@ __kernel void func_name##_U8toU8 \\\n\ uint4 src = read_imageui(input, coord); \\\n\ float4 data = convert_float4(src) * inputScale - inputTail; \\\n\ \\\n\ - data.x = eltwise_unary_##func_name(data.x, alpha); \\\n\ + data.x = eltwise_unary_##func_name(data.x, alpha, beta); \\\n\ uint4 dst = convert_uint4(data * outputScale + outputZP); \\\n\ \\\n\ write_imageui(output, coord, dst); \\\n\ @@ -43607,7 +45137,8 @@ __kernel void func_name##_U8toU8_2D \\\n\ float inputTail, \\\n\ float outputScale, \\\n\ float 
outputZP, \\\n\ - float alpha \\\n\ + float alpha, \\\n\ + float beta \\\n\ ) \\\n\ { \\\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ @@ -43615,7 +45146,7 @@ __kernel void func_name##_U8toU8_2D \\\n\ uint4 src = read_imageui(input, coord); \\\n\ float4 data = convert_float4(src) * inputScale - inputTail; \\\n\ \\\n\ - data.x = eltwise_unary_##func_name(data.x, alpha); \\\n\ + data.x = eltwise_unary_##func_name(data.x, alpha, beta); \\\n\ uint4 dst = convert_uint4(data * outputScale + outputZP); \\\n\ \\\n\ write_imageui(output, coord, dst); \\\n\ @@ -43639,7 +45170,8 @@ __kernel void neg_I32toI32\n\ float inputTail,\n\ float outputScale,\n\ float outputZP,\n\ - float alpha\n\ + float alpha,\n\ + float beta\n\ )\n\ {\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ @@ -43658,7 +45190,8 @@ __kernel void neg_I32toI32_2D\n\ float inputTail,\n\ float outputScale,\n\ float outputZP,\n\ - float alpha\n\ + float alpha,\n\ + float beta\n\ )\n\ {\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ @@ -43854,7 +45387,10 @@ __kernel void floordiv_I32I32toU8(\n\ int4 src1;\n\ READ_IMAGEI_2DARRAY(src0, input, coord);\n\ READ_IMAGEI_2DARRAY(src1, input1, coord);\n\ - uint4 dst = convert_uint4(floor(convert_float4(src0) / convert_float4(src1)) * outputScale + outputTail);\n\ + float4 in0 = convert_float4(src0) * input0Scale + input0Tail;\n\ + float4 in1 = convert_float4(src1) * input1Scale + input1Tail;\n\ + float4 out = floor(in0 / in1) * outputScale + outputTail;\n\ + uint4 dst = convert_uint4(out);\n\ write_imageui(output, coord, dst);\n\ }\n\ \n\ @@ -43872,7 +45408,10 @@ __kernel void floordiv_I32I32toU8_2D(\n\ int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ int4 src0 = read_imagei(input, coord);\n\ int4 src1 = read_imagei(input1, coord);\n\ - uint4 dst = convert_uint4(floor(convert_float4(src0) / convert_float4(src1)) * outputScale + outputTail);\n\ + float4 in0 = convert_float4(src0) * input0Scale + input0Tail;\n\ + float4 in1 = convert_float4(src1) * input1Scale + input1Tail;\n\ + float4 out = floor(in0 / in1) * outputScale + outputTail;\n\ + uint4 dst = convert_uint4(out);\n\ write_imageui(output, coord, dst);\n\ }\n\ \n\ @@ -45154,6 +46693,363 @@ static const char grucell_activation_sma_cl[] = "__kernel void grucell_activatio }\n\ "; /* end of grucell_activation_sma_cl*/ +static const char grucell_activation_z_h_cl[] = "#define logE (1.44269502f)\n\ +#define twoLogE (logE * 2.0f)\n\ +\n\ +float sigmoid(float x)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float hard_sigmoid(float x)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float tanh_func(float x)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +\n\ +\n\ +#define GRUCELL_ACTIVATION_U8_F32_U8(act_name, act_func) \\\n\ +__kernel void grucell_activation_z_h_U8_F32toU8_##act_name( \\\n\ + __read_only image2d_t hstate_in, \\\n\ + __read_only image2d_t input_z_conv, \\\n\ + __read_only image2d_t input_h_conv, \\\n\ + __read_only image2d_t hstate_z_conv, \\\n\ + __read_only image2d_t hstate_h_conv, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t hstate_out, \\\n\ + float input_scale, float input_tail, float output_scale, float output_zp) \\\n\ +{ \\\n\ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + float4 src0, src1, src2, src3; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 h_tm = 
convert_float4(read_imageui(hstate_in, coord_in.xy)); \\\n\ + float4 h1 = read_imagef(hstate_h_conv, coord_in.xy); \\\n\ + float4 h0 = read_imagef(input_h_conv, coord_in.xy); \\\n\ + float4 z0 = read_imagef(input_z_conv, coord_in.xy); \\\n\ + float4 z1 = read_imagef(hstate_z_conv, coord_in.xy); \\\n\ + \\\n\ + h_tm = h_tm * input_scale + input_tail; \\\n\ + float4 h = h0 + h1; \\\n\ + float4 z = z0 + z1; \\\n\ + z.x = act_func(z.x); \\\n\ + h = tanh_func(h.x); \\\n\ + float4 dst = (1 - z ) * h + z * h_tm; \\\n\ + dst = dst * output_scale + output_zp; \\\n\ + uint4 result = convert_uint4_sat_rte(dst); \\\n\ + write_imageui(output, coord_in.xy, result); \\\n\ + write_imageui(hstate_out, coord_in.xy, result); \\\n\ +}\n\ +GRUCELL_ACTIVATION_U8_F32_U8(SIGMOID, sigmoid)\n\ +//GRUCELL_ACTIVATION_U8_F32_U8(HARD_SIGMOID, hard_sigmoid)\n\ +\n\ +#define GRUCELL_ACTIVATION_F32_F32_F32(act_name, act_func) \\\n\ +__kernel void grucell_activation_z_h_F32_F32toF32_##act_name( \\\n\ + __read_only image2d_t hstate_in, \\\n\ + __read_only image2d_t input_z_conv, \\\n\ + __read_only image2d_t input_h_conv, \\\n\ + __read_only image2d_t hstate_z_conv, \\\n\ + __read_only image2d_t hstate_h_conv, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t hstate_out, \\\n\ + float input_scale, float input_tail, float output_scale, float output_zp) \\\n\ +{ \\\n\ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + float4 src0, src1, src2, src3; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 h1 = read_imagef(hstate_h_conv, coord_in.xy); \\\n\ + float4 h0 = read_imagef(input_h_conv, coord_in.xy); \\\n\ + float4 z0 = read_imagef(input_z_conv, coord_in.xy); \\\n\ + float4 z1 = read_imagef(hstate_z_conv, coord_in.xy); \\\n\ + float4 h_tm = read_imagef(hstate_in, coord_in.xy); \\\n\ + \\\n\ + float4 h = h0 + h1; \\\n\ + float4 z = z0 + z1; \\\n\ + z.x = act_func(z.x); \\\n\ + h = tanh_func(h.x); \\\n\ + float4 dst = (1 - z ) * h + z * h_tm; \\\n\ + write_imagef(output, coord_in.xy, dst); \\\n\ + write_imagef(hstate_out, coord_in.xy, dst); \\\n\ +}\n\ +\n\ +GRUCELL_ACTIVATION_F32_F32_F32(SIGMOID, sigmoid)\n\ +//GRUCELL_ACTIVATION_U8_F32_U8(HARD_SIGMOID, hard_sigmoid)\n\ +\n\ +#define GRUCELL_ACTIVATION_I32_F32_I32(act_name, act_func) \\\n\ +__kernel void grucell_activation_z_h_I32_F32toI32_##act_name( \\\n\ + __read_only image2d_t hstate_in, \\\n\ + __read_only image2d_t input_z_conv, \\\n\ + __read_only image2d_t input_h_conv, \\\n\ + __read_only image2d_t hstate_z_conv, \\\n\ + __read_only image2d_t hstate_h_conv, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t hstate_out, \\\n\ + float input_scale, float input_tail, float output_scale, float output_zp) \\\n\ +{ \\\n\ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + float4 src0, src1, src2, src3; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 h_tm = convert_float4(read_imagei(hstate_in, coord_in.xy)); \\\n\ + float4 h1 = read_imagef(hstate_h_conv, coord_in.xy); \\\n\ + float4 h0 = read_imagef(input_h_conv, coord_in.xy); \\\n\ + float4 z0 = read_imagef(input_z_conv, coord_in.xy); \\\n\ + float4 z1 = read_imagef(hstate_z_conv, coord_in.xy); \\\n\ + \\\n\ + h_tm = h_tm * input_scale + input_tail; \\\n\ + float4 h = h0 + h1; \\\n\ + float4 z = z0 + z1; \\\n\ + z.x = act_func(z.x); \\\n\ + h = tanh_func(h.x); \\\n\ + float4 dst = (1 - z ) * h + z * h_tm; \\\n\ + dst = dst * output_scale + output_zp; \\\n\ + int4 result = 
convert_int4_sat_rte(dst); \\\n\ + write_imagei(output, coord_in.xy, result); \\\n\ + write_imagei(hstate_out, coord_in.xy, result); \\\n\ +}\n\ +GRUCELL_ACTIVATION_I32_F32_I32(SIGMOID, sigmoid)\n\ +//GRUCELL_ACTIVATION_U8_F32_U8(HARD_SIGMOID, hard_sigmoid)"; /* end of grucell_activation_z_h_cl*/ + +static const char grucell_h_times_activation_r_cl[] = "#define logE (1.44269502f)\n\ +#define twoLogE (logE * 2.0f)\n\ +\n\ +float sigmoid(float x)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float hard_sigmoid(float x)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +\n\ +#define GRUCELL_H_TIMES_R_U8_F32_F32(act_name, act_func) \\\n\ +__kernel void grucell_h_times_activation_r_U8_F32toF32_##act_name( \\\n\ + __read_only image2d_t hstate_in, \\\n\ + __read_only image2d_t input_r_conv, \\\n\ + __read_only image2d_t hstate_r_conv, \\\n\ + __write_only image2d_t output, \\\n\ + float input_scale, float input_tail) \\\n\ +{ \\\n\ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + float4 src0, src1, src2, src3; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 r0 = read_imagef(input_r_conv, coord_in.xy); \\\n\ + float4 r1 = read_imagef(hstate_r_conv, coord_in.xy); \\\n\ + float4 h_tm = convert_float4(read_imageui(hstate_in, coord_in.xy)); \\\n\ + \\\n\ + float4 r = r0 + r1; \\\n\ + r.x = act_func(r.x); \\\n\ + h_tm = h_tm * input_scale + input_tail; \\\n\ + float4 r_times_h = r * h_tm; \\\n\ + write_imagef(output, coord_in.xy, r_times_h); \\\n\ +}\n\ +GRUCELL_H_TIMES_R_U8_F32_F32(SIGMOID, sigmoid)\n\ +//GRUCELL_H_TIMES_R_U8_F32_F32(HARD_SIGMOID, hard_sigmoid)\n\ +\n\ +#define GRUCELL_H_TIMES_R_F32_F32_F32(act_name, act_func) \\\n\ +__kernel void grucell_h_times_activation_r_F32_F32toF32_##act_name( \\\n\ + __read_only image2d_t hstate_in, \\\n\ + __read_only image2d_t input_r_conv, \\\n\ + __read_only image2d_t hstate_r_conv, \\\n\ + __write_only image2d_t output, \\\n\ + float input_scale, float input_tail) \\\n\ +{ \\\n\ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + float4 src0, src1, src2, src3; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 r0 = read_imagef(input_r_conv, coord_in.xy); \\\n\ + float4 r1 = read_imagef(hstate_r_conv, coord_in.xy); \\\n\ + float4 h_tm = read_imagef(hstate_in, coord_in.xy); \\\n\ + \\\n\ + float4 r = r0 + r1; \\\n\ + r.x = act_func(r.x); \\\n\ + float4 r_times_h = r * h_tm; \\\n\ + write_imagef(output, coord_in.xy, r_times_h); \\\n\ +}\n\ +\n\ +GRUCELL_H_TIMES_R_F32_F32_F32(SIGMOID, sigmoid)\n\ +//GRUCELL_H_TIMES_R_F32_F32_F32(HARD_SIGMOID, hard_sigmoid)\n\ +\n\ +#define GRUCELL_H_TIMES_R_I32_F32_F32(act_name, act_func) \\\n\ +__kernel void grucell_h_times_activation_r_I32_F32toI32_##act_name( \\\n\ + __read_only image2d_t hstate_in, \\\n\ + __read_only image2d_t input_r_conv, \\\n\ + __read_only image2d_t hstate_r_conv, \\\n\ + __write_only image2d_t output, \\\n\ + float input_scale, float input_tail) \\\n\ +{ \\\n\ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + float4 src0, src1, src2, src3; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 r0 = read_imagef(input_r_conv, coord_in.xy); \\\n\ + float4 r1 = read_imagef(hstate_r_conv, coord_in.xy); \\\n\ + float4 h_tm = convert_float4(read_imagei(hstate_in, coord_in.xy)); \\\n\ + \\\n\ + float4 r = r0 + r1; \\\n\ + r.x = act_func(r.x); \\\n\ + h_tm = h_tm * input_scale + input_tail; \\\n\ + float4 
r_times_h = r * h_tm; \\\n\ + write_imagef(output, coord_in.xy, r_times_h); \\\n\ +}\n\ +GRUCELL_H_TIMES_R_I32_F32_F32(SIGMOID, sigmoid)\n\ +//GRUCELL_H_TIMES_R_I32_F32_F32(HARD_SIGMOID, hard_sigmoid)"; /* end of grucell_h_times_activation_r_cl*/ + +static const char grucell_reset_after_activation_cl[] = "#define logE (1.44269502f)\n\ +#define twoLogE (logE * 2.0f)\n\ +\n\ +float sigmoid(float x)\n\ +{\n\ + x *= -logE;\n\ + x = 1 + exp2(x);\n\ + return 1 / x;\n\ +}\n\ +float hard_sigmoid(float x)\n\ +{\n\ + x = 0.2 * x + 0.5;\n\ + x = clamp(x, 0, 1);\n\ + return x;\n\ +}\n\ +float tanh_func(float x)\n\ +{\n\ + x *= -twoLogE;\n\ + x = 1 + exp2(x);\n\ + x = 1 / x;\n\ + return 2 * x - 1;\n\ +}\n\ +\n\ +\n\ +#define GRUCELL_ACTIVATION_U8_F32_U8(act_name, act_func) \\\n\ +__kernel void grucell_reset_after_activation_U8_F32toU8_##act_name( \\\n\ + __read_only image2d_t hstate_in, \\\n\ + __read_only image2d_t input_z_conv, \\\n\ + __read_only image2d_t input_r_conv, \\\n\ + __read_only image2d_t input_h_conv, \\\n\ + __read_only image2d_t hstate_z_conv, \\\n\ + __read_only image2d_t hstate_r_conv, \\\n\ + __read_only image2d_t hstate_h_conv, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t hstate_out, \\\n\ + float input_scale, float input_tail, float output_scale, float output_zp) \\\n\ +{ \\\n\ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + float4 src0, src1, src2, src3; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 r0 = read_imagef(input_r_conv, coord_in.xy); \\\n\ + float4 r1 = read_imagef(hstate_r_conv, coord_in.xy); \\\n\ + float4 h1 = read_imagef(hstate_h_conv, coord_in.xy); \\\n\ + float4 h_tm = convert_float4(read_imageui(hstate_in, coord_in.xy)); \\\n\ + float4 h0 = read_imagef(input_h_conv, coord_in.xy); \\\n\ + float4 z0 = read_imagef(input_z_conv, coord_in.xy); \\\n\ + float4 z1 = read_imagef(hstate_z_conv, coord_in.xy); \\\n\ + \\\n\ + float4 r = r0 + r1; \\\n\ + r.x = act_func(r.x); \\\n\ + h_tm = h_tm * input_scale + input_tail; \\\n\ + float4 r_times_h = r * h1; \\\n\ + float4 h = h0 + r_times_h; \\\n\ + float4 z = z0 + z1; \\\n\ + z.x = act_func(z.x); \\\n\ + h = tanh_func(h.x); \\\n\ + float4 dst = (1 - z ) * h + z * h_tm; \\\n\ + dst = dst * output_scale + output_zp; \\\n\ + uint4 result = convert_uint4_sat_rte(dst); \\\n\ + write_imageui(output, coord_in.xy, result); \\\n\ + write_imageui(hstate_out, coord_in.xy, result); \\\n\ +}\n\ +GRUCELL_ACTIVATION_U8_F32_U8(SIGMOID, sigmoid)\n\ +//GRUCELL_ACTIVATION_U8_F32_U8(HARD_SIGMOID, hard_sigmoid)\n\ +\n\ +#define GRUCELL_ACTIVATION_F32_F32_F32(act_name, act_func) \\\n\ +__kernel void grucell_reset_after_activation_F32_F32toF32_##act_name( \\\n\ + __read_only image2d_t hstate_in, \\\n\ + __read_only image2d_t input_z_conv, \\\n\ + __read_only image2d_t input_r_conv, \\\n\ + __read_only image2d_t input_h_conv, \\\n\ + __read_only image2d_t hstate_z_conv, \\\n\ + __read_only image2d_t hstate_r_conv, \\\n\ + __read_only image2d_t hstate_h_conv, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t hstate_out, \\\n\ + float input_scale, float input_tail, float output_scale, float output_zp) \\\n\ +{ \\\n\ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + float4 src0, src1, src2, src3; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 r0 = read_imagef(input_r_conv, coord_in.xy); \\\n\ + float4 r1 = read_imagef(hstate_r_conv, coord_in.xy); \\\n\ + float4 h1 = read_imagef(hstate_h_conv, 
coord_in.xy); \\\n\ + float4 h_tm = read_imagef(hstate_in, coord_in.xy); \\\n\ + float4 h0 = read_imagef(input_h_conv, coord_in.xy); \\\n\ + float4 z0 = read_imagef(input_z_conv, coord_in.xy); \\\n\ + float4 z1 = read_imagef(hstate_z_conv, coord_in.xy); \\\n\ + \\\n\ + float4 r = r0 + r1; \\\n\ + r.x = act_func(r.x); \\\n\ + float4 r_times_h = r * h1; \\\n\ + float4 h = h0 + r_times_h; \\\n\ + float4 z = z0 + z1; \\\n\ + z.x = act_func(z.x); \\\n\ + h = tanh_func(h.x); \\\n\ + float4 dst = (1 - z ) * h + z * h_tm; \\\n\ + write_imagef(output, coord_in.xy, dst); \\\n\ + write_imagef(hstate_out, coord_in.xy, dst); \\\n\ +}\n\ +\n\ +GRUCELL_ACTIVATION_F32_F32_F32(SIGMOID, sigmoid)\n\ +//GRUCELL_ACTIVATION_U8_F32_U8(HARD_SIGMOID, hard_sigmoid)\n\ +\n\ +#define GRUCELL_ACTIVATION_I32_F32_I32(act_name, act_func) \\\n\ +__kernel void grucell_reset_after_activation_I32_F32toI32_##act_name( \\\n\ + __read_only image2d_t hstate_in, \\\n\ + __read_only image2d_t input_z_conv, \\\n\ + __read_only image2d_t input_r_conv, \\\n\ + __read_only image2d_t input_h_conv, \\\n\ + __read_only image2d_t hstate_z_conv, \\\n\ + __read_only image2d_t hstate_r_conv, \\\n\ + __read_only image2d_t hstate_h_conv, \\\n\ + __write_only image2d_t output, \\\n\ + __write_only image2d_t hstate_out, \\\n\ + float input_scale, float input_tail, float output_scale, float output_zp) \\\n\ +{ \\\n\ + int2 coord_in = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + float4 src0, src1, src2, src3; \\\n\ + float4 data_i_t, data_f_t, data_g_t, data_o_t, data_c_t; \\\n\ + float4 r0 = read_imagef(input_r_conv, coord_in.xy); \\\n\ + float4 r1 = read_imagef(hstate_r_conv, coord_in.xy); \\\n\ + float4 h1 = read_imagef(hstate_h_conv, coord_in.xy); \\\n\ + float4 h_tm = convert_float4(read_imagei(hstate_in, coord_in.xy)); \\\n\ + float4 h0 = read_imagef(input_h_conv, coord_in.xy); \\\n\ + float4 z0 = read_imagef(input_z_conv, coord_in.xy); \\\n\ + float4 z1 = read_imagef(hstate_z_conv, coord_in.xy); \\\n\ + \\\n\ + float4 r = r0 + r1; \\\n\ + r.x = act_func(r.x); \\\n\ + h_tm = h_tm * input_scale + input_tail; \\\n\ + float4 r_times_h = r * h1; \\\n\ + float4 h = h0 + r_times_h; \\\n\ + float4 z = z0 + z1; \\\n\ + z.x = act_func(z.x); \\\n\ + h = tanh_func(h.x); \\\n\ + float4 dst = (1 - z ) * h + z * h_tm; \\\n\ + dst = dst * output_scale + output_zp; \\\n\ + int4 result = convert_int4_sat_rte(dst); \\\n\ + write_imagei(output, coord_in.xy, result); \\\n\ + write_imagei(hstate_out, coord_in.xy, result); \\\n\ +}\n\ +GRUCELL_ACTIVATION_I32_F32_I32(SIGMOID, sigmoid)\n\ +//GRUCELL_ACTIVATION_U8_F32_U8(HARD_SIGMOID, hard_sigmoid)"; /* end of grucell_reset_after_activation_cl*/ + static const char hswish_cl[] = "#define HSWISH_F32_F32_PROCESS() \\\n\ float4 src, tmp, dst; \\\n\ src = read_imagef(input, coord); \\\n\ @@ -55827,14 +57723,20 @@ static const source_map_t evis_resource[] = {"gather_nd_mix_vx", gather_nd_mix_vx}, {"get_matrix_vx", get_matrix_vx}, {"group_normalization_f16_vx", group_normalization_f16_vx}, + {"group_normalization_f16_scale_vx", group_normalization_f16_scale_vx}, {"group_normalization_i16_vx", group_normalization_i16_vx}, + {"group_normalization_i16_scale_vx", group_normalization_i16_scale_vx}, {"group_normalization_i8_vx", group_normalization_i8_vx}, + {"group_normalization_i8_scale_vx", group_normalization_i8_scale_vx}, {"group_normalization_u8_vx", group_normalization_u8_vx}, {"group_normalization_u8_f16_vx", group_normalization_u8_f16_vx}, {"grucell_activation_vx", grucell_activation_vx}, 
{"grucell_activation_sma_vx", grucell_activation_sma_vx}, + {"grucell_activation_z_h_vx", grucell_activation_z_h_vx}, {"grucell_cdnn_activation_vx", grucell_cdnn_activation_vx}, {"grucell_cdnn_activation_u8_vx", grucell_cdnn_activation_u8_vx}, + {"grucell_h_times_activation_r_vx", grucell_h_times_activation_r_vx}, + {"grucell_reset_after_activation_vx", grucell_reset_after_activation_vx}, {"hswish_vx", hswish_vx}, {"instance_normalization_f16_vx", instance_normalization_f16_vx}, {"instance_normalization_i16_vx", instance_normalization_i16_vx}, @@ -55923,6 +57825,7 @@ static const source_map_t evis_resource[] = {"pow_u8_vx", pow_u8_vx}, {"pre_process_bgra_vx", pre_process_bgra_vx}, {"pre_process_gray_vx", pre_process_gray_vx}, + {"pre_process_gray_2_vx", pre_process_gray_2_vx}, {"pre_process_gray_copy_vx", pre_process_gray_copy_vx}, {"pre_process_nv12_scale_vx", pre_process_nv12_scale_vx}, {"pre_process_nv12_scale_8bits_vx", pre_process_nv12_scale_8bits_vx}, @@ -55976,6 +57879,7 @@ static const source_map_t evis_resource[] = {"resize_bilinear_U8_vx", resize_bilinear_U8_vx}, {"resize_bilinear_U8_half_pixel_centers_vx", resize_bilinear_U8_half_pixel_centers_vx}, {"resize_bilinear_U8_opt_vx", resize_bilinear_U8_opt_vx}, + {"resize_bilinear_nhwc_vx", resize_bilinear_nhwc_vx}, {"resize_nearest_vx", resize_nearest_vx}, {"scatter_nd_vx", scatter_nd_vx}, {"scatter_nd_big_vx", scatter_nd_big_vx}, @@ -56027,6 +57931,9 @@ static const source_map_t cl_resource[] = {"group_normalization_u8_cl", group_normalization_u8_cl}, {"grucell_activation_cl", grucell_activation_cl}, {"grucell_activation_sma_cl", grucell_activation_sma_cl}, + {"grucell_activation_z_h_cl", grucell_activation_z_h_cl}, + {"grucell_h_times_activation_r_cl", grucell_h_times_activation_r_cl}, + {"grucell_reset_after_activation_cl", grucell_reset_after_activation_cl}, {"hswish_cl", hswish_cl}, {"instance_normalization_f16_cl", instance_normalization_f16_cl}, {"instance_normalization_f32_cl", instance_normalization_f32_cl}, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_abs.c b/src/tim/vx/internal/src/ops/vsi_nn_op_abs.c index d4cf2ae..1ce386a 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_abs.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_abs.c @@ -33,6 +33,7 @@ #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" static vsi_status op_compute @@ -42,48 +43,15 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status; - vsi_size_t input_size[VSI_NN_MAX_DIM_NUM] = {0}; - uint32_t dims = 0; - vx_tensor input = NULL, input0 = NULL; - vx_tensor output = NULL, output0 = NULL; - status = VSI_FAILURE; + vsi_status status = VSI_SUCCESS; + vsi_nn_kernel_node_t n; - if (inputs[0]->attr.dim_num > 4) + n = vsi_nn_kernel_selector( self->graph, "abs", inputs, 1, outputs, 1, NULL ); + if( n == NULL ) { - input_size[0] = (int32_t)vsi_nn_GetElementNum(inputs[0]) / - inputs[0]->attr.size[inputs[0]->attr.dim_num - 1]; - input_size[1] = inputs[0]->attr.size[inputs[0]->attr.dim_num - 1]; - dims= 2; -#ifdef VSI_40BIT_VA_SUPPORT - input = vxReshapeTensor(inputs[0]->t, input_size, dims); - output = vxReshapeTensor(outputs[0]->t, input_size, dims); -#else - input = vxReshapeTensor(inputs[0]->t, (vx_int32*)input_size, (vx_uint32)dims); - output = vxReshapeTensor(outputs[0]->t, (vx_int32*)input_size, (vx_uint32)dims); -#endif - input0 = input; - output0 = output; - } - else - { - input0 = inputs[0]->t; - output0 = 
outputs[0]->t; + status = VSI_FAILURE; } - self->n = vxLeakyReluLayer( - self->graph->g, - input0, - -1, - output0 - ); - - if( NULL != self->n ) - { - status = VSI_SUCCESS; - } - if (input) vxReleaseTensor(&input); - if (output) vxReleaseTensor(&output); return status; } /* op_compute() */ @@ -152,4 +120,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c index 2e67b83..70ff65e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c @@ -301,8 +301,8 @@ static vsi_status op_optimize reshape 3d input (xcn) --> 4d input (whcn) reshape 3d output(xcn) --> 4d output(whcn) */ - shape[0] = 1; - shape[1] = inputs[0]->attr.size[0]; + shape[0] = inputs[0]->attr.size[0]; + shape[1] = 1; shape[2] = inputs[0]->attr.size[1]; shape[3] = inputs[0]->attr.size[2]; dim = 4; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c b/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c index 6a43126..5d16c2b 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c @@ -183,7 +183,7 @@ static vsi_status op_init vsi_status status = VSI_SUCCESS; self->nn_param.clip.local2 = (vsi_nn_clip_lcl2_data *)malloc(sizeof(vsi_nn_clip_lcl2_data)); - if (NULL == self->nn_param.reduce.local2) + if (NULL == self->nn_param.clip.local2) { return VX_ERROR_NO_MEMORY; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d.c index 5c0b7ad..8c216ea 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d.c @@ -32,6 +32,7 @@ #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_dtype_util.h" +#include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" static vsi_status op_compute @@ -41,58 +42,31 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vx_tensor bias; - vsi_status status; - vx_nn_convolution_params_ext_t *p_ext = NULL; - vx_nn_convolution_params_ext2_t *p_ext2 = NULL; - vx_nn_convolution_params_ext2_t param_ext2; - memset( ¶m_ext2, 0, sizeof( vx_nn_convolution_params_ext2_t ) ); - p_ext2 = ¶m_ext2; - p_ext = &p_ext2->ext; - - status = VSI_FAILURE; - - //set ext relative parameters - p_ext->khr.padding_x = self->nn_param.conv2d.pad[0]; - p_ext->khr.padding_y = self->nn_param.conv2d.pad[2]; - if (self->nn_param.conv2d.dilation[0] > 0) - { - p_ext->khr.dilation_x = self->nn_param.conv2d.dilation[0] - 1; + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; + param = vsi_nn_kernel_param_create(); + vsi_nn_kernel_param_add_int32( param, "stride_w", self->nn_param.conv2d.stride[0] ); + vsi_nn_kernel_param_add_int32( param, "stride_h", self->nn_param.conv2d.stride[1] ); + vsi_nn_kernel_param_add_int32( param, "pad_h_front", self->nn_param.conv2d.pad[2] ); + vsi_nn_kernel_param_add_int32( param, "pad_h_end", self->nn_param.conv2d.pad[3] ); + vsi_nn_kernel_param_add_int32( param, "pad_w_front", self->nn_param.conv2d.pad[0] ); + vsi_nn_kernel_param_add_int32( param, "pad_w_end", self->nn_param.conv2d.pad[1] ); + vsi_nn_kernel_param_add_int32( param, "dilation_w", self->nn_param.conv2d.dilation[0] ); + vsi_nn_kernel_param_add_int32( param, "dilation_h", self->nn_param.conv2d.dilation[1] ); + vsi_nn_kernel_param_add_int32( param, "overflow_policy", self->vx_param.overflow_policy ); + vsi_nn_kernel_param_add_int32( param, 
"rounding_policy", self->vx_param.rounding_policy ); + vsi_nn_kernel_param_add_int32( param, + "down_scale_size_rounding", self->vx_param.down_scale_size_rounding ); + if (self->nn_param.conv2d.multiplier != 0) { + vsi_nn_kernel_param_add_int32( param, "multiplier", + self->nn_param.conv2d.multiplier ); + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "depthwise_conv2d", + inputs, 3, outputs, 1, param ); + } else { + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "conv2d", + inputs, 3, outputs, 1, param ); } - if (self->nn_param.conv2d.dilation[1] > 0) - { - p_ext->khr.dilation_y = self->nn_param.conv2d.dilation[1] - 1; - } - p_ext->khr.overflow_policy = self->vx_param.overflow_policy; - p_ext->khr.rounding_policy = self->vx_param.rounding_policy; - p_ext->khr.down_scale_size_rounding = self->vx_param.down_scale_size_rounding; - - p_ext->padding_x_right = self->nn_param.conv2d.pad[1]; - p_ext->padding_y_bottom = self->nn_param.conv2d.pad[3]; - - //set ext2 relative parameters - p_ext2->depth_multiplier = self->nn_param.conv2d.multiplier; - p_ext2->stride_x = self->nn_param.conv2d.stride[0]; - p_ext2->stride_y = self->nn_param.conv2d.stride[1]; - - if( inputs[2] == NULL ) - { - bias = NULL; - } - else - { - bias = inputs[2]->t; - } - - self->n = vxConvolutionLayer( - self->graph->g, - inputs[0]->t, - inputs[1]->t, - bias, - (vx_nn_convolution_params_t *)p_ext2, - sizeof( vx_nn_convolution_params_ext2_t ), - outputs[0]->t - ); + vsi_nn_kernel_param_release( ¶m ); if( NULL != self->n ) { @@ -306,6 +280,20 @@ static vsi_bool op_check IO_TYPE(D_F32, D_BF16, D_F32, D_BF16) IO_TYPE(D_F32, D_BF16, D_F32, D_F32) + IO_TYPE(D_U4|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_U4|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_I4|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_I4|Q_ASYM, D_I8|Q_ASYM, D_I32|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_I4|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I4|Q_ASYM) + IO_TYPE(D_I4|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U4|Q_ASYM) + IO_TYPE(D_I4|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I4|Q_ASYM) + IO_TYPE(D_I4|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U4|Q_ASYM) + IO_TYPE(D_I4|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I4|Q_DFP) + END_IO_TYPE_DECL(CONV2D) ret = VALIDATE_OP_IO_TYPES(CONV2D, self, inputs, self->input.num, outputs, self->output.num); if(!ret) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm.c index af34411..327b949 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm.c @@ -56,7 +56,7 @@ static vsi_nn_internal_tensor_t * reshape_cell_out output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); /* reshape cell_out [w,h,c,n] to [w,h,c,1,n] */ - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE, 0, 0 ); + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); reshape_cell_size = vsi_nn_internal_new_node_param(curr, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); reshape_cell_size[0] = cell_out->attr.size[0]; @@ -64,8 +64,8 @@ static vsi_nn_internal_tensor_t * reshape_cell_out reshape_cell_size[2] = cell_out->attr.size[2]; reshape_cell_size[3] = 1; reshape_cell_size[4] = cell_out->attr.size[3]; - 
curr->node->nn_param.reshape.size = reshape_cell_size; - curr->node->nn_param.reshape.dim_num = 5; + curr->node->nn_param.reshape2.size = reshape_cell_size; + curr->node->nn_param.reshape2.dim_num = 5; curr->inputs[0] = cell_out; curr->outputs[0] = output_tensor->t; @@ -90,15 +90,15 @@ static vsi_nn_internal_tensor_t * reshape_split_out output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); /* reshape [w,h,c,t,n] to [w,h,c,n] */ - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE, 0, 0 ); + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); reshape_split_size = vsi_nn_internal_new_node_param(curr, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); reshape_split_size[0] = split_out->attr.size[0]; reshape_split_size[1] = split_out->attr.size[1]; reshape_split_size[2] = split_out->attr.size[2]; reshape_split_size[3] = split_out->attr.size[4]; - curr->node->nn_param.reshape.size = reshape_split_size; - curr->node->nn_param.reshape.dim_num = 4; + curr->node->nn_param.reshape2.size = reshape_split_size; + curr->node->nn_param.reshape2.dim_num = 4; curr->inputs[0] = split_out; curr->outputs[0] = output_tensor->t; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv3d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv3d.c new file mode 100644 index 0000000..35bf275 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv3d.c @@ -0,0 +1,396 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_constraint_check.h" + +typedef struct _conv3d_local_data_t { + int32_t placeholder; +} conv3d_local_data_t; + +/* + Declare number of input and output. 
+ */ +#define _INPUT_NUM (3) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; + param = vsi_nn_kernel_param_create(); + +#define MAP_PARAM(type_name, value) {\ + vsi_nn_kernel_param_add_int32( param, type_name, value); \ + } + + MAP_PARAM("stride_w",self->nn_param.conv3d.stride[0]); + MAP_PARAM("stride_h",self->nn_param.conv3d.stride[1]); + MAP_PARAM("stride_d",self->nn_param.conv3d.stride[2]); + + MAP_PARAM("pad_left",self->nn_param.conv3d.pad[0]); + MAP_PARAM("pad_right",self->nn_param.conv3d.pad[1]); + MAP_PARAM("pad_top",self->nn_param.conv3d.pad[2]); + MAP_PARAM("pad_bottom",self->nn_param.conv3d.pad[3]); + MAP_PARAM("pad_front",self->nn_param.conv3d.pad[4]); + MAP_PARAM("pad_end",self->nn_param.conv3d.pad[5]); + + MAP_PARAM("depth_multiplier", self->nn_param.conv3d.multiplier); + MAP_PARAM("overflow_policy",self->vx_param.overflow_policy); + MAP_PARAM("rounding_policy",self->vx_param.rounding_policy); + MAP_PARAM("down_scale_size_rounding",self->vx_param.down_scale_size_rounding); + + if ( self->nn_param.conv3d.dilation[0] * + self->nn_param.conv3d.dilation[1] * + self->nn_param.conv3d.dilation[2] > 1) + { + VSILOGE("conv3d could not support dilation > 1\n"); + return status; + }else + { + MAP_PARAM("dilation_w",self->nn_param.conv3d.dilation[0]); + MAP_PARAM("dilation_h",self->nn_param.conv3d.dilation[1]); + MAP_PARAM("dilation_d",self->nn_param.conv3d.dilation[2]); + } +#undef MAP_PARAM + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "conv3d", + inputs, 3, outputs, 1, param ); + if( self->n ) + { + status = VSI_SUCCESS; + } + + vsi_nn_kernel_param_release( ¶m ); + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = FALSE; + + /* Check fl and scale*/ + ret = vsi_nn_QuantCheck(inputs[0], inputs[1], inputs[2]); + + if(ret) { + /* check inputs outputs data type */ + BEGIN_IO_TYPE_DECL(CONV3D, 3, 1) + + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_F16) + + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_F16) + + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F16) + + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_ASYM, D_U8|Q_ASYM) + + IO_TYPE(D_BF16, D_BF16, D_F32, D_F32) + IO_TYPE(D_BF16, D_BF16, D_F32, D_BF16) + IO_TYPE(D_F16, D_F16, D_F32, D_F16) + IO_TYPE(D_F16, D_F16, D_F32, D_F32) + + IO_TYPE(D_F32, D_F32, D_F32, D_F32) + + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_I32|Q_SYM, D_I8|Q_SYM) + + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) + + /* IO_TYPE(INPUT, WEIGHT, NULL, OUTPUT) */ + IO_TYPE(D_F32, D_F32, D_NONE, D_F32) + + IO_TYPE(D_F16, D_F16, D_NONE, D_F16) + + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_F16) + + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_I16|Q_DFP) + + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_F16) + + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_U8|Q_ASYM) + + 
IO_TYPE(D_BF16, D_BF16, D_NONE, D_F32) + IO_TYPE(D_BF16, D_BF16, D_NONE, D_BF16) + + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM, D_NONE, D_I8|Q_SYM) + + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) + + /* HW 9.0 */ + IO_TYPE(D_F32, D_BF16, D_F32, D_BF16) + IO_TYPE(D_F32, D_BF16, D_NONE, D_BF16) + /* HW 9.0.1 */ + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) + + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_F32) + + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) + + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_F16) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_F32) + 
IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_F32) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F32) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_F32) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_F32) + + END_IO_TYPE_DECL(CONV3D) + ret = VALIDATE_OP_IO_TYPES(CONV3D, self, inputs, self->input.num, outputs, self->output.num); + if(!ret) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + /* check parameters */ + if(inputs[1]->attr.size[0] * inputs[1]->attr.size[1] > 6400) { + VSILOGE("Kernel size should <= 6400."); + return FALSE; + } + } + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_conv3d_param *nn_param; + vsi_size_t i, pad[_cnt_of_array(self->nn_param.conv3d.pad)] = {0}; + for(i = 0; i < _cnt_of_array(self->nn_param.conv3d.pad); i++) + { + pad[i] = self->nn_param.conv3d.pad[i]; + } +#ifdef VX_CONVERT_POLICY_WRAP_ENABLE + if ( vsi_nn_compareVersion(self->graph, 1, 1, 21) == -1 ) + { + self->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE; + } +#endif + + nn_param = &self->nn_param.conv3d; + + vsi_nn_compute_padding_3d( + inputs[0]->attr.size, + inputs[1]->attr.size, + (uint32_t *)self->nn_param.conv3d.stride, + (uint32_t *)self->nn_param.conv3d.dilation, + self->nn_param.conv3d.pad_type, + pad + ); + for(i = 0; i < _cnt_of_array(self->nn_param.conv3d.pad); i++) + { + self->nn_param.conv3d.pad[i] = (uint32_t)pad[i]; + } + + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.size[0] = vsi_nn_ComputeFilterSize + ( + inputs[0]->attr.size[0], + inputs[1]->attr.size[0], + (vx_uint32 *)&nn_param->pad[0], + nn_param->stride[0], + nn_param->dilation[0], + VSI_NN_ROUND_FLOOR + ); + outputs[0]->attr.size[1] = vsi_nn_ComputeFilterSize + ( + inputs[0]->attr.size[1], + inputs[1]->attr.size[1], + (vx_uint32 *)&nn_param->pad[2], + nn_param->stride[1], + nn_param->dilation[1], + VSI_NN_ROUND_FLOOR + ); + outputs[0]->attr.size[2] = vsi_nn_ComputeFilterSize + ( + inputs[0]->attr.size[2], + inputs[1]->attr.size[2], + (vx_uint32 *)&nn_param->pad[4], + nn_param->stride[2], + nn_param->dilation[2], + VSI_NN_ROUND_FLOOR + ); + if(self->nn_param.conv3d.weights > 0) + { + outputs[0]->attr.size[3] = self->nn_param.conv3d.weights; + } + else if(self->nn_param.conv3d.multiplier > 0) + { + outputs[0]->attr.size[3] = inputs[0]->attr.size[3] * self->nn_param.conv3d.multiplier; + } + else + { + outputs[0]->attr.size[3] = inputs[1]->attr.size[4]; + } + outputs[0]->attr.size[4] = inputs[0]->attr.size[4]; + 
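/* Illustrative sketch (not part of this patch; names are hypothetical):
 * vsi_nn_ComputeFilterSize() is not shown in this diff, but with
 * VSI_NN_ROUND_FLOOR a convolution output dimension is conventionally derived
 * as below.  pad[0]/pad[1] stand for the two pads on one axis, which is why
 * op_setup() passes &nn_param->pad[0], &nn_param->pad[2] and &nn_param->pad[4]
 * for W, H and D respectively.
 */
static uint32_t conv_out_dim_floor_sketch
    (
    uint32_t in_size, uint32_t kernel_size, const uint32_t pad[2],
    uint32_t stride, uint32_t dilation
    )
{
    uint32_t effective_kernel = dilation * (kernel_size - 1) + 1; /* dilated footprint */
    /* integer division rounds toward zero, i.e. floor for non-negative operands */
    return (in_size + pad[0] + pad[1] - effective_kernel) / stride + 1;
}
/* Example: in=16, kernel=3, pad={1,1}, stride=2, dilation=1 -> (16+2-3)/2+1 = 8. */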
outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + } + return TRUE; +} /* op_setup() */ + +static vsi_status op_init + ( + vsi_nn_node_t* self + ) +{ + /* TODO + //self->nn_param.conv3d.local = \ + // (conv3d_local_data_t*)malloc(sizeof(conv3d_local_data_t)); + */ + + return VSI_SUCCESS; +} /* op_init() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t* self + ) +{ + vsi_status status = VSI_SUCCESS; + + status = vsi_nn_op_common_deinit(self); + + /* TODO + //vsi_nn_safe_free(self->nn_param.conv3d.local); + */ + + return status; +} /* op_deinit() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ CONV3D, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS \ No newline at end of file diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv_relu.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv_relu.c index 6a6647f..85d35df 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv_relu.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv_relu.c @@ -187,24 +187,53 @@ static vsi_status op_optimize p_opt = &opt; } - inputs[1]->wb = vxCreateWeightsBiasesParameterFromTensors2( - VX_CONVOLUTIONAL_NETWORK_CONVOLUTION_LAYER, - 4, #ifdef VSI_40BIT_VA_SUPPORT - inputs[0]->attr.size, - outputs[0]->attr.size, - outputs[0]->attr.size, + { + vx_size size_input0[VSI_NN_MAX_DIM_NUM]; + vx_size size_output0[VSI_NN_MAX_DIM_NUM]; + size_t i = 0; + for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + size_input0[i] = (vx_size)inputs[0]->attr.size[i]; + size_output0[i] = (vx_size)outputs[0]->attr.size[i]; + } + inputs[1]->wb = vxCreateWeightsBiasesParameterFromTensors2( + VX_CONVOLUTIONAL_NETWORK_CONVOLUTION_LAYER, + 4, + size_input0, + size_output0, + size_output0, + outputs[0]->attr.dtype.vx_type, + (vx_nn_convolution_relu_pooling_params_t *)&p, + sizeof(p), + p_opt, + inputs[1]->t, inputs[2]->t + ); + } #else - (vx_uint32*)inputs[0]->attr.size, - (vx_uint32*)outputs[0]->attr.size, - (vx_uint32*)outputs[0]->attr.size, + { + uint32_t size_u32_input0[VSI_NN_MAX_DIM_NUM]; + uint32_t size_u32_output0[VSI_NN_MAX_DIM_NUM]; + size_t i = 0; + for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + size_u32_input0[i] = (uint32_t)inputs[0]->attr.size[i]; + size_u32_output0[i] = (uint32_t)outputs[0]->attr.size[i]; + } + inputs[1]->wb = vxCreateWeightsBiasesParameterFromTensors2( + VX_CONVOLUTIONAL_NETWORK_CONVOLUTION_LAYER, + 4, + size_u32_input0, + size_u32_output0, + size_u32_output0, + outputs[0]->attr.dtype.vx_type, + (vx_nn_convolution_relu_pooling_params_t *)&p, + sizeof(p), + p_opt, + inputs[1]->t, inputs[2]->t + ); + } #endif - outputs[0]->attr.dtype.vx_type, - (vx_nn_convolution_relu_pooling_params_t *)&p, - sizeof(p), - p_opt, - inputs[1]->t, inputs[2]->t - ); vsi_nn_DeinitConvReluPoolParameter( &p ); } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv_relu_pool.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv_relu_pool.c index f49ef3b..48b43b5 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv_relu_pool.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv_relu_pool.c @@ -215,18 +215,30 @@ static vsi_status op_optimize } #ifdef VSI_40BIT_VA_SUPPORT - inputs[1]->wb = vxCreateWeightsBiasesParameterFromTensors2( - VX_CONVOLUTIONAL_NETWORK_CONVOLUTION_LAYER, - 4, - inputs[0]->attr.size, - pconv_out->attr.size, - outputs[0]->attr.size, - outputs[0]->attr.dtype.vx_type, - (vx_nn_convolution_relu_pooling_params_t *)&p, - sizeof(p), - 
p_opt, - inputs[1]->t, inputs[2]->t - ); + { + vx_size size_input0[VSI_NN_MAX_DIM_NUM]; + vx_size size_pconv_out[VSI_NN_MAX_DIM_NUM]; + vx_size size_output0[VSI_NN_MAX_DIM_NUM]; + size_t i = 0; + for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + size_input0[i] = (vx_size)inputs[0]->attr.size[i]; + size_pconv_out[i] = (vx_size)pconv_out->attr.size[i]; + size_output0[i] = (vx_size)outputs[0]->attr.size[i]; + } + inputs[1]->wb = vxCreateWeightsBiasesParameterFromTensors2( + VX_CONVOLUTIONAL_NETWORK_CONVOLUTION_LAYER, + 4, + size_input0, + size_pconv_out, + size_output0, + outputs[0]->attr.dtype.vx_type, + (vx_nn_convolution_relu_pooling_params_t *)&p, + sizeof(p), + p_opt, + inputs[1]->t, inputs[2]->t + ); + } #else { uint32_t size_u32_input0[VSI_NN_MAX_DIM_NUM]; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c index d91a7e6..fec61bb 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c @@ -104,16 +104,11 @@ static vsi_status op_optimize { if(NULL == inputs[0]->t && NULL != outputs[0]->t) { -#ifdef VSI_40BIT_VA_SUPPORT - inputs[0]->t = vxReshapeTensor(outputs[0]->t, - inputs[0]->attr.size, inputs[0]->attr.dim_num); -#else - inputs[0]->t = vxReshapeTensor(outputs[0]->t, - (vx_int32*)inputs[0]->attr.size, inputs[0]->attr.dim_num); -#endif + inputs[0]->t = vsi_nn_safe_reshape_tensor(outputs[0]->t, + (void*)inputs[0]->attr.size, (vsi_size_t)inputs[0]->attr.dim_num, sizeof(inputs[0]->attr.size[0])); if( inputs[0]->t == NULL ) { - VSILOGE("Call vxReshapeTensor fail"); + VSILOGE("Call vsi_nn_safe_reshape_tensor fail"); return VSI_FAILURE; } self->nn_param.dataconvert.lcl_data->use_reshape = TRUE; @@ -123,16 +118,11 @@ static vsi_status op_optimize { if(NULL == outputs[0]->t && NULL != inputs[0]->t) { -#ifdef VSI_40BIT_VA_SUPPORT - outputs[0]->t = vxReshapeTensor(inputs[0]->t, - outputs[0]->attr.size, outputs[0]->attr.dim_num); -#else - outputs[0]->t = vxReshapeTensor(inputs[0]->t, - (vx_int32*)outputs[0]->attr.size, outputs[0]->attr.dim_num); -#endif + outputs[0]->t = vsi_nn_safe_reshape_tensor(inputs[0]->t, + (void*)outputs[0]->attr.size, (vsi_size_t)outputs[0]->attr.dim_num, sizeof(outputs[0]->attr.size[0])); if( outputs[0]->t == NULL ) { - VSILOGE("Call vxReshapeTensor fail"); + VSILOGE("Call vsi_nn_safe_reshape_tensor fail"); return VSI_FAILURE; } self->nn_param.dataconvert.lcl_data->use_reshape = TRUE; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c index a82f521..2373688 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c @@ -53,6 +53,7 @@ static vsi_status _eltwise_op_compute vsi_bool ret = TRUE; vx_bool doShapeOptimized = TRUE; vsi_nn_kernel_param_t * param = NULL; + vsi_nn_context_t ctx = NULL; if( NULL == self ) { @@ -60,9 +61,13 @@ static vsi_status _eltwise_op_compute } status = VSI_FAILURE; + ctx = self->graph->ctx; + if ( strcmp(kernel_name, "sub") == 0 || strcmp(kernel_name, "add") == 0 - || strcmp(kernel_name, "mul") == 0 ) + || strcmp(kernel_name, "mul") == 0 + || (strcmp(kernel_name, "maximum") == 0 && ctx->config.support_stream_processor) + || (strcmp(kernel_name, "minimum") == 0 && ctx->config.support_stream_processor)) { doShapeOptimized = FALSE; @@ -184,7 +189,6 @@ vsi_bool vsi_nn_op_eltwise_setup return ret; } /* vsi_nn_op_eltwise_setup() */ - static vsi_bool op_check_minimum ( vsi_nn_node_t * self, @@ -322,7 +326,6 @@ 
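/* Illustrative sketch (assumption, not part of this patch): the body of
 * vsi_nn_safe_reshape_tensor() is not included in this diff.  The call sites
 * above suggest a wrapper that hides the old "#ifdef VSI_40BIT_VA_SUPPORT"
 * branches by converting the caller's size array, whose element width is
 * passed explicitly, to the width the linked OpenVX header expects before
 * calling vxReshapeTensor once.  The helper name below is hypothetical.
 */
static vx_tensor reshape_tensor_sketch
    (
    vx_tensor tensor, const void * sizes, vsi_size_t dim_num, vsi_size_t elem_size
    )
{
#ifdef VSI_40BIT_VA_SUPPORT
    vx_size dims[VSI_NN_MAX_DIM_NUM] = { 0 };
#else
    vx_int32 dims[VSI_NN_MAX_DIM_NUM] = { 0 };
#endif
    vsi_size_t i = 0;
    for( i = 0; i < dim_num; i++ )
    {
        if( elem_size == sizeof(uint32_t) )
        {
            dims[i] = ((const uint32_t *)sizes)[i];   /* 32-bit caller array */
        }
        else
        {
            dims[i] = ((const vsi_size_t *)sizes)[i]; /* vsi_size_t caller array */
        }
    }
    return vxReshapeTensor( tensor, dims, (vx_uint32)dim_num );
}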
static vsi_bool op_check_pow return TRUE; } /* op_check() */ - static vsi_bool op_check_add ( vsi_nn_node_t * self, @@ -457,9 +460,6 @@ static vsi_bool op_check_sub return ret; } /* op_check() */ - - - static vsi_bool op_check_div ( vsi_nn_node_t * self, @@ -518,7 +518,6 @@ static vsi_bool op_check_div return TRUE; } /* op_check() */ - static vsi_bool op_check_mul ( vsi_nn_node_t * self, @@ -657,7 +656,6 @@ DEF_ELEMENT_WISE_OP( DIVIDE, div ); DEF_ELEMENT_WISE_OP( MULTIPLY, mul ); DEF_ELEMENT_WISE_OP( POW, pow ); - #undef DEF_ELEMENT_WISE_OP #ifdef __cplusplus diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c index b2a162f..d8ae9d9 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c @@ -46,6 +46,7 @@ static vsi_status _eltwise_unary_op_compute { vsi_status status = VSI_FAILURE; float alpha = 0; + float beta = 0; vsi_nn_kernel_param_t * param = NULL; if( NULL == self ) @@ -54,8 +55,17 @@ static vsi_status _eltwise_unary_op_compute } param = vsi_nn_kernel_param_create(); - alpha = self->nn_param.elu.alpha; + if (strcmp(kernel_name, "elu") == 0) + { + alpha = self->nn_param.elu.alpha; + } + else + { + alpha = self->nn_param.hard_sigmoid.alpha; + beta = self->nn_param.hard_sigmoid.beta; + } vsi_nn_kernel_param_add_float32( param, "alpha", alpha ); + vsi_nn_kernel_param_add_float32( param, "beta", beta ); // TODO: This optimzie is a hack for gpu path, // it should be moved to gpu kernel setup. @@ -158,7 +168,8 @@ static vsi_bool op_check IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_F16) END_IO_TYPE_DECL(ELTWISE_UNARY) - if(!VALIDATE_OP_IO_TYPES(ELTWISE_UNARY, self, inputs, self->input.num, outputs, self->output.num)) { + if (!VALIDATE_OP_IO_TYPES(ELTWISE_UNARY, self, inputs, self->input.num, outputs, self->output.num)) + { char* desc = generate_op_io_types_desc(inputs, self->input.num, outputs, self->output.num); VSILOGE("Inputs/Outputs data type not support: %s", desc); @@ -169,15 +180,22 @@ static vsi_bool op_check return TRUE; } /* op_check() */ -static vsi_status op_init +static vsi_status _eltwise_unary_op_init ( + const char * kernel_name, vsi_nn_node_t * self ) { - if (vsi_nn_compareVersion(self->graph, 1, 1, 29) == -1) + if (vsi_nn_compareVersion(self->graph, 1, 1, 29) == -1 && + strcmp(kernel_name, "elu") == 0) { self->nn_param.elu.alpha = 1; } + else if (strcmp(kernel_name, "hard_sigmoid") == 0) + { + self->nn_param.hard_sigmoid.alpha = 0.2f; + self->nn_param.hard_sigmoid.beta = 0.5f; + } return VSI_SUCCESS; } /* op_init() */ @@ -196,7 +214,15 @@ extern "C" { { \ return _eltwise_unary_op_compute( ""#kernel_name, self, inputs, outputs ); \ } \ -DEF_OP_REG(name, op_init, op_compute_##kernel_name, vsi_nn_op_common_deinit, op_check, op_setup, NULL, 1, 1) + static vsi_status op_init_##kernel_name \ + ( \ + vsi_nn_node_t * self \ + ) \ + { \ + return _eltwise_unary_op_init( ""#kernel_name, self ); \ + } \ +DEF_OP_REG(name, op_init_##kernel_name, op_compute_##kernel_name, \ + vsi_nn_op_common_deinit, op_check, op_setup, NULL, 1, 1) DEF_ELEMENT_WISE_UNARY_OP( SIN, sin ); DEF_ELEMENT_WISE_UNARY_OP( EXP, exp ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_embedding_lookup.c b/src/tim/vx/internal/src/ops/vsi_nn_op_embedding_lookup.c index e55e456..b3ae7cf 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_embedding_lookup.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_embedding_lookup.c @@ -32,6 +32,7 @@ #include "vsi_nn_prv.h" 
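/* Illustrative note on the eltwise_unary changes above (sketch, not part of
 * this patch): op_init now seeds hard_sigmoid with alpha = 0.2 and beta = 0.5,
 * and op_compute forwards both as the "alpha"/"beta" kernel parameters.  Those
 * parameters usually describe the clamped affine activation below; the helper
 * name is hypothetical.
 */
static float hard_sigmoid_ref(float x, float alpha, float beta)
{
    float y = alpha * x + beta;   /* affine part */
    if (y < 0.0f) y = 0.0f;       /* clamp into [0, 1] */
    if (y > 1.0f) y = 1.0f;
    return y;
}
/* With the new defaults: hard_sigmoid_ref(0.0f, 0.2f, 0.5f) == 0.5f and
 * hard_sigmoid_ref(3.0f, 0.2f, 0.5f) == 1.0f. */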
#include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_constraint_check.h" @@ -52,11 +53,7 @@ static void _reshape_tensor attr.size[2] = input->attr.size[1]; attr.dim_num = 3; } -#ifdef VSI_40BIT_VA_SUPPORT - *output = vxReshapeTensor( input->t, attr.size, attr.dim_num ); -#else - *output = vxReshapeTensor( input->t, (vx_int32*)attr.size, attr.dim_num ); -#endif + *output = vsi_nn_safe_reshape_tensor( input->t, (void*)attr.size, (vsi_size_t)attr.dim_num , sizeof(attr.size[0])); } static vsi_status op_compute diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_floordiv.c b/src/tim/vx/internal/src/ops/vsi_nn_op_floordiv.c index e17292b..325e9c1 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_floordiv.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_floordiv.c @@ -98,23 +98,28 @@ static vsi_bool op_check ) { BEGIN_IO_TYPE_DECL(FLOORDIV, 2, 1) - IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) - IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) - IO_TYPE(D_F16, D_F16, D_F16) - IO_TYPE(D_F32, D_F32, D_F32) - IO_TYPE(D_I32, D_I32, D_I32) - IO_TYPE(D_I32, D_I32, D_U8|Q_ASYM) - IO_TYPE(D_BF16, D_BF16, D_BF16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_F16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16, D_F16) + IO_TYPE(D_F32, D_F32, D_F32) + IO_TYPE(D_I32, D_I32, D_I32) + IO_TYPE(D_I32, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_BF16, D_BF16, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_I32, D_I16|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I16, D_I32, D_I32) END_IO_TYPE_DECL(FLOORDIV) - if(!VALIDATE_OP_IO_TYPES(FLOORDIV, self, inputs, self->input.num, outputs, self->output.num)) { + if (!VALIDATE_OP_IO_TYPES(FLOORDIV, self, inputs, self->input.num, outputs, self->output.num)) + { char* desc = generate_op_io_types_desc(inputs, self->input.num, outputs, self->output.num); VSILOGE("Inputs/Outputs data type not support: %s", desc); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect.c b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect.c index ab84c5a..812c7df 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect.c @@ -73,11 +73,7 @@ static vsi_status op_compute input_size[0] = num_fc; input_size[1] = num_no_fc; dims= 2; -#ifdef VSI_40BIT_VA_SUPPORT - input = vxReshapeTensor(inputs[0]->t, input_size, dims); -#else - input = vxReshapeTensor(inputs[0]->t, (vx_int32*)input_size, dims); -#endif + input = vsi_nn_safe_reshape_tensor(inputs[0]->t, (void*)input_size, (vsi_size_t)dims, sizeof(input_size[0])); weight = inputs[1]->t; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect2.c b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect2.c index 0cdd29d..1bbb5ee 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect2.c +++ 
b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect2.c @@ -95,62 +95,25 @@ static vsi_status op_compute input_size[0] = num_fc; input_size[1] = num_no_fc; dims= 2; -#ifdef VSI_40BIT_VA_SUPPORT - input = vxReshapeTensor(inputs[0]->t, input_size, dims); -#else - { - int32_t input_size_32bit[VSI_NN_MAX_DIM_NUM] = {0}; - for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) - { - input_size_32bit[i] = (int32_t)input_size[i]; - } - input = vxReshapeTensor(inputs[0]->t, input_size_32bit, (uint32_t)dims); - } -#endif + input = vsi_nn_safe_reshape_tensor(inputs[0]->t, (void*)input_size, (vsi_size_t)dims, sizeof(input_size[0])); weights_size[0] = num_fc; weights_size[1] = ofm; dims= 2; -#ifdef VSI_40BIT_VA_SUPPORT - weight = vxReshapeTensor(inputs[1]->t, weights_size, dims); -#else - { - int32_t weight_size_32bit[VSI_NN_MAX_DIM_NUM] = {0}; - for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) - { - weight_size_32bit[i] = (int32_t)weight_size_32bit[i]; - } - weight = vxReshapeTensor(inputs[1]->t, weight_size_32bit, (uint32_t)dims); - } -#endif + weight = vsi_nn_safe_reshape_tensor(inputs[1]->t, (void*)weights_size, (vsi_size_t)dims, sizeof(weights_size[0])); if( inputs[2] != NULL ) { bias_size[0] = ofm; bias_size[1] = 1; dims= 2; -#ifdef VSI_40BIT_VA_SUPPORT - bias = vxReshapeTensor(inputs[2]->t, bias_size, dims); -#else - { - int32_t bias_size_32bit[VSI_NN_MAX_DIM_NUM] = {0}; - for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) - { - bias_size_32bit[i] = (int32_t)bias_size[i]; - } - bias = vxReshapeTensor(inputs[2]->t, bias_size_32bit, (uint32_t)dims); - } -#endif + bias = vsi_nn_safe_reshape_tensor(inputs[2]->t, (void*)bias_size, (vsi_size_t)dims, sizeof(bias_size[0])); } output_size[0] = ofm; output_size[1] = num_no_fc; dims= 2; -#ifdef VSI_40BIT_VA_SUPPORT - output = vxReshapeTensor(outputs[0]->t, output_size, dims); -#else - output = vxReshapeTensor(outputs[0]->t, (vx_int32*)output_size, (uint32_t)dims); -#endif + output = vsi_nn_safe_reshape_tensor(outputs[0]->t, (void*)output_size, (vsi_size_t)dims, sizeof(output_size[0])); self->n = vxFullyConnectedLayer( self->graph->g, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect_relu.c b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect_relu.c index 8766867..cf19eeb 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect_relu.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect_relu.c @@ -267,23 +267,51 @@ static vsi_bool op_setup opt.num_of_output_dims = outputs[0]->attr.dim_num; p_opt = &opt; - inputs[1]->wb = vxCreateWeightsBiasesParameterFromTensors3( - VX_CONVOLUTIONAL_NETWORK_FULLYCONNECTED_LAYER, #ifdef VSI_40BIT_VA_SUPPORT - inputs[0]->attr.size, - outputs[0]->attr.size, - outputs[0]->attr.size, + { + vx_size size_input0[VSI_NN_MAX_DIM_NUM]; + vx_size size_output0[VSI_NN_MAX_DIM_NUM]; + size_t i = 0; + for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + size_input0[i] = (vx_size)inputs[0]->attr.size[i]; + size_output0[i] = (vx_size)outputs[0]->attr.size[i]; + } + inputs[1]->wb = vxCreateWeightsBiasesParameterFromTensors3( + VX_CONVOLUTIONAL_NETWORK_FULLYCONNECTED_LAYER, + size_input0, + size_output0, + size_output0, + &p, + sizeof(p), + (vx_weights_biases_parameter_optimizations_t *)p_opt, + sizeof(opt), + inputs[1]->t, inputs[2]->t + ); + } #else - (vx_uint32*)inputs[0]->attr.size, - (vx_uint32*)outputs[0]->attr.size, - (vx_uint32*)outputs[0]->attr.size, + { + uint32_t size_u32_input0[VSI_NN_MAX_DIM_NUM]; + uint32_t size_u32_output0[VSI_NN_MAX_DIM_NUM]; + size_t i = 0; + for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + size_u32_input0[i] = 
(uint32_t)inputs[0]->attr.size[i]; + size_u32_output0[i] = (uint32_t)outputs[0]->attr.size[i]; + } + inputs[1]->wb = vxCreateWeightsBiasesParameterFromTensors3( + VX_CONVOLUTIONAL_NETWORK_FULLYCONNECTED_LAYER, + size_u32_input0, + size_u32_output0, + size_u32_output0, + &p, + sizeof(p), + (vx_weights_biases_parameter_optimizations_t *)p_opt, + sizeof(opt), + inputs[1]->t, inputs[2]->t + ); + } #endif - &p, - sizeof(p), - (vx_weights_biases_parameter_optimizations_t *)p_opt, - sizeof(opt), - inputs[1]->t, inputs[2]->t - ); if( p.pad_const ) { vxReleaseScalar( &p.pad_const ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_gather_nd.c b/src/tim/vx/internal/src/ops/vsi_nn_op_gather_nd.c index 2776150..6cf086c 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_gather_nd.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_gather_nd.c @@ -54,17 +54,17 @@ static vsi_status op_compute vsi_size_t *input_size = inputs[0]->attr.size; vsi_size_t dims_num = inputs[0]->attr.dim_num; - if(inputs[1]->attr.dim_num > 1) + if (inputs[1]->attr.dim_num > 1) { coord_dim = inputs[1]->attr.size[0]; } - if( coord_dim > 3 ) + if (coord_dim > 4 || (coord_dim > 3 && input_size[dims_num - 1] != 1)) { CHECK_STATUS(status); return status; } - param =vsi_nn_kernel_param_create(); + param = vsi_nn_kernel_param_create(); for(i = 0; i < dims_num - coord_dim; ++i) { @@ -74,13 +74,13 @@ static vsi_status op_compute vsi_nn_kernel_param_add_int32( param, "block_size", (int32_t)block_size ); vsi_nn_kernel_param_add_int32( param, "coord_dim", (int32_t)coord_dim ); n = vsi_nn_kernel_selector( self->graph, "gather_nd", inputs, 2, outputs, 1, param ); - if( n != NULL ) + if ( n != NULL ) { self->n = (vx_node)n; status = VSI_SUCCESS; } - if(param != NULL) + if (param != NULL) { vsi_nn_kernel_param_release( ¶m ); } @@ -110,7 +110,7 @@ static vsi_bool op_check IO_TYPE(D_I16|Q_DFP, D_I32, D_I16|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_I32, D_F16) END_IO_TYPE_DECL(GATHER_ND) - if(!VALIDATE_OP_IO_TYPES(GATHER_ND, self, inputs, self->input.num, outputs, self->output.num)) { + if (!VALIDATE_OP_IO_TYPES(GATHER_ND, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, self->input.num, outputs, self->output.num); VSILOGE("Inputs/Outputs data type not support: %s", desc); @@ -131,10 +131,10 @@ static vsi_bool op_setup /* TODO: Add code to comput outputs' shape. 
*/ vsi_size_t i = 0; - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { vsi_size_t j = 0, coord_dim = 1; - if(inputs[1]->attr.dim_num > 1) + if (inputs[1]->attr.dim_num > 1) { coord_dim = inputs[1]->attr.size[0]; } @@ -147,7 +147,7 @@ static vsi_bool op_setup { outputs[0]->attr.size[j++] = inputs[1]->attr.size[i]; } - if(inputs[1]->attr.dim_num == 1) + if (inputs[1]->attr.dim_num == 1) { outputs[0]->attr.size[j++] = inputs[1]->attr.size[0]; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c index 61efd47..21e0a17 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c @@ -164,8 +164,8 @@ static vsi_status _op_optimize /* insert a reshape node before and after 3D group_norm */ - shape[0] = 1; - shape[1] = inputs[0]->attr.size[0]; + shape[0] = inputs[0]->attr.size[0]; + shape[1] = 1; shape[2] = inputs[0]->attr.size[1]; shape[3] = inputs[0]->attr.size[2]; dim = 4; @@ -203,17 +203,25 @@ static vsi_bool _op_check { BEGIN_IO_TYPE_DECL(GROUP_NORM, 3, 1) IO_TYPE(D_F16, D_F32, D_F16, D_F16) + IO_TYPE(D_F16, D_F32, D_F32, D_F16) IO_TYPE(D_F16, D_F32, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F32, D_U8|Q_ASYM) IO_TYPE(D_F32, D_F32, D_F16, D_F32) IO_TYPE(D_F32, D_F32, D_F32, D_F32) IO_TYPE(D_I32, D_F32, D_F16, D_I32) IO_TYPE(D_I32, D_F32, D_F16, D_F32) IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F32, D_F32, D_F16) IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F32, D_F32, D_I8|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F16) IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_U8|Q_ASYM) IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_F16) IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_I16|Q_DFP) END_IO_TYPE_DECL(GROUP_NORM) if (!VALIDATE_OP_IO_TYPES(GROUP_NORM, self, inputs, self->input.num, outputs, self->output.num)) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_gru.c b/src/tim/vx/internal/src/ops/vsi_nn_op_gru.c index b3aec6e..ad4c2a7 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_gru.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_gru.c @@ -337,6 +337,8 @@ static vsi_status op_deinit vsi_nn_node_t * self ) { + vsi_nn_internal_deinit_node_wksp( self ); + return VSI_SUCCESS; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell.c index a007884..18ae554 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell.c @@ -24,7 +24,6 @@ #include #include - #include "vsi_nn_types.h" #include "vsi_nn_platform.h" #include "vsi_nn_log.h" @@ -73,7 +72,15 @@ static vsi_nn_internal_tensor_t * _create_fc } attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; - attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + if (input->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 || + input->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16) + { + attr.dtype.vx_type = input->attr.dtype.vx_type; + } + else + { + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + } attr.dim_num = VSI_NN_DIM_AUTO; attr.vtl = TRUE; attr.is_const = FALSE; @@ -91,110 +98,6 @@ static vsi_nn_internal_tensor_t * _create_fc return fc_out; } /* () */ -/* - copmute the recurrent hstate gates - equations: - reset_after == True: - ht = FC(hstate, kernel_rh, bias_rh) - ht = rt 
* ht - reset_after == False: - ht = rt * hstate - ht = FC(ht, kernel_rh, bias_rh) -*/ -static vsi_nn_internal_tensor_t * _compute_ht - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t * input_rt, - vsi_nn_tensor_t * hstate, - vsi_nn_tensor_t * weight, - vsi_nn_tensor_t * bias - ) -{ - vsi_bool use_virtual_tensor = TRUE; - vsi_nn_grucell_param * p = &self->nn_param.grucell; - vsi_nn_internal_tensor_t * tensor1 = NULL, * tensor2 = NULL; - - if(p->reset_after == TRUE) - { - tensor1 = _create_fc( - self, - hstate, - weight, - bias - ); - tensor2 = vsi_nn_rnn_create_binary_operator( - self, - VSI_NN_OP_MULTIPLY, - input_rt, - tensor1->t, - &input_rt->attr.dtype, - use_virtual_tensor - ); - } - else - { - tensor1 = vsi_nn_rnn_create_binary_operator( - self, - VSI_NN_OP_MULTIPLY, - input_rt, - hstate, - &input_rt->attr.dtype, - use_virtual_tensor - ); - tensor2 = _create_fc( - self, - tensor1->t, - weight, - bias - ); - } - - return tensor2; -} /* _compute_ht() */ - -/* - compute the recurrent update gates or reset gates - equations: - xt = FC(hstate, kernel_xt, bias_xt) - xt = input_xt + xt - xt = recurrent_activation(xt) -*/ -static vsi_nn_internal_tensor_t * _compute_recurrent_gate - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t * input_xt, - vsi_nn_tensor_t * hstate, - vsi_nn_tensor_t * weight, - vsi_nn_tensor_t * bias - ) -{ - vsi_bool use_virtual_tensor = TRUE; - vsi_nn_grucell_param * p = &self->nn_param.grucell; - vsi_nn_internal_tensor_t * tensor_add = NULL, * tensor_act; - vsi_nn_internal_tensor_t * recurrent_fc_out = NULL; - - recurrent_fc_out = _create_fc(self, hstate, weight, bias); - - tensor_add = vsi_nn_rnn_create_binary_operator( - self, - VSI_NN_OP_ADD, - recurrent_fc_out->t, - input_xt, - &recurrent_fc_out->t->attr.dtype, - use_virtual_tensor - ); - - tensor_act = vsi_nn_rnn_create_activation( - self, - tensor_add->t, - p->recurrent_activation, - &tensor_add->t->attr.dtype, - use_virtual_tensor - ); - - return tensor_act; -} /* _compute_recurrent_gate */ - static vsi_bool setup_op_shapes ( vsi_nn_node_t * self, @@ -251,6 +154,8 @@ static vsi_status op_deinit vsi_nn_node_t * self ) { + vsi_nn_internal_deinit_node_wksp( self ); + return VSI_SUCCESS; } @@ -265,7 +170,8 @@ static vsi_status op_optimize return vsi_nn_internal_optimize_node( self, direction ); } -static vsi_bool op_setup +#if 1 +static vsi_bool op_setup_default ( vsi_nn_node_t * self, vsi_nn_tensor_t ** inputs, @@ -276,7 +182,9 @@ static vsi_bool op_setup vsi_nn_internal_node_t * curr = NULL; vsi_nn_grucell_param * p = &self->nn_param.grucell; vsi_nn_internal_tensor_t * input_fc_outputs[GRUCELL_GATE_CNT] = { NULL }; - vsi_nn_internal_tensor_t * zt = NULL, * rt = NULL, * ht = NULL; + vsi_nn_internal_tensor_t * hstate_fc_outputs[GRUCELL_GATE_CNT] = { NULL }; + vsi_nn_internal_tensor_t * h_times_r = NULL; + vsi_nn_tensor_attr_t attr; vsi_nn_internal_init_node_wksp( self ); @@ -294,42 +202,136 @@ static vsi_bool op_setup ); } - /* compute update gate and reset gate */ - zt = _compute_recurrent_gate( - self, - input_fc_outputs[GRUCELL_GATES_Z]->t, - inputs[GRUCELL_IN_H_STATE], - inputs[GRUCELL_IN_KERNEL_R2Z], - inputs[GRUCELL_IN_BIAS_R2Z] - ); - rt = _compute_recurrent_gate( - self, - input_fc_outputs[GRUCELL_GATES_R]->t, - inputs[GRUCELL_IN_H_STATE], - inputs[GRUCELL_IN_KERNEL_R2R], - inputs[GRUCELL_IN_BIAS_R2R] - ); + /* create hstate fc */ + for(i = 0; i < GRUCELL_GATE_CNT - 1; i++) + { + hstate_fc_outputs[i] = _create_fc( + self, + inputs[GRUCELL_IN_H_STATE], + inputs[GRUCELL_IN_KERNEL_R2Z + i], + inputs[GRUCELL_IN_BIAS_R2Z 
+ i] + ); + } - /* compute recurrent h with parameter 'reset_after' */ - ht = _compute_ht( + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + if (inputs[GRUCELL_IN_H_STATE]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 || + self->graph->ctx->config.support_stream_processor) + { + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; + } + else + { + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + } + attr.dim_num = VSI_NN_DIM_AUTO; + attr.vtl = TRUE; + attr.is_const = FALSE; + h_times_r = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_GRUCELL_H_TIMES_ACTIVATION_R, 3, 1 ); + curr->node->nn_param.grucell_h_times_activation_r.recurrent_activation = p->recurrent_activation; + curr->inputs[0] = inputs[GRUCELL_IN_H_STATE]; + curr->inputs[1] = input_fc_outputs[GRUCELL_GATES_R]->t; + curr->inputs[2] = hstate_fc_outputs[GRUCELL_GATES_R]->t; + curr->outputs[0] = h_times_r->t; + vsi_nn_internal_setup_node(self, curr); + + hstate_fc_outputs[GRUCELL_GATES_H] = _create_fc( self, - rt->t, - inputs[GRUCELL_IN_H_STATE], + h_times_r->t, inputs[GRUCELL_IN_KERNEL_R2H], inputs[GRUCELL_IN_BIAS_R2H] ); + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_GRUCELL_ACTIVATION_Z_H, 0, 0 ); + curr->node->nn_param.grucell_activation_z_h.activation = p->activation; + curr->node->nn_param.grucell_activation_z_h.recurrent_activation = p->recurrent_activation; + curr->inputs[GRUCELL_ACT_Z_H_HSTATE] = inputs[GRUCELL_IN_H_STATE]; + curr->inputs[GRUCELL_ACT_Z_H_I_FC_Z] = input_fc_outputs[GRUCELL_GATES_Z]->t; + curr->inputs[GRUCELL_ACT_Z_H_I_FC_H] = input_fc_outputs[GRUCELL_GATES_H]->t; + curr->inputs[GRUCELL_ACT_Z_H_H_FC_Z] = hstate_fc_outputs[GRUCELL_GATES_Z]->t; + curr->inputs[GRUCELL_ACT_Z_H_H_FC_H] = hstate_fc_outputs[GRUCELL_GATES_H]->t; + curr->outputs[GRUCELL_ACT_Z_H_OUT_OUTPUT] = outputs[GRUCELL_OUT_OUTPUT]; + curr->outputs[GRUCELL_ACT_Z_H_OUT_HSTATE] = outputs[GRUCELL_OUT_H_STATE]; + vsi_nn_internal_setup_node(self, curr); + + return TRUE; +} +#endif + +static vsi_bool op_setup_reset_after + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t i; + vsi_nn_internal_node_t * curr = NULL; + vsi_nn_grucell_param * p = &self->nn_param.grucell; + vsi_nn_internal_tensor_t * input_fc_outputs[GRUCELL_GATE_CNT] = { NULL }; + vsi_nn_internal_tensor_t * hstate_fc_outputs[GRUCELL_GATE_CNT] = { NULL }; + + vsi_nn_internal_init_node_wksp( self ); + + /* compute output tensor's shapes */ + setup_op_shapes(self, inputs, outputs); + + /* create input fc */ + for(i = 0; i < GRUCELL_GATE_CNT; i++) + { + input_fc_outputs[i] = _create_fc( + self, + inputs[GRUCELL_IN_INPUT], + inputs[GRUCELL_IN_KERNEL_I2Z + i], + inputs[GRUCELL_IN_BIAS_I2Z + i] + ); + } + + /* create hstate fc */ + for(i = 0; i < GRUCELL_GATE_CNT; i++) + { + hstate_fc_outputs[i] = _create_fc( + self, + inputs[GRUCELL_IN_H_STATE], + inputs[GRUCELL_IN_KERNEL_R2Z + i], + inputs[GRUCELL_IN_BIAS_R2Z + i] + ); + } + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_GRUCELL_ACTIVATION, 0, 0 ); curr->node->nn_param.grucell_activation.activation = p->activation; - curr->inputs[GRUCELL_ACT_IN_H_STATE] = inputs[GRUCELL_IN_H_STATE]; - curr->inputs[GRUCELL_ACT_IN_INPUT_FC_H] = input_fc_outputs[GRUCELL_GATES_H]->t; - curr->inputs[GRUCELL_ACT_IN_H_T] = ht->t; - curr->inputs[GRUCELL_ACT_IN_Z_T] = zt->t; + curr->node->nn_param.grucell_activation.recurrent_activation = p->recurrent_activation; + curr->inputs[GRUCELL_ACT_H_STATE] = inputs[GRUCELL_IN_H_STATE]; + 
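/* Illustrative sketch (not part of this patch; names are hypothetical): the
 * fused GRUCELL_ACTIVATION / GRUCELL_ACTIVATION_Z_H kernels wired here replace
 * the per-gate subgraph that the removed helpers (_compute_recurrent_gate,
 * _compute_ht and the old grucell_activation setup) used to build.  Per
 * element, with i_* the input FC results and h_* the hstate FC results, that
 * removed code computed:
 *
 *   z   = recurrent_activation(i_z + h_z)
 *   r   = recurrent_activation(i_r + h_r)
 *   hh  = activation(i_h + r * h_h)            (reset_after == TRUE)
 *   hh  = activation(i_h + FC(r * hstate))     (reset_after == FALSE)
 *   out = z * (hstate - hh) + hh               (also copied to the new hstate)
 *
 * A scalar reference of the reset_after form:
 */
static float grucell_reset_after_ref
    (
    float hstate, float i_z, float i_r, float i_h,
    float h_z, float h_r, float h_h,
    float (*recurrent_activation)(float), float (*activation)(float)
    )
{
    float z  = recurrent_activation(i_z + h_z);   /* update gate */
    float r  = recurrent_activation(i_r + h_r);   /* reset gate  */
    float hh = activation(i_h + r * h_h);         /* candidate state */
    return z * (hstate - hh) + hh;                /* new output / hstate */
}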
curr->inputs[GRUCELL_ACT_I_FC_Z] = input_fc_outputs[GRUCELL_GATES_Z]->t; + curr->inputs[GRUCELL_ACT_I_FC_R] = input_fc_outputs[GRUCELL_GATES_R]->t; + curr->inputs[GRUCELL_ACT_I_FC_H] = input_fc_outputs[GRUCELL_GATES_H]->t; + curr->inputs[GRUCELL_ACT_H_FC_Z] = hstate_fc_outputs[GRUCELL_GATES_Z]->t; + curr->inputs[GRUCELL_ACT_H_FC_R] = hstate_fc_outputs[GRUCELL_GATES_R]->t; + curr->inputs[GRUCELL_ACT_H_FC_H] = hstate_fc_outputs[GRUCELL_GATES_H]->t; curr->outputs[GRUCELL_ACT_OUT_OUTPUT] = outputs[GRUCELL_OUT_OUTPUT]; curr->outputs[GRUCELL_ACT_OUT_H_STATE] = outputs[GRUCELL_OUT_H_STATE]; vsi_nn_internal_setup_node(self, curr); return TRUE; +} + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + if (self->nn_param.grucell.reset_after == TRUE) + { + return op_setup_reset_after(self, inputs, outputs); + } + else + { + return op_setup_default(self, inputs, outputs); + } } /* op_setup() */ #ifdef __cplusplus diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation.c index 2af4c6e..4fcd612 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation.c @@ -21,24 +21,18 @@ * DEALINGS IN THE SOFTWARE. * *****************************************************************************/ + + #include #include - #include "vsi_nn_types.h" -#include "vsi_nn_platform.h" #include "vsi_nn_log.h" -#include "vsi_nn_graph.h" #include "vsi_nn_node.h" #include "vsi_nn_prv.h" #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" -#include "vsi_nn_tensor_util.h" -#include "vsi_nn_internal_node.h" -#include "vsi_nn_rnn_helper.h" -#include "utils/vsi_nn_math.h" -#include "utils/vsi_nn_tensor_op.h" #include "utils/vsi_nn_util.h" -#include "ops/vsi_nn_op_grucell_activation.h" +#include "kernel/vsi_nn_kernel.h" typedef struct _vsi_nn_grucell_activation_local { void * placeholder; @@ -51,8 +45,28 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - return vsi_nn_internal_compute_node( self ); -} + vsi_nn_grucell_activation_param* p = &self->nn_param.grucell_activation; + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t* param; + + param = vsi_nn_kernel_param_create(); + vsi_nn_kernel_param_add_int32(param, "activation", p->activation); + vsi_nn_kernel_param_add_int32(param, "recurrent_activation", p->recurrent_activation); + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "grucell_reset_after_activation", + inputs, GRUCELL_ACT_IN_CNT, + outputs, GRUCELL_ACT_OUT_CNT, + param ); + + if( self->n ) + { + status = VSI_SUCCESS; + } + + vsi_nn_kernel_param_release( ¶m ); + + return status; +} /* op_compute() */ static vsi_bool op_check ( @@ -61,8 +75,9 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + /*TODO: Check tensor shapes. 
*/ return TRUE; -} +} /* op_check() */ static vsi_bool op_setup ( @@ -71,110 +86,43 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { - vsi_bool use_virtual_tensor= TRUE; - vsi_nn_grucell_activation_param * p = &self->nn_param.grucell_activation; - vsi_nn_internal_tensor_t * tmp_sub = NULL, * tmp_add = NULL, * tmp_mul = NULL; - vsi_nn_internal_tensor_t * tmp_act = NULL; - vsi_nn_internal_node_t * curr = NULL; - - vsi_nn_internal_init_node_wksp( self ); - - if(VSI_NN_DIM_AUTO == outputs[GRUCELL_ACT_OUT_OUTPUT]->attr.dim_num) + if (VSI_NN_DIM_AUTO == outputs[GRUCELL_ACT_OUT_OUTPUT]->attr.dim_num) { - outputs[GRUCELL_ACT_OUT_OUTPUT]->attr.dim_num = 2; - outputs[GRUCELL_ACT_OUT_OUTPUT]->attr.size[0] = inputs[GRUCELL_ACT_IN_H_STATE]->attr.size[0]; - outputs[GRUCELL_ACT_OUT_OUTPUT]->attr.size[1] = inputs[GRUCELL_ACT_IN_H_STATE]->attr.size[1]; + outputs[GRUCELL_ACT_OUT_OUTPUT]->attr.dim_num = \ + inputs[GRUCELL_ACT_H_STATE]->attr.dim_num; + + memcpy( outputs[GRUCELL_ACT_OUT_OUTPUT]->attr.size, + inputs[GRUCELL_ACT_H_STATE]->attr.size, + inputs[GRUCELL_ACT_H_STATE]->attr.dim_num * sizeof(vsi_size_t) ); } - /* - hht = activation(fc_h + ht) - */ - tmp_add = vsi_nn_rnn_create_binary_operator( - self, - VSI_NN_OP_ADD, - inputs[GRUCELL_ACT_IN_INPUT_FC_H], - inputs[GRUCELL_ACT_IN_H_T], - &inputs[GRUCELL_ACT_IN_INPUT_FC_H]->attr.dtype, - use_virtual_tensor - ); - tmp_act = vsi_nn_rnn_create_activation( - self, - tmp_add->t, - p->activation, - &tmp_add->t->attr.dtype, - use_virtual_tensor - ); + if (VSI_NN_DIM_AUTO == outputs[GRUCELL_ACT_OUT_H_STATE]->attr.dim_num) + { + outputs[GRUCELL_ACT_OUT_H_STATE]->attr.dim_num = \ + inputs[GRUCELL_ACT_H_STATE]->attr.dim_num; - /* - new_h = zt * (hstate - hht) + hht - */ - tmp_sub = vsi_nn_rnn_create_binary_operator( - self, - VSI_NN_OP_SUBTRACT, - inputs[GRUCELL_ACT_IN_H_STATE], - tmp_act->t, - &tmp_act->t->attr.dtype, - use_virtual_tensor - ); - tmp_mul = vsi_nn_rnn_create_binary_operator( - self, - VSI_NN_OP_MULTIPLY, - inputs[GRUCELL_ACT_IN_Z_T], - tmp_sub->t, - &tmp_sub->t->attr.dtype, - use_virtual_tensor - ); - - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_ADD, 0, 0 ); - curr->inputs[0] = tmp_mul->t; - curr->inputs[1] = tmp_act->t; - curr->outputs[0] = outputs[GRUCELL_ACT_OUT_OUTPUT]; - vsi_nn_internal_setup_node(self, curr); - - /* copy outputs to h_state */ - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); - curr->inputs[0] = outputs[GRUCELL_ACT_OUT_OUTPUT]; - curr->outputs[0] = outputs[GRUCELL_ACT_OUT_H_STATE]; - vsi_nn_internal_setup_node(self, curr); + memcpy( outputs[GRUCELL_ACT_OUT_H_STATE]->attr.size, + inputs[GRUCELL_ACT_H_STATE]->attr.size, + inputs[GRUCELL_ACT_H_STATE]->attr.dim_num * sizeof(vsi_size_t) ); + } return TRUE; -} +} /* op_setup() */ -static vsi_status op_deinit - ( - vsi_nn_node_t * self - ) -{ - return VSI_SUCCESS; -} +__BEGIN_DECLS -static vsi_status op_optimize - ( - vsi_nn_node_t * self, - vsi_nn_tensor_t ** inputs, - vsi_nn_tensor_t ** outputs, - vsi_nn_opt_direction_e direction - ) -{ - return vsi_nn_internal_optimize_node( self, direction ); -} - -#ifdef __cplusplus -extern "C" { -#endif /* Registrar */ DEF_OP_REG ( /* op_name */ GRUCELL_ACTIVATION, /* init */ NULL, /* compute */ op_compute, - /* deinit */ op_deinit, + /* deinit */ vsi_nn_op_common_deinit, /* check */ op_check, /* setup */ op_setup, - /* optimize */ op_optimize, + /* optimize */ NULL, /* input_num */ GRUCELL_ACT_IN_CNT, /* output_num */ GRUCELL_ACT_OUT_CNT ); -#ifdef __cplusplus -} -#endif + +__END_DECLS diff --git 
a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_z_h.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_z_h.c new file mode 100644 index 0000000..46eff0d --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_z_h.c @@ -0,0 +1,129 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +typedef struct _grucell_activation_z_h_local_data_t { + int32_t placeholder; +} grucell_activation_z_h_local_data_t; + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_grucell_activation_param* p = &self->nn_param.grucell_activation; + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t* param; + + param = vsi_nn_kernel_param_create(); + vsi_nn_kernel_param_add_int32(param, "activation", p->activation); + vsi_nn_kernel_param_add_int32(param, "recurrent_activation", p->recurrent_activation); + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "grucell_activation_z_h", + inputs, GRUCELL_ACT_Z_H_IN_CNT, + outputs, GRUCELL_ACT_Z_H_OUT_CNT, + param ); + + if( self->n ) + { + status = VSI_SUCCESS; + } + + vsi_nn_kernel_param_release( ¶m ); + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check tensor shapes. 
*/ + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + if (VSI_NN_DIM_AUTO == outputs[GRUCELL_ACT_Z_H_OUT_OUTPUT]->attr.dim_num) + { + outputs[GRUCELL_ACT_Z_H_OUT_OUTPUT]->attr.dim_num = \ + inputs[GRUCELL_ACT_Z_H_HSTATE]->attr.dim_num; + + memcpy( outputs[GRUCELL_ACT_Z_H_OUT_OUTPUT]->attr.size, + inputs[GRUCELL_ACT_Z_H_HSTATE]->attr.size, + inputs[GRUCELL_ACT_Z_H_HSTATE]->attr.dim_num * sizeof(vsi_size_t) ); + } + + if (VSI_NN_DIM_AUTO == outputs[GRUCELL_ACT_Z_H_OUT_HSTATE]->attr.dim_num) + { + outputs[GRUCELL_ACT_Z_H_OUT_HSTATE]->attr.dim_num = \ + inputs[GRUCELL_ACT_Z_H_HSTATE]->attr.dim_num; + + memcpy( outputs[GRUCELL_ACT_Z_H_OUT_HSTATE]->attr.size, + inputs[GRUCELL_ACT_Z_H_HSTATE]->attr.size, + inputs[GRUCELL_ACT_Z_H_HSTATE]->attr.dim_num * sizeof(vsi_size_t) ); + } + + return TRUE; +} /* op_setup() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ GRUCELL_ACTIVATION_Z_H, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ GRUCELL_ACT_Z_H_IN_CNT, + /* output_num */ GRUCELL_ACT_Z_H_OUT_CNT + ); + +__END_DECLS diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_h_times_activation_r.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_h_times_activation_r.c new file mode 100644 index 0000000..e1e4480 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_h_times_activation_r.c @@ -0,0 +1,124 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" + +typedef struct _grucell_h_times_activation_r_local_data_t { + int32_t placeholder; +} grucell_h_times_activation_r_local_data_t; + +/* + Declare number of input and output. 
+ */ +#define _INPUT_NUM (3) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_grucell_h_times_activation_r_param* p = &self->nn_param.grucell_h_times_activation_r; + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t* param; + + param = vsi_nn_kernel_param_create(); + vsi_nn_kernel_param_add_int32(param, "recurrent_activation", p->recurrent_activation); + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "grucell_h_times_activation_r", + inputs, _INPUT_NUM, + outputs, _OUTPUT_NUM, + param ); + + if( self->n ) + { + status = VSI_SUCCESS; + } + + vsi_nn_kernel_param_release( ¶m ); + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check tensor shapes. */ + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + if (VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num) + { + outputs[0]->attr.dim_num = \ + inputs[0]->attr.dim_num; + + memcpy( outputs[0]->attr.size, + inputs[0]->attr.size, + inputs[0]->attr.dim_num * sizeof(vsi_size_t) ); + } + + return TRUE; +} /* op_setup() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ GRUCELL_H_TIMES_ACTIVATION_R, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_ovxlib.c index 1814c51..31df29c 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_ovxlib.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_ovxlib.c @@ -1022,6 +1022,7 @@ static vsi_bool op_setup_default && (p->local->multi_batch)) { vsi_nn_tensor_t* wei_r2c_tensor = NULL; + vsi_nn_tensor_t* bias_r2c_tensor = NULL; memcpy(&attr, &(inputs[GRUCELL_INPUT_WEIGHT_H2C]->attr), sizeof(attr)); attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; @@ -1036,10 +1037,12 @@ static vsi_bool op_setup_default } wei_r2c_tensor = vsi_nn_ConvertTensorDtype(self->graph, inputs[GRUCELL_INPUT_WEIGHT_H2C], &(attr.dtype)); + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32; + bias_r2c_tensor = vsi_nn_ConvertTensorDtype(self->graph, inputs[GRUCELL_INPUT_BIAS_H2C], &(attr.dtype)); rh_cand_fc_output = vsi_nn_rnn_create_tp_fc(self, rh_mul_outputs->t, wei_r2c_tensor, - inputs[GRUCELL_INPUT_BIAS_H2C], + bias_r2c_tensor, &p->internal_dtype[GRUCELL_QUANTIZE_PARAM_H2C], use_virtual_tensor); } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c index ce7290d..ed652c3 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c @@ -130,7 +130,7 @@ static vsi_status op_compute } } - param =vsi_nn_kernel_param_create(); + param = vsi_nn_kernel_param_create(); vsi_nn_kernel_param_add_float32( param, "eps", eps ); vsi_nn_kernel_param_add_int32( param, "reshape_flg", rs_flg ); n = vsi_nn_kernel_selector( self->graph, "instance_norm", @@ -172,8 +172,8 @@ static vsi_status op_optimize /* insert a reshape node before and after 3D instance_norm */ - shape[0] = 1; - shape[1] = inputs[0]->attr.size[0]; + shape[0] = inputs[0]->attr.size[0]; + 
shape[1] = 1; shape[2] = inputs[0]->attr.size[1]; shape[3] = inputs[0]->attr.size[2]; dim = 4; @@ -320,4 +320,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_l2_normalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_l2_normalize.c index 055dbd9..2df9bc2 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_l2_normalize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_l2_normalize.c @@ -32,6 +32,7 @@ #include "utils/vsi_nn_math.h" #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_constraint_check.h" +#include "kernel/vsi_nn_kernel.h" static vsi_status op_compute ( @@ -41,81 +42,24 @@ static vsi_status op_compute ) { vsi_status status = VX_FAILURE; -#ifdef VX_L2NORM_AXIS_PARAMETER_SUPPORT - vx_nn_l2norm_params_t param; + int32_t axis = self->nn_param.l2_normalize.axis; + vsi_nn_kernel_param_t * param = NULL; - param.axis = self->nn_param.l2_normalize.axis; + param = vsi_nn_kernel_param_create(); + vsi_nn_kernel_param_add_int32( param, "axis", axis ); - self->n = vxL2NormalizeLayer2( - self->graph->g, - inputs[0]->t, - ¶m, - sizeof(vx_nn_l2norm_params_t), - outputs[0]->t - ); - - if( NULL != self->n ) - { - status = VSI_SUCCESS; - } -#else - vsi_nn_l2_normalize_param * p; - int32_t axis = -1; - uint32_t i = 0; - uint32_t sizes[VSI_NN_MAX_DIM_NUM] = {1}; - uint32_t innerSize = 1; - uint32_t outerSize = 1; - uint32_t axisSize = 1; - vx_tensor vx_input = NULL; - vx_tensor vx_output = NULL; - vx_tensor input = inputs[0]->t; - vx_tensor output = outputs[0]->t; - - status = VSI_FAILURE; - - p = &(self->nn_param.l2_normalize); - axis = p->axis; - - if (axis != 2) - { - axisSize = inputs[0]->attr.size[axis]; - - for (i = 0; i < (uint32_t)axis; i++) - { - innerSize *= inputs[0]->attr.size[i]; - } - - for (i = (uint32_t)(axis + 1); i < inputs[0]->attr.dim_num; i++) - { - outerSize *= inputs[0]->attr.size[i]; - } - - sizes[0] = innerSize; - sizes[1] = 1; - sizes[2] = axisSize; - sizes[3] = outerSize; - - vx_input = vxReshapeTensor(inputs[0]->t, (int32_t *)sizes, vsi_nn_max(inputs[0]->attr.dim_num, 4)); - vx_output = vxReshapeTensor(outputs[0]->t, (int32_t *)sizes, vsi_nn_max(inputs[0]->attr.dim_num, 4)); - - input = vx_input; - output = vx_output; - } - - self->n = vxL2NormalizeLayer( - self->graph->g, - input, - output - ); + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + "l2_norm", + inputs, 1, + outputs, 1, param );; if( NULL != self->n ) { status = VSI_SUCCESS; } - if (vx_input) vxReleaseTensor(&vx_input); - if (vx_output) vxReleaseTensor(&vx_output); -#endif + vsi_nn_kernel_param_release( ¶m ); + return status; } /* op_compute() */ @@ -189,4 +133,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_linear.c b/src/tim/vx/internal/src/ops/vsi_nn_op_linear.c index 50bdef0..69e27a1 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_linear.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_linear.c @@ -22,7 +22,6 @@ * *****************************************************************************/ - #include #include @@ -36,7 +35,6 @@ #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_constraint_check.h" - #define _INPUT_NUM (1) #define _OUTPUT_NUM (1) @@ -47,23 +45,23 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status; - status = VSI_FAILURE; + vsi_status status = VSI_SUCCESS; + vsi_nn_kernel_param_t * param; + vsi_nn_kernel_node_t n; - self->n = vxActivationLayer( - self->graph->g, - inputs[0]->t, - VX_CONVOLUTIONAL_NETWORK_ACTIVATION_LINEAR, - self->nn_param.linear.a, - 
self->nn_param.linear.b, - outputs[0]->t - ); + param = vsi_nn_kernel_param_create(); - if( NULL != self->n ) + vsi_nn_kernel_param_add_float32( param, "a_v", self->nn_param.linear.a ); + vsi_nn_kernel_param_add_float32( param, "b_v", self->nn_param.linear.b ); + + n = vsi_nn_kernel_selector( self->graph, "linear", inputs, 1, outputs, 1, param ); + if( n == NULL ) { - status = VSI_SUCCESS; + status = VSI_FAILURE; } + vsi_nn_kernel_param_release( ¶m ); + return status; } /* op_compute() */ @@ -103,7 +101,6 @@ static vsi_bool op_check return TRUE; } /* op_check() */ - __BEGIN_DECLS /* Registrar */ @@ -121,4 +118,3 @@ DEF_OP_REG ); __END_DECLS - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lrn2.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lrn2.c index ee65331..fd3e610 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_lrn2.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lrn2.c @@ -86,13 +86,10 @@ static vsi_status op_compute if(outerSize < MAX_BATCH_COUNT) { -#ifdef VSI_40BIT_VA_SUPPORT - vx_input = vxReshapeTensor(inputs[0]->t, sizes, vsi_nn_max(inputs[0]->attr.dim_num, 4)); - vx_output = vxReshapeTensor(outputs[0]->t, sizes, vsi_nn_max(inputs[0]->attr.dim_num, 4)); -#else - vx_input = vxReshapeTensor(inputs[0]->t, (int32_t*)sizes, vsi_nn_max(inputs[0]->attr.dim_num, 4)); - vx_output = vxReshapeTensor(outputs[0]->t, (int32_t*)sizes, vsi_nn_max(inputs[0]->attr.dim_num, 4)); -#endif + vx_input = vsi_nn_safe_reshape_tensor(inputs[0]->t, + (void*)sizes, vsi_nn_max(inputs[0]->attr.dim_num, 4), sizeof(sizes[0])); + vx_output = vsi_nn_safe_reshape_tensor(outputs[0]->t, + (void*)sizes, vsi_nn_max(inputs[0]->attr.dim_num, 4), sizeof(sizes[0])); input = vx_input; output = vx_output; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_activation.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_activation.c index 3db70e8..a6c5c63 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_activation.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_activation.c @@ -98,7 +98,6 @@ static vsi_status op_compute } return status; - } /* op_compute() */ static vsi_bool op_check @@ -136,7 +135,14 @@ static vsi_bool op_setup p->is_cifg = inputs[LSTMUNIT_ACT_INPUT_FC_I] == NULL; p->is_projection = outputs[LSTMUNIT_ACT_HSTATE_OUT] == NULL; - p->is_layer_norm = inputs[LSTMUNIT_ACT_LN_WF] != NULL; + if (self->graph->ctx->config.support_stream_processor) + { + p->is_layer_norm = inputs[LSTMUNIT_ACT_HSTATE_FC_F] == NULL; + } + else + { + p->is_layer_norm = inputs[LSTMUNIT_ACT_LN_WF] != NULL; + } p->is_hybrid = p->is_layer_norm ? 0 : inputs[LSTMUNIT_ACT_DATA_BF] != NULL; p->recurrent_activation = p->recurrent_activation == VSI_NN_ACT_NONE ? 
VSI_NN_ACT_SIGMOID : p->recurrent_activation; @@ -221,7 +227,6 @@ static vsi_bool op_setup } return TRUE; - } /* op_setup() */ static vsi_status op_deinit @@ -229,7 +234,6 @@ static vsi_status op_deinit vsi_nn_node_t * self ) { - vsi_status status = VSI_SUCCESS; int32_t i = 0; @@ -249,7 +253,6 @@ static vsi_status op_deinit } return status; - } /* op_deinit() */ static vsi_status op_init @@ -257,13 +260,11 @@ static vsi_status op_init vsi_nn_node_t * self ) { - vsi_status status = VSI_SUCCESS; self->nn_param.lstmunit_activation.recurrent_activation = VSI_NN_ACT_SIGMOID; return status; - } /* op_init() */ #ifdef __cpluplus diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c index d5d5123..5433281 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c @@ -41,7 +41,6 @@ #include "vsi_nn_rnn_helper.h" #include "utils/vsi_nn_dtype_util.h" - static vsi_nn_internal_tensor_t* create_tp_fc ( vsi_nn_node_t * self, @@ -54,18 +53,13 @@ static vsi_nn_internal_tensor_t* create_tp_fc { vsi_nn_lstmunit_ovxlib_param* p = &self->nn_param.lstmunit_ovxlib; vsi_nn_tensor_attr_t attr; - vsi_nn_tensor_t* tensor = NULL; - vsi_nn_internal_tensor_t* tensor1 = NULL; vsi_nn_internal_tensor_t* tensor2 = NULL; vsi_nn_internal_node_t* tmp_inode = NULL; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); - tensor = bias; if( !bias || p->local->use_layer_norm || p->local->use_hybrid ) { - /* create zero bias for NN/TP */ - tensor1 = vsi_nn_internal_create_zero_bias_tensor(self, &input->attr, &weight->attr, VSI_NN_OP_FCL, FALSE); - tensor = tensor1->t; + bias = NULL; } vsi_nn_internal_init_tensor_attr(&attr, output_dtype, use_virtual_tensor); @@ -77,7 +71,7 @@ static vsi_nn_internal_tensor_t* create_tp_fc tmp_inode->inputs[0] = input; tmp_inode->inputs[1] = weight; - tmp_inode->inputs[2] = tensor; + tmp_inode->inputs[2] = bias; tmp_inode->outputs[0] = tensor2->t; vsi_nn_internal_setup_node(self, tmp_inode); @@ -98,21 +92,15 @@ static vsi_nn_internal_tensor_t* create_nn_fc { vsi_nn_lstmunit_ovxlib_param* p = &self->nn_param.lstmunit_ovxlib; vsi_nn_tensor_attr_t attr; - vsi_nn_tensor_t* tensor = NULL; - vsi_nn_internal_tensor_t* tensor1 = NULL; vsi_nn_internal_tensor_t* tensor2 = NULL; vsi_nn_internal_tensor_t* reshaped_weight_tensor = NULL; vsi_size_t reshaped_weight_shape[VSI_NN_MAX_DIM_NUM] = { 0 }; vsi_nn_internal_node_t* tmp_inode = NULL; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); - tensor = bias; if( !bias || p->local->use_layer_norm || p->local->use_hybrid ) { - /* create zero bias for NN/TP */ - tensor1 = vsi_nn_internal_create_zero_bias_tensor( - self, &input->attr, &weight->attr, VSI_NN_OP_FCL, FALSE); - tensor = tensor1->t; + bias = NULL; } vsi_nn_internal_init_tensor_attr(&attr, output_dtype, use_virtual_tensor); @@ -149,7 +137,7 @@ static vsi_nn_internal_tensor_t* create_nn_fc tmp_inode->inputs[0] = input; tmp_inode->inputs[1] = reshaped_weight_tensor->t; - tmp_inode->inputs[2] = tensor; + tmp_inode->inputs[2] = bias; tmp_inode->outputs[0] = tensor2->t; vsi_nn_internal_setup_node(self, tmp_inode); @@ -284,6 +272,7 @@ static vsi_bool op_setup vsi_nn_tensor_attr_t attr; vsi_bool is_input_fc_on_tp = FALSE; vsi_bool is_recurrent_fc_on_tp = FALSE; + vsi_nn_internal_tensor_t* add_tensor = NULL; vsi_nn_internal_tensor_t* input_tensor = NULL; vsi_nn_internal_tensor_t* output_tensor = NULL; vsi_nn_internal_tensor_t* recurrent_input_tensor = NULL; @@ -509,23 +498,54 @@ 
static vsi_bool op_setup { for( i = ifco_start_index; i < LSTMUNIT_IFCO_GATE_COUNT; i++ ) { - memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); - attr.dim_num = VSI_NN_DIM_AUTO; - attr.vtl = use_virtual_tensor; - attr.is_const = FALSE; - attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; - attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; - input_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + if (self->graph->ctx->config.support_stream_processor) + { + memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); + attr.dim_num = VSI_NN_DIM_AUTO; + attr.vtl = use_virtual_tensor; + attr.is_const = FALSE; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + add_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + /* create internal nodes */ + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_ADD, 0, 0 ); + curr->inputs[0] = input_fc_outputs[i]->t; + curr->inputs[1] = recurrent_fc_outputs[i]->t; + curr->outputs[0] = add_tensor->t; + vsi_nn_internal_setup_node(self, curr); - /* create internal nodes */ - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_TENSOR_ADD_MEAN_STDDEV_NORM, 0, 0 ); - curr->node->nn_param.tensor_add_mean_stddev_norm.eps = (float)1e-8; - curr->inputs[0] = input_fc_outputs[i]->t; - curr->inputs[1] = recurrent_fc_outputs[i]->t; - curr->outputs[0] = input_tensor->t; - vsi_nn_internal_setup_node(self, curr); + /* create internal nodes */ + input_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_LAYER_NORM, 0, 0 ); + curr->node->nn_param.layernorm.eps = (float)1e-8; + curr->inputs[0] = add_tensor->t; + curr->inputs[1] = inputs[LSTMUNIT_INPUT_BIAS_I + i]; + curr->inputs[2] = inputs[LSTMUNIT_INPUT_LAYERNORM_I + i]; + curr->outputs[0] = input_tensor->t; + vsi_nn_internal_setup_node(self, curr); - layernorm_outputs[i] = input_tensor; + layernorm_outputs[i] = input_tensor; + } + else + { + memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); + attr.dim_num = VSI_NN_DIM_AUTO; + attr.vtl = use_virtual_tensor; + attr.is_const = FALSE; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + input_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + + /* create internal nodes */ + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_TENSOR_ADD_MEAN_STDDEV_NORM, 0, 0 ); + curr->node->nn_param.tensor_add_mean_stddev_norm.eps = (float)1e-8; + curr->inputs[0] = input_fc_outputs[i]->t; + curr->inputs[1] = recurrent_fc_outputs[i]->t; + curr->outputs[0] = input_tensor->t; + vsi_nn_internal_setup_node(self, curr); + + layernorm_outputs[i] = input_tensor; + } } } @@ -544,7 +564,8 @@ static vsi_bool op_setup curr->inputs[LSTMUNIT_ACT_CSTATE_IN] = inputs[LSTMUNIT_INPUT_C_STATE]; for( i = ifco_start_index; i < LSTMUNIT_IFCO_GATE_COUNT; i++ ) { - if( p->local->use_layer_norm || p->local->use_hybrid ) + if( (p->local->use_layer_norm && !self->graph->ctx->config.support_stream_processor) || + p->local->use_hybrid ) { curr->inputs[LSTMUNIT_ACT_DATA_BI + i] = inputs[LSTMUNIT_INPUT_BIAS_I + i]; } @@ -552,7 +573,14 @@ static vsi_bool op_setup if( p->local->use_layer_norm ) { /* Pass layernorm weights to VSI_NN_OP_LSTMUNIT_ACTIVATION */ - curr->inputs[LSTMUNIT_ACT_LN_WI + i] = inputs[LSTMUNIT_INPUT_LAYERNORM_I + i]; + if (self->graph->ctx->config.support_stream_processor) + { + curr->inputs[LSTMUNIT_ACT_LN_WI + i] = NULL; + } + else + { + curr->inputs[LSTMUNIT_ACT_LN_WI + i] = inputs[LSTMUNIT_INPUT_LAYERNORM_I + i]; + } 
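/* Editorial aside (not part of the patch): on stream-processor targets the fused
 * VSI_NN_OP_TENSOR_ADD_MEAN_STDDEV_NORM used elsewhere is split into a plain ADD
 * followed by VSI_NN_OP_LAYER_NORM, with the gate bias and the layer-norm weight
 * passed as extra inputs and eps = 1e-8, as the hunk above shows. The standalone
 * reference below is only a sketch of the per-gate arithmetic that pair is
 * expected to produce, assuming the usual layer-norm semantics (layer-norm
 * weight as scale, gate bias as shift); the function name is illustrative and
 * is not an API of this library. */
#include <math.h>
static void lstm_gate_layernorm_ref(const float *in_fc, const float *rec_fc,
                                    const float *scale, const float *bias,
                                    float *out, int n)
{
    int i;
    float mean = 0.0f, var = 0.0f;
    const float eps = 1e-8f;
    /* 1. elementwise ADD of the input FC and recurrent FC results */
    for (i = 0; i < n; i++) {
        out[i] = in_fc[i] + rec_fc[i];
        mean += out[i];
    }
    mean /= (float)n;
    /* 2. normalize to zero mean / unit variance over the gate vector */
    for (i = 0; i < n; i++) {
        float d = out[i] - mean;
        var += d * d;
    }
    var /= (float)n;
    /* 3. scale and shift with the layer-norm weight and gate bias */
    for (i = 0; i < n; i++) {
        out[i] = scale[i] * (out[i] - mean) / sqrtf(var + eps) + bias[i];
    }
}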
curr->inputs[LSTMUNIT_ACT_INPUT_FC_I + i] = layernorm_outputs[i]->t; curr->inputs[LSTMUNIT_ACT_HSTATE_FC_I + i] = NULL; } @@ -644,17 +672,7 @@ static vsi_bool op_setup curr->inputs[2] = zero_bias_tensor; /* Save output to h_state first and copy to output */ - if( p->local->use_hybrid && p->local->use_projection_bias ) - { - vsi_nn_internal_init_tensor_attr(&attr, - &outputs[LSTMUNIT_OUTPUT_H_STATE]->attr.dtype, use_virtual_tensor); - output_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f); - curr->outputs[0] = output_tensor->t; - } - else - { - curr->outputs[0] = outputs[LSTMUNIT_OUTPUT_H_STATE]; - } + curr->outputs[0] = outputs[LSTMUNIT_OUTPUT_H_STATE]; vsi_nn_internal_setup_node(self, curr); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c b/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c index eaeaaa5..fcf29f4 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_matrixmul.c @@ -30,6 +30,7 @@ #include "vsi_nn_prv.h" #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" #include "vsi_nn_error.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" @@ -51,6 +52,13 @@ static vsi_status op_compute vsi_status status = VSI_FAILURE; vsi_nn_kernel_param_t * param = NULL; vsi_nn_kernel_node_t n = NULL; + vsi_nn_tensor_t * tmp_inputs[2] = {NULL}; + vsi_nn_tensor_t * tmp_outputs[1] = {NULL}; + vsi_nn_tensor_t * rs_input = NULL; + vsi_nn_tensor_t * rs_output = NULL; + vsi_size_t shape_in[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1}; + vsi_size_t shape_out[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1}; + uint32_t i = 0; int32_t transposeA = self->nn_param.matrixmul.transpose[0]; int32_t transposeB = self->nn_param.matrixmul.transpose[1]; @@ -64,7 +72,47 @@ static vsi_status op_compute vsi_nn_kernel_param_add_int32( param, "adjointA", adjointA ); vsi_nn_kernel_param_add_int32( param, "adjointB", adjointB ); - n = vsi_nn_kernel_selector( self->graph, "matrixmul", inputs, 2, outputs, 1, param ); + if (inputs[0]->attr.dim_num == 1 && inputs[1]->attr.dim_num > 1) + { + shape_in[0] = inputs[0]->attr.size[0]; + shape_in[1] = 1; + shape_out[0] = outputs[0]->attr.size[0]; + shape_out[1] = 1; + for(i = 2; i <= outputs[0]->attr.dim_num; i++) + { + shape_out[i] = outputs[0]->attr.size[i - 1]; + } + rs_input = vsi_nn_reshape_tensor(self->graph, inputs[0], shape_in, 2); + rs_output = vsi_nn_reshape_tensor(self->graph, outputs[0], shape_out, outputs[0]->attr.dim_num + 1); + tmp_inputs[0] = rs_input; + tmp_inputs[1] = inputs[1]; + tmp_outputs[0] = rs_output; + } + else if (inputs[1]->attr.dim_num == 1 && inputs[0]->attr.dim_num > 1) + { + shape_in[0] = 1; + shape_in[1] = inputs[1]->attr.size[0]; + + shape_out[0] = 1; + for(i = 1; i <= outputs[0]->attr.dim_num; i++) + { + shape_out[i] = outputs[0]->attr.size[i - 1]; + } + rs_input = vsi_nn_reshape_tensor(self->graph, inputs[1], shape_in, 2); + rs_output = vsi_nn_reshape_tensor(self->graph, outputs[0], shape_out, outputs[0]->attr.dim_num + 1); + + tmp_inputs[0] = inputs[0]; + tmp_inputs[1] = rs_input; + tmp_outputs[0] = rs_output; + } + else + { + tmp_inputs[0] = inputs[0]; + tmp_inputs[1] = inputs[1]; + tmp_outputs[0] = outputs[0]; + } + + n = vsi_nn_kernel_selector( self->graph, "matrixmul", tmp_inputs, 2, tmp_outputs, 1, param ); if ( n != NULL ) { self->n = (vx_node)n; @@ -76,6 +124,15 @@ static vsi_status op_compute vsi_nn_kernel_param_release( ¶m ); } + if (rs_input != NULL) + { + vsi_nn_ReleaseTensor( &rs_input ); + } + if (rs_output != NULL) + { + 
vsi_nn_ReleaseTensor( &rs_output ); + } + return status; } /* op_compute() */ @@ -126,23 +183,32 @@ static vsi_bool op_check return FALSE; } - if (self->nn_param.matrixmul.transpose[0] == FALSE + if ((inputs[0]->attr.dim_num == 1 || inputs[1]->attr.dim_num == 1) + && (self->nn_param.matrixmul.transpose[0] == TRUE || self->nn_param.matrixmul.transpose[1] == TRUE)) + { + VSILOGE("Transpose parameters should be all false when input tensor is 1D"); + return FALSE; + } + else if (self->nn_param.matrixmul.transpose[0] == FALSE && self->nn_param.matrixmul.transpose[1] == FALSE - && inputs[0]->attr.size[0] != inputs[1]->attr.size[1]) + && inputs[0]->attr.size[0] != inputs[1]->attr.size[1] + && inputs[0]->attr.dim_num > 1 && inputs[1]->attr.dim_num > 1) { VSILOGE("1st input tensor's size[0] is not equal to 2nd input tensor's size[1]"); return FALSE; } else if (self->nn_param.matrixmul.transpose[0] == TRUE && self->nn_param.matrixmul.transpose[1] == FALSE - && inputs[0]->attr.size[1] != inputs[1]->attr.size[1]) + && inputs[0]->attr.size[1] != inputs[1]->attr.size[1] + && inputs[0]->attr.dim_num > 1 && inputs[1]->attr.dim_num > 1) { VSILOGE("1st input tensor's size[1] is not equal to 2nd input tensor's size[1]"); return FALSE; } else if (self->nn_param.matrixmul.transpose[0] == FALSE && self->nn_param.matrixmul.transpose[1] == TRUE - && inputs[0]->attr.size[0] != inputs[1]->attr.size[0]) + && inputs[0]->attr.size[0] != inputs[1]->attr.size[0] + && inputs[0]->attr.dim_num > 1 && inputs[1]->attr.dim_num > 1) { VSILOGE("1st input tensor's size[0] is not equal to 2nd input tensor's size[0]"); return FALSE; @@ -195,7 +261,25 @@ static vsi_bool op_setup return FALSE; } - if (inputs[0]->attr.dim_num > inputs[1]->attr.dim_num) + if (inputs[0]->attr.dim_num == 1 && inputs[1]->attr.dim_num > 1) + { + outputs[0]->attr.dim_num = inputs[1]->attr.dim_num - 1; + outputs[0]->attr.size[0] = inputs[1]->attr.size[0]; + for (i = 1; i < outputs[0]->attr.dim_num; i++) + { + outputs[0]->attr.size[i] = inputs[1]->attr.size[i + 1]; + } + } + else if (inputs[1]->attr.dim_num == 1 && inputs[0]->attr.dim_num > 1) + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num - 1; + + for (i = 0; i < outputs[0]->attr.dim_num; i++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i + 1]; + } + } + else if (inputs[0]->attr.dim_num > inputs[1]->attr.dim_num) { for (i = 2; i < inputs[0]->attr.dim_num; i++) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pad.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pad.c index 2ae7605..a8d7b30 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pad.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pad.c @@ -35,6 +35,7 @@ #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_math.h" #include "utils/vsi_nn_constraint_check.h" +#include "utils/vsi_nn_dtype_util.h" vsi_status vsi_nn_InitPadParameter ( @@ -127,6 +128,7 @@ static vsi_status op_compute { vsi_status status; vx_nn_pad_params_t p; + vsi_nn_tensor_t *convert_tensor = NULL; status = VSI_FAILURE; if(VSI_SUCCESS != vsi_nn_InitPadParameter(self, &p)) @@ -135,20 +137,43 @@ static vsi_status op_compute return VSI_FAILURE; } + if ( vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE) + { + vsi_nn_tensor_attr_t attr; + memcpy( &attr, &outputs[0]->attr, sizeof( attr ) ); + memcpy( &attr.size, &inputs[0]->attr.size, sizeof( attr.size ) ); + attr.vtl = FALSE; + attr.is_const = FALSE; + + convert_tensor = vsi_nn_CreateTensor(self->graph, &attr); + + self->n = vxTensorCopyNode( + self->graph->g, + inputs[0]->t, + convert_tensor->t 
+ ); + } + else + { + convert_tensor = vsi_nn_reshape_tensor( self->graph, + inputs[0], inputs[0]->attr.size, inputs[0]->attr.dim_num ); + } self->n = vxTensorPadNode( self->graph->g, - inputs[0]->t, + convert_tensor->t, outputs[0]->t, &p, sizeof(p) ); vsi_nn_DeinitPadParameter(&p); + vsi_safe_release_tensor(convert_tensor); if( NULL != self->n ) { status = VSI_SUCCESS; } + return status; } /* op_compute() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_permute.c b/src/tim/vx/internal/src/ops/vsi_nn_op_permute.c index a5ce4f9..f1386c7 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_permute.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_permute.c @@ -254,13 +254,8 @@ static vsi_status op_optimize { if(NULL == inputs[0]->t && NULL != outputs[0]->t) { -#ifdef VSI_40BIT_VA_SUPPORT - inputs[0]->t = vxReshapeTensor( outputs[0]->t, - inputs[0]->attr.size, inputs[0]->attr.dim_num ); -#else - inputs[0]->t = vxReshapeTensor( outputs[0]->t, - (int32_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num ); -#endif + inputs[0]->t = vsi_nn_safe_reshape_tensor( outputs[0]->t, + (void*)inputs[0]->attr.size, (vsi_size_t)inputs[0]->attr.dim_num, sizeof(inputs[0]->attr.size[0]) ); if( inputs[0]->t == NULL ) { status = VSI_FAILURE; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_post_process.c b/src/tim/vx/internal/src/ops/vsi_nn_op_post_process.c index a198d32..18942fa 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_post_process.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_post_process.c @@ -151,9 +151,9 @@ static vsi_bool op_setup if (self->nn_param.post_process.local.enable_data_conv == FALSE && self->nn_param.post_process.local.enable_perm == FALSE) { - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE, 0, 0 ); - curr->node->nn_param.reshape.size = outputs[0]->attr.size; - curr->node->nn_param.reshape.dim_num = outputs[0]->attr.dim_num; + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); + curr->node->nn_param.reshape2.size = outputs[0]->attr.size; + curr->node->nn_param.reshape2.dim_num = outputs[0]->attr.dim_num; curr->inputs[0] = inputs[POST_PROCESS_INPUT]; curr->outputs[0] = outputs[POST_PROCESS_OUTPUT]; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c index 9f0a995..6a955a5 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c @@ -315,7 +315,7 @@ static vsi_bool op_setup memcpy(&attr, &outputs[0]->attr, sizeof(vsi_nn_tensor_attr_t)); for(i = 0; i < p->output_attr.dim_num; i++) { - attr.size[i] = (vsi_size_t)p->output_attr.size[i]; + attr.size[i] = -1 == p->output_attr.size[i] ? 
-1 : (vsi_size_t)p->output_attr.size[i]; } attr.size[axis] = 1; attr.vtl = TRUE; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_gray.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_gray.c index 31818ee..d264ee7 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_gray.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_gray.c @@ -51,7 +51,7 @@ static vsi_status op_compute vsi_status status = VSI_FAILURE; vsi_nn_kernel_param_t * param = NULL; vsi_nn_kernel_node_t n = NULL; - param =vsi_nn_kernel_param_create(); + param = vsi_nn_kernel_param_create(); vsi_nn_kernel_param_add_int32( param, "scale_x", self->nn_param.pre_process_gray.local.scale_x ); vsi_nn_kernel_param_add_int32( param, "scale_y", self->nn_param.pre_process_gray.local.scale_y ); @@ -60,6 +60,9 @@ static vsi_status op_compute vsi_nn_kernel_param_add_float32( param, "mean", self->nn_param.pre_process_gray.mean ); vsi_nn_kernel_param_add_float32( param, "scale", self->nn_param.pre_process_gray.scale ); vsi_nn_kernel_param_add_int32( param, "enable_copy", self->nn_param.pre_process_gray.local.enable_copy ); + vsi_nn_kernel_param_add_int32( param, "width", self->nn_param.pre_process_gray.rect.width ); + vsi_nn_kernel_param_add_int32( param, "height", self->nn_param.pre_process_gray.rect.height ); + n = vsi_nn_kernel_selector( self->graph, "pre_process_gray", inputs, 1, outputs, 1, param ); if( n != NULL ) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_tensor.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_tensor.c index ba50f33..b4220a7 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_tensor.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_tensor.c @@ -151,9 +151,9 @@ static vsi_bool op_setup if (self->nn_param.pre_process_tensor.local.enable_data_conv == FALSE && self->nn_param.pre_process_tensor.local.enable_perm == FALSE) { - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE, 0, 0 ); - curr->node->nn_param.reshape.size = outputs[0]->attr.size; - curr->node->nn_param.reshape.dim_num = outputs[0]->attr.dim_num; + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); + curr->node->nn_param.reshape2.size = outputs[0]->attr.size; + curr->node->nn_param.reshape2.dim_num = outputs[0]->attr.dim_num; curr->inputs[0] = inputs[PRE_PROCESS_TENSOR_INPUT]; curr->outputs[0] = outputs[PRE_PROCESS_TENSOR_OUTPUT]; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_proposal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_proposal.c index 3ef8224..b66a5cf 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_proposal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_proposal.c @@ -292,19 +292,7 @@ static vsi_status op_optimize { size[2] = outputs[0]->attr.size[0]; size[3] = outputs[0]->attr.size[1]; -#ifdef VSI_40BIT_VA_SUPPORT - rois_tmp = vxReshapeTensor(outputs[0]->t, size, dim); -#else - { - vsi_size_t i; - int32_t size_32bit[VSI_NN_MAX_DIM_NUM]; - for(i = 0; i< VSI_NN_MAX_DIM_NUM; i++) - { - size_32bit[i] = (int32_t)size[i]; - } - rois_tmp = vxReshapeTensor(outputs[0]->t, size_32bit, dim); - } -#endif + rois_tmp = vsi_nn_safe_reshape_tensor(outputs[0]->t, (void*)size, (vsi_size_t)dim, sizeof(size[0])); if(NULL == rois_tmp) { goto error; @@ -317,19 +305,7 @@ static vsi_status op_optimize { size[2] = outputs[1]->attr.size[0]; size[3] = outputs[1]->attr.size[1]; -#ifdef VSI_40BIT_VA_SUPPORT - score_tmp = vxReshapeTensor(outputs[1]->t, size, dim); -#else - { - vsi_size_t i; - int32_t size_32bit[VSI_NN_MAX_DIM_NUM]; - for(i = 0; i< VSI_NN_MAX_DIM_NUM; i++) - 
{ - size_32bit[i] = (int32_t)size[i]; - } - score_tmp = vxReshapeTensor(outputs[1]->t, size_32bit, dim); - } -#endif + score_tmp = vsi_nn_safe_reshape_tensor(outputs[1]->t, (void*)size, (vsi_size_t)dim, sizeof(size[0])); if(NULL == score_tmp) { goto error; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c index daeb768..d4629ec 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c @@ -33,6 +33,7 @@ #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" @@ -47,18 +48,29 @@ static vsi_status op_compute *If reshape is un-initialized, we need add a tensorcopy * when input and output are initialized. */ - if(inputs[0]->t != NULL && outputs[0]->t != NULL && + if (inputs[0]->t != NULL && outputs[0]->t != NULL && self->nn_param.reshape.local.initialized == FALSE) { + vsi_status status = VSI_SUCCESS; + vsi_nn_tensor_t *tmp_tensor = NULL; + + tmp_tensor = vsi_nn_reshape_tensor( self->graph, + outputs[0], inputs[0]->attr.size, inputs[0]->attr.dim_num ); + self->n = vxTensorCopyNode(self->graph->g, - inputs[0]->t, outputs[0]->t); - if(NULL == self->n) + inputs[0]->t, tmp_tensor->t); + if (NULL == self->n) { VSILOGE( "Create vxTensorCopyNode fail." ); - return VSI_FAILURE; + status = VSI_FAILURE; } VSILOGD("Create a copy node for reshape"); + + vsi_safe_release_tensor(tmp_tensor); + + return status; } + return VSI_SUCCESS; } /* op_compute() */ @@ -84,8 +96,11 @@ static vsi_bool op_setup if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {0}; - memcpy(shape, self->nn_param.reshape.size, - sizeof(vsi_size_t) * self->nn_param.reshape.dim_num); + uint32_t i = 0; + for(i = 0; i < self->nn_param.reshape.dim_num; i++) + { + shape[i] = -1 == self->nn_param.reshape.size[i] ? -1 : (vsi_size_t)self->nn_param.reshape.size[i]; + } ret = vsi_nn_CalcReshapeTensor(inputs[0], outputs[0], shape, @@ -108,21 +123,23 @@ static vsi_status op_optimize status = VSI_SUCCESS; ret = TRUE; - if(self->nn_param.reshape.local.initialized == FALSE) + + if( vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE) + { + return status; + } + + if (self->nn_param.reshape.local.initialized == FALSE) { VSILOGD("Optimize %s, uid %u", vsi_nn_OpGetName(self->op), self->uid); - if( direction == VSI_NN_OPTIMIZE_BACKWARD ) + if ( direction == VSI_NN_OPTIMIZE_BACKWARD ) { - if(NULL == inputs[0]->t && NULL != outputs[0]->t) + if (NULL == inputs[0]->t && NULL != outputs[0]->t) { -#ifdef VSI_40BIT_VA_SUPPORT - inputs[0]->t = vxReshapeTensor( outputs[0]->t, - inputs[0]->attr.size, inputs[0]->attr.dim_num ); -#else - inputs[0]->t = vxReshapeTensor( outputs[0]->t, - (int32_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num ); -#endif - if( inputs[0]->t == NULL ) + inputs[0]->t = vsi_nn_safe_reshape_tensor( outputs[0]->t, + (void*)inputs[0]->attr.size, (vsi_size_t)inputs[0]->attr.dim_num, + sizeof(inputs[0]->attr.size[0]) ); + if ( inputs[0]->t == NULL ) { status = VSI_FAILURE; } @@ -131,11 +148,17 @@ static vsi_status op_optimize } else { - if(NULL == outputs[0]->t) + if (NULL == outputs[0]->t) { + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {0}; + uint32_t i = 0; + for (i = 0; i < self->nn_param.reshape.dim_num; i++) + { + shape[i] = -1 == self->nn_param.reshape.size[i] ? 
-1 : (vsi_size_t)self->nn_param.reshape.size[i]; + } ret = vsi_nn_ReshapeTensor( self->graph, inputs[0], outputs[0], - self->nn_param.reshape.size, self->nn_param.reshape.dim_num ); - if( ret == FALSE ) + shape, self->nn_param.reshape.dim_num ); + if ( ret == FALSE ) { status = VSI_FAILURE; } @@ -166,4 +189,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape2.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape2.c new file mode 100644 index 0000000..4132004 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape2.c @@ -0,0 +1,204 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_log.h" + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* + *If reshape is un-initialized, we need add a tensorcopy + * when input and output are initialized. + */ + if(inputs[0]->t != NULL && outputs[0]->t != NULL && + self->nn_param.reshape2.local->initialized == FALSE) + { + self->n = vxTensorCopyNode(self->graph->g, + inputs[0]->t, outputs[0]->t); + if(NULL == self->n) + { + VSILOGE( "Create vxTensorCopyNode fail." ); + return VSI_FAILURE; + } + VSILOGD("Create a copy node for reshape"); + } + return VSI_SUCCESS; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + //TODO: Check tensor shapes. 
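/* Editorial aside (not part of the patch): both RESHAPE and the new RESHAPE2
 * keep a -1 entry in the requested shape as an "infer this dimension" sentinel;
 * the `-1 == size[i] ? -1 : (vsi_size_t)size[i]` mapping in vsi_nn_op_reshape.c
 * above preserves that sentinel when widening to vsi_size_t. The helper below is
 * a hypothetical, self-contained illustration of how such a wildcard dimension
 * is conventionally resolved from the input element count; it is not part of
 * this library. */
#include <stddef.h>
static int resolve_wildcard_dim(size_t in_elements, long long *shape, int dims)
{
    size_t known = 1;
    int wildcard = -1, i;
    for (i = 0; i < dims; i++) {
        if (shape[i] == -1) {
            if (wildcard >= 0) return -1;   /* at most one -1 is allowed */
            wildcard = i;
        } else {
            known *= (size_t)shape[i];
        }
    }
    if (wildcard >= 0) {
        if (known == 0 || in_elements % known != 0) return -1;
        shape[wildcard] = (long long)(in_elements / known);
    } else if (known != in_elements) {
        return -1;                          /* shape must cover all elements */
    }
    return 0;
}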
+ return TRUE; +} /* op_check() */ + +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + self->nn_param.reshape2.local = + (vsi_nn_reshape2_local_data *)malloc(sizeof(vsi_nn_reshape2_local_data)); + if (NULL == self->nn_param.reshape2.local) + { + return VX_ERROR_NO_MEMORY; + } + memset(self->nn_param.reshape2.local, 0, sizeof(vsi_nn_reshape2_local_data)); + return status; +} /* op_init() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + if (self->nn_param.reshape2.local != NULL) + { + free(self->nn_param.reshape2.local); + self->nn_param.reshape2.local = NULL; + } + + vsi_nn_op_common_deinit(self); + + return VSI_SUCCESS; +} /* op_deinit() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = TRUE; + if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {0}; + memcpy(shape, self->nn_param.reshape2.size, + sizeof(vsi_size_t) * self->nn_param.reshape2.dim_num); + ret = vsi_nn_CalcReshapeTensor(inputs[0], + outputs[0], + shape, + self->nn_param.reshape2.dim_num); + } + + return ret; +} /* op_setup() */ + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + vsi_status status; + vsi_bool ret; + + status = VSI_SUCCESS; + ret = TRUE; + if ( vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype) == FALSE) + { + return status; + } + + if (self->nn_param.reshape2.local->initialized == FALSE) + { + VSILOGD("Optimize %s, uid %u", vsi_nn_OpGetName(self->op), self->uid); + if ( direction == VSI_NN_OPTIMIZE_BACKWARD ) + { + if (NULL == inputs[0]->t && NULL != outputs[0]->t) + { + inputs[0]->t = vsi_nn_safe_reshape_tensor( outputs[0]->t, + (void*)inputs[0]->attr.size, (vsi_size_t)inputs[0]->attr.dim_num, + sizeof(inputs[0]->attr.size[0]) ); + if ( inputs[0]->t == NULL ) + { + status = VSI_FAILURE; + } + self->nn_param.reshape2.local->initialized = TRUE; + } + } + else + { + if (NULL == outputs[0]->t) + { + ret = vsi_nn_ReshapeTensor( self->graph, inputs[0], outputs[0], + self->nn_param.reshape2.size, self->nn_param.reshape2.dim_num ); + if ( ret == FALSE ) + { + status = VSI_FAILURE; + } + self->nn_param.reshape2.local->initialized = TRUE; + } + } + } + + return status; +} /* op_optimize() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ RESHAPE2, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ op_optimize, + /* input_num */ 1, + /* output_num */ 1 + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c index 9454c42..ad39a8b 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c @@ -81,10 +81,7 @@ static vsi_status op_compute { vsi_status status = VSI_FAILURE; - if ( ((self->nn_param.resize.align_corners || self->nn_param.resize.half_pixel_centers) - && (VSI_NN_INTERPOLATION_BILINEAR == self->nn_param.resize.type - || VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR == self->nn_param.resize.type)) - || _is_same_shape(inputs[0], outputs[0]->attr.size, outputs[0]->attr.dim_num) ) + if ( self->nn_param.resize.lcl_data->use_internal_node ) { status = vsi_nn_internal_compute_node( self ); } @@ -121,10 +118,7 @@ static 
vsi_status op_optimize vsi_nn_opt_direction_e direction ) { - if ( ((self->nn_param.resize.align_corners || self->nn_param.resize.half_pixel_centers) - && (VSI_NN_INTERPOLATION_BILINEAR == self->nn_param.resize.type - || VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR == self->nn_param.resize.type) ) - || _is_same_shape(inputs[0], outputs[0]->attr.size, outputs[0]->attr.dim_num) ) + if ( self->nn_param.resize.lcl_data->use_internal_node ) { return vsi_nn_internal_optimize_node(self, direction ); } @@ -154,6 +148,7 @@ static vsi_bool op_setup { /* TODO: Add code to comput outputs' shape. */ float factor = self->nn_param.resize.factor; + vsi_enum layout = self->nn_param.resize.layout; vsi_nn_internal_node_t* curr = NULL; if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) @@ -161,26 +156,55 @@ static vsi_bool op_setup outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; if (factor != 0) { - outputs[0]->attr.size[0] = (uint32_t)(inputs[0]->attr.size[0] * factor); - outputs[0]->attr.size[1] = (uint32_t)(inputs[0]->attr.size[1] * factor); + if (layout == VSI_NN_RESIZE_LAYOUT_NCHW) + { + outputs[0]->attr.size[0] = (uint32_t)(inputs[0]->attr.size[0] * factor); + outputs[0]->attr.size[1] = (uint32_t)(inputs[0]->attr.size[1] * factor); + } + else + { + outputs[0]->attr.size[1] = (uint32_t)(inputs[0]->attr.size[1] * factor); + outputs[0]->attr.size[2] = (uint32_t)(inputs[0]->attr.size[2] * factor); + } } else { - outputs[0]->attr.size[0] = self->nn_param.resize.size[0]; - outputs[0]->attr.size[1] = self->nn_param.resize.size[1]; + if (layout == VSI_NN_RESIZE_LAYOUT_NCHW) + { + outputs[0]->attr.size[0] = self->nn_param.resize.size[0]; + outputs[0]->attr.size[1] = self->nn_param.resize.size[1]; + } + else + { + outputs[0]->attr.size[1] = self->nn_param.resize.size[0]; + outputs[0]->attr.size[2] = self->nn_param.resize.size[1]; + } + } + if (layout == VSI_NN_RESIZE_LAYOUT_NCHW) + { + outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; + outputs[0]->attr.size[3] = inputs[0]->attr.size[3]; + } + else + { + outputs[0]->attr.size[0] = inputs[0]->attr.size[0]; + outputs[0]->attr.size[3] = inputs[0]->attr.size[3]; } - outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; - outputs[0]->attr.size[3] = inputs[0]->attr.size[3]; } - if ((self->nn_param.resize.align_corners || self->nn_param.resize.half_pixel_centers) - && (VSI_NN_INTERPOLATION_BILINEAR == self->nn_param.resize.type)) + if ( ( self->nn_param.resize.align_corners || + self->nn_param.resize.half_pixel_centers || + layout == VSI_NN_RESIZE_LAYOUT_NHWC ) + && ( VSI_NN_INTERPOLATION_BILINEAR == self->nn_param.resize.type ) ) { + self->nn_param.resize.lcl_data->use_internal_node = TRUE; + vsi_nn_internal_init_node_wksp( self ); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESIZE_INTERNAL, 0, 0 ); curr->node->nn_param.resize_internal.align_corners = self->nn_param.resize.align_corners; curr->node->nn_param.resize_internal.factor = self->nn_param.resize.factor; curr->node->nn_param.resize_internal.half_pixel_centers = self->nn_param.resize.half_pixel_centers; + curr->node->nn_param.resize_internal.layout = self->nn_param.resize.layout; curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; vsi_nn_internal_setup_node(self, curr); @@ -188,6 +212,8 @@ static vsi_bool op_setup else if ((self->nn_param.resize.align_corners || self->nn_param.resize.half_pixel_centers) && (VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR == self->nn_param.resize.type)) { + self->nn_param.resize.lcl_data->use_internal_node = TRUE; + vsi_nn_internal_init_node_wksp( self ); curr = 
vsi_nn_internal_new_node( self, VSI_NN_OP_RESIZE_NEAREST_INTERNAL, 0, 0 ); curr->node->nn_param.resize_nearest_internal.align_corners = self->nn_param.resize.align_corners; @@ -199,6 +225,8 @@ static vsi_bool op_setup } else if (_is_same_shape(inputs[0], outputs[0]->attr.size, outputs[0]->attr.dim_num)) { + self->nn_param.resize.lcl_data->use_internal_node = TRUE; + vsi_nn_internal_init_node_wksp( self ); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); curr->inputs[0] = inputs[0]; @@ -214,14 +242,15 @@ static vsi_status op_deinit vsi_nn_node_t * self ) { - if ((self->nn_param.resize.align_corners || self->nn_param.resize.half_pixel_centers) - && (VSI_NN_INTERPOLATION_BILINEAR == self->nn_param.resize.type - || VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR == self->nn_param.resize.type)) + + if (self->nn_param.resize.lcl_data->use_internal_node) { + vsi_nn_safe_free(self->nn_param.resize.lcl_data); vsi_nn_internal_deinit_node_wksp(self); } else { + vsi_nn_safe_free(self->nn_param.resize.lcl_data); vsi_nn_op_common_deinit(self); } @@ -235,12 +264,25 @@ static vsi_status op_init { vsi_status status = VSI_SUCCESS; + self->nn_param.resize.lcl_data = + (vsi_nn_resize_local_data *)malloc( sizeof(vsi_nn_resize_local_data) ); + if( NULL == self->nn_param.resize.lcl_data ) + { + VSILOGE( "Create resize local data fail." ); + status = VSI_FAILURE; + goto final; + } + memset( self->nn_param.resize.lcl_data, 0, sizeof(vsi_nn_resize_local_data) ); + if (vsi_nn_compareVersion(self->graph, 1, 1, 14) == -1) { self->nn_param.resize.align_corners = FALSE; self->nn_param.resize.half_pixel_centers = FALSE; } + self->nn_param.resize.layout = VSI_NN_RESIZE_LAYOUT_NCHW; + +final: return status; } /* op_init() */ @@ -263,4 +305,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_resize_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_resize_internal.c index bd761d0..efa21d6 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_resize_internal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_resize_internal.c @@ -49,10 +49,10 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status = VSI_FAILURE; int32_t align_corners = self->nn_param.resize_internal.align_corners; int32_t half_pixel_centers = self->nn_param.resize_internal.half_pixel_centers; + vsi_enum layout = self->nn_param.resize_internal.layout; vsi_nn_kernel_param_t * param = NULL; param = vsi_nn_kernel_param_create(); @@ -60,10 +60,20 @@ static vsi_status op_compute vsi_nn_kernel_param_add_int32( param, "align_corners", align_corners ); vsi_nn_kernel_param_add_int32( param, "half_pixel_centers", half_pixel_centers ); - self->n = (vx_node)vsi_nn_kernel_selector( self->graph, - "resize_bilinear", - &inputs[0], 1, - &outputs[0], 1, param ); + if (layout == VSI_NN_RESIZE_LAYOUT_NCHW) + { + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + "resize_bilinear", + &inputs[0], 1, + &outputs[0], 1, param ); + } + else + { + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, + "resize_bilinear_nhwc", + &inputs[0], 1, + &outputs[0], 1, param ); + } if( self->n ) { @@ -73,7 +83,6 @@ static vsi_status op_compute vsi_nn_kernel_param_release( ¶m ); return status; - } /* op_compute() */ static vsi_bool op_check @@ -113,22 +122,47 @@ static vsi_bool op_setup { /* TODO: Add code to comput outputs' shape. 
*/ float factor = self->nn_param.resize_internal.factor; + vsi_enum layout = self->nn_param.resize_internal.layout; if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; if (factor != 0) { - outputs[0]->attr.size[0] = (uint32_t)(inputs[0]->attr.size[0] * factor); - outputs[0]->attr.size[1] = (uint32_t)(inputs[0]->attr.size[1] * factor); + if (layout == VSI_NN_RESIZE_LAYOUT_NCHW) + { + outputs[0]->attr.size[0] = (uint32_t)(inputs[0]->attr.size[0] * factor); + outputs[0]->attr.size[1] = (uint32_t)(inputs[0]->attr.size[1] * factor); + } + else + { + outputs[0]->attr.size[1] = (uint32_t)(inputs[0]->attr.size[1] * factor); + outputs[0]->attr.size[2] = (uint32_t)(inputs[0]->attr.size[2] * factor); + } } else { - outputs[0]->attr.size[0] = self->nn_param.resize.size[0]; - outputs[0]->attr.size[1] = self->nn_param.resize.size[1]; + if (layout == VSI_NN_RESIZE_LAYOUT_NCHW) + { + outputs[0]->attr.size[0] = self->nn_param.resize.size[0]; + outputs[0]->attr.size[1] = self->nn_param.resize.size[1]; + } + else + { + outputs[0]->attr.size[1] = self->nn_param.resize.size[0]; + outputs[0]->attr.size[2] = self->nn_param.resize.size[1]; + } + } + if (layout == VSI_NN_RESIZE_LAYOUT_NCHW) + { + outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; + outputs[0]->attr.size[3] = inputs[0]->attr.size[3]; + } + else + { + outputs[0]->attr.size[0] = inputs[0]->attr.size[0]; + outputs[0]->attr.size[3] = inputs[0]->attr.size[3]; } - outputs[0]->attr.size[2] = inputs[0]->attr.size[2]; - outputs[0]->attr.size[3] = inputs[0]->attr.size[3]; } return TRUE; } /* op_setup() */ @@ -138,12 +172,6 @@ static vsi_status op_deinit vsi_nn_node_t * self ) { - if (self->nn_param.resize_internal.lcl_data_ptr) - { - free(self->nn_param.resize_internal.lcl_data_ptr); - self->nn_param.resize_internal.lcl_data_ptr = NULL; - } - vsi_nn_op_common_deinit(self); return VSI_SUCCESS; @@ -157,13 +185,8 @@ static vsi_status op_init { vsi_status status = VSI_SUCCESS; - self->nn_param.resize_internal.lcl_data_ptr = \ - (vsi_nn_resize_in_lcl_data *)malloc(sizeof(vsi_nn_resize_in_lcl_data)); - if (NULL == self->nn_param.resize_internal.lcl_data_ptr) - { - return VX_ERROR_NO_MEMORY; - } - memset(self->nn_param.resize_internal.lcl_data_ptr, 0, sizeof(vsi_nn_resize_in_lcl_data)); + self->nn_param.resize_internal.layout = VSI_NN_RESIZE_LAYOUT_NCHW; + return status; } /* op_init() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_roi_pool.c b/src/tim/vx/internal/src/ops/vsi_nn_op_roi_pool.c index 7c39210..78c3886 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_roi_pool.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_roi_pool.c @@ -31,6 +31,7 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_log.h" +#include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_constraint_check.h" @@ -170,19 +171,7 @@ static vsi_status op_optimize { size[2] = inputs[1]->attr.size[0]; size[3] = inputs[1]->attr.size[1]; -#ifdef VSI_40BIT_VA_SUPPORT - rois_tmp = vxReshapeTensor(inputs[1]->t, size, dim); -#else - { - vsi_size_t i; - int32_t size_32bit[VSI_NN_MAX_DIM_NUM]; - for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) - { - size_32bit[i] = (int32_t)size[i]; - } - rois_tmp = vxReshapeTensor(inputs[1]->t, size_32bit, dim); - } -#endif + rois_tmp = vsi_nn_safe_reshape_tensor(inputs[1]->t, (void*)size, (vsi_size_t)dim, sizeof(size[0])); if(NULL == rois_tmp) { return VSI_FAILURE; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd_update.c 
b/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd_update.c index e2897e4..d8c0c8d 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd_update.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd_update.c @@ -110,6 +110,7 @@ static vsi_bool op_check IO_TYPE(D_I16|Q_DFP, D_I32, D_I16|Q_DFP, D_I16|Q_DFP) IO_TYPE(D_I16|Q_DFP, D_I32, D_I16|Q_DFP, D_F16) IO_TYPE(D_F16, D_I32, D_F16, D_F16) + IO_TYPE(D_F16, D_I32, D_F16, D_U8|Q_ASYM) IO_TYPE(D_BF16, D_I32, D_BF16, D_BF16) IO_TYPE(D_I32, D_I32, D_I32, D_I32) IO_TYPE(D_U32, D_I32, D_U32, D_U32) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_shufflechannel.c b/src/tim/vx/internal/src/ops/vsi_nn_op_shufflechannel.c index 4d8d7fc..ea5373b 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_shufflechannel.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_shufflechannel.c @@ -55,7 +55,7 @@ static vsi_status op_compute vx_nn_reorg_params_ext2_t param; vsi_nn_tensor_t *block_size_tensor = NULL; vsi_nn_tensor_attr_t attr; - uint8_t data = 1; + int32_t data[2] = {1, 1}; memset(¶m, 0, sizeof(vx_nn_reorg_params_ext2_t)); memset(&attr, 0, sizeof(attr)); @@ -66,9 +66,9 @@ static vsi_status op_compute attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; block_size_tensor = vsi_nn_CreateTensorFromData( self->graph, - &data, + (uint8_t *)data, &attr); - if( NULL == block_size_tensor ) + if ( NULL == block_size_tensor ) { VSILOGE("Create block_size_tensor fail.(shufflechannel)"); return VSI_FAILURE; @@ -87,7 +87,7 @@ static vsi_status op_compute sizeof(vx_nn_reorg_params_ext2_t), outputs[0]->t); - if( NULL != self->n ) + if ( NULL != self->n ) { status = VSI_SUCCESS; } @@ -257,4 +257,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_sigmoid.c b/src/tim/vx/internal/src/ops/vsi_nn_op_sigmoid.c index ff9d84e..09a735a 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_sigmoid.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_sigmoid.c @@ -32,7 +32,7 @@ #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" - +#include "kernel/vsi_nn_kernel.h" static vsi_status op_compute ( @@ -41,22 +41,16 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status; - status = VSI_FAILURE; + vsi_status status = VSI_SUCCESS; + vsi_nn_kernel_node_t n; - self->n = vxActivationLayer( - self->graph->g, - inputs[0]->t, - VX_CONVOLUTIONAL_NETWORK_ACTIVATION_LOGISTIC, - 0, - 0, - outputs[0]->t - ); - - if( NULL != self->n ) + n = vsi_nn_kernel_selector( self->graph, "sigmoid", inputs, 1, outputs, 1, NULL ); + if( n == NULL ) { - status = VSI_SUCCESS; + status = VSI_FAILURE; } + self->n = (vx_node)n; + return status; } /* op_compute() */ @@ -93,4 +87,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_softmax.c b/src/tim/vx/internal/src/ops/vsi_nn_op_softmax.c index 9d6d9d5..257f1e2 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_softmax.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_softmax.c @@ -37,27 +37,6 @@ #include "utils/vsi_nn_math.h" #include "utils/vsi_nn_constraint_check.h" -static vsi_bool _is_same_shape - ( - vsi_nn_tensor_t * inputs, - vsi_size_t *sizes, - uint32_t dims - ) -{ - uint32_t i = 0; - - if (inputs->attr.dim_num != dims) - return FALSE; - - for (i = 0; i < dims; i++) - { - if (sizes[i] != inputs->attr.size[i]) - return FALSE; - } - - return TRUE; -} - static vsi_status op_compute ( vsi_nn_node_t * self, @@ -128,88 +107,14 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { - vsi_nn_internal_node_t* curr 
= NULL; - vsi_nn_softmax_param * p; - uint32_t dim_num; - vsi_size_t sizes[VSI_NN_MAX_DIM_NUM] = {1}; - uint32_t i = 0; - int32_t axis = -1; - vsi_nn_tensor_t* new_input = NULL; - vsi_nn_tensor_t* new_output = NULL; - if (VSI_NN_OPTIMIZE_BACKWARD == direction) { return VSI_SUCCESS; } - p = &(self->nn_param.softmax); - axis = p->axis; - if (axis != VSI_NN_SOFTMAX_DEFAULT_AXIS) - { - vsi_size_t innerSize = 1; - vsi_size_t outerSize = 1; - for (i = 0; i < (uint32_t)axis; i++) - { - sizes[i] = inputs[0]->attr.size[i]; - innerSize *= inputs[0]->attr.size[i]; - } - - for (i = (uint32_t)(axis + 1); i < inputs[0]->attr.dim_num; i++) - { - outerSize *= inputs[0]->attr.size[i]; - } - - if (axis == 1) - { - if (sizes[0] == 1) - { - sizes[0] = inputs[0]->attr.size[axis]; - sizes[1] = outerSize; - - dim_num = 2; - } - else - { - sizes[axis] = 1; - sizes[axis + 1] = inputs[0]->attr.size[axis]; - sizes[axis + 2] = outerSize; - - dim_num = 4; - } - } - else if (axis >= 3) - { - sizes[0] = innerSize; - sizes[1] = 1; - sizes[2] = inputs[0]->attr.size[axis]; - sizes[3] = outerSize; - - dim_num = vsi_nn_min(4, inputs[0]->attr.dim_num); - } - else - { - sizes[axis] = inputs[0]->attr.size[axis]; - sizes[axis + 1] = outerSize; - - dim_num = vsi_nn_min((uint32_t)(axis + 2), inputs[0]->attr.dim_num); - } - } - - if (axis != VSI_NN_SOFTMAX_DEFAULT_AXIS && _is_same_shape(inputs[0], sizes, dim_num) == FALSE) - { - new_input = vsi_nn_reshape_tensor(self->graph, inputs[0], sizes, dim_num); - new_output = vsi_nn_reshape_tensor(self->graph, outputs[0], sizes, dim_num); - curr = ((vsi_nn_internal_node_wksp_t *)((self)->internal_node_wksp))->nodes; - curr->inputs[0] = new_input; - curr->outputs[0] = new_output; - p->local.reshaped_input = new_input; - p->local.reshaped_output = new_output; - } - return vsi_nn_internal_optimize_node( self, direction ); } /* op_optimize() */ - static vsi_status op_deinit ( vsi_nn_node_t * self @@ -237,10 +142,6 @@ static vsi_status op_init { vsi_status status = VSI_SUCCESS; - if (vsi_nn_compareVersion(self->graph, 1, 1, 7) == -1) - { - self->nn_param.softmax.axis = VSI_NN_SOFTMAX_DEFAULT_AXIS; - } if (self->nn_param.softmax.beta == 0.f) { self->nn_param.softmax.beta = 1.f; @@ -262,6 +163,18 @@ static vsi_bool op_setup return FALSE; } + if (vsi_nn_compareVersion(self->graph, 1, 1, 7) == -1) + { + if (inputs[0]->attr.dim_num < 3) + { + self->nn_param.softmax.axis = 0; + } + else + { + self->nn_param.softmax.axis = 2; + } + } + if (self->nn_param.softmax.axis < 0) self->nn_param.softmax.axis += (int32_t)inputs[0]->attr.dim_num; @@ -276,6 +189,7 @@ static vsi_bool op_setup curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; curr->node->nn_param.softmax_internal.beta = self->nn_param.softmax.beta; + curr->node->nn_param.softmax_internal.axis = self->nn_param.softmax.axis; vsi_nn_internal_setup_node(self, curr); return TRUE; @@ -300,4 +214,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c index ce0b2e4..7d0824d 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c @@ -180,21 +180,23 @@ static vsi_status op_optimize in_view_tensor = NULL; out_view_tensor = NULL; status = VSI_SUCCESS; - if(direction == VSI_NN_OPTIMIZE_BACKWARD) + if (direction == VSI_NN_OPTIMIZE_BACKWARD) { return status; } - if(_need_split_softmax(self, inputs) == FALSE) + if ( _need_split_softmax(self, inputs) == FALSE || 
+ self->nn_param.softmax_internal.axis != 0 || + self->graph->ctx->config.support_stream_processor ) { return status; } VSILOGD("Optimize %s, uid %u", vsi_nn_OpGetName(self->op), self->uid); - if( NULL == inputs[0]->t ) + if ( NULL == inputs[0]->t ) { vsi_nn_TensorReinit( self->graph, inputs[0] ); } - if( NULL == outputs[0]->t ) + if ( NULL == outputs[0]->t ) { vsi_nn_TensorReinit( self->graph, outputs[0] ); } @@ -208,11 +210,11 @@ static vsi_status op_optimize end[2] = inputs[0]->attr.size[2]; end[3] = inputs[0]->attr.size[3]; end[axis] = 0; - while(end[axis] < batch_size) + while (end[axis] < batch_size) { start[axis] = end[axis]; end[axis] += MAX_SOFTMAX_BATCH; - if(end[axis] > inputs[0]->attr.size[axis]) + if (end[axis] > inputs[0]->attr.size[axis]) { end[axis] = inputs[0]->attr.size[axis]; } @@ -224,14 +226,14 @@ static vsi_status op_optimize break; } out_view_tensor = vsi_nn_CreateViewTensor(self->graph, start, end, outputs[0]); - if(NULL == out_view_tensor) + if (NULL == out_view_tensor) { VSILOGE( "Create outputs view tensor fail."); break; } status = _create_split_softmax(self, in_view_tensor, out_view_tensor); - if(VSI_SUCCESS != status) + if (VSI_SUCCESS != status) { VSILOGE( "Create split softmax data struct fail."); break; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_square.c b/src/tim/vx/internal/src/ops/vsi_nn_op_square.c index 1e0144c..86d46dd 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_square.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_square.c @@ -31,9 +31,8 @@ #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" -#include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" - +#include "utils/vsi_nn_util.h" static vsi_status op_compute ( @@ -42,14 +41,13 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status = VSI_FAILURE; + vsi_status status = VSI_SUCCESS; + vsi_nn_kernel_node_t n; - self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "square", - inputs, 1, outputs, 1, NULL ); - - if( self->n ) + n = vsi_nn_kernel_selector( self->graph, "square", inputs, 1, outputs, 1, NULL ); + if ( n == NULL ) { - status = VSI_SUCCESS; + status = VSI_FAILURE; } return status; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_squeeze.c b/src/tim/vx/internal/src/ops/vsi_nn_op_squeeze.c index fb4bcf7..250f4f3 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_squeeze.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_squeeze.c @@ -143,9 +143,9 @@ static vsi_bool op_setup } vsi_nn_internal_init_node_wksp( self ); - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE, 0, 0 ); - curr->node->nn_param.reshape.size = outputs[0]->attr.size; - curr->node->nn_param.reshape.dim_num = outputs[0]->attr.dim_num; + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); + curr->node->nn_param.reshape2.size = outputs[0]->attr.size; + curr->node->nn_param.reshape2.dim_num = outputs[0]->attr.dim_num; curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; vsi_nn_internal_setup_node( self, curr ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c b/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c index f743a52..c0a0562 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c @@ -255,11 +255,8 @@ static vsi_status copy_tensor_to_view data->src_tensor = src_tensor; if (dst_in->t) { -#ifdef VSI_40BIT_VA_SUPPORT - data->dst_tensor = vxReshapeTensor(dst_in->t, dst_in->attr.size, dst_in->attr.dim_num); -#else - 
data->dst_tensor = vxReshapeTensor(dst_in->t, (int32_t*)dst_in->attr.size, dst_in->attr.dim_num); -#endif + data->dst_tensor = vsi_nn_safe_reshape_tensor(dst_in->t, (void*)dst_in->attr.size, + (vsi_size_t)dst_in->attr.dim_num, sizeof(dst_in->attr.size[0])); } data->is_dataconvert_op = TRUE; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_tanh.c b/src/tim/vx/internal/src/ops/vsi_nn_op_tanh.c index 850a2d1..a953651 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_tanh.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_tanh.c @@ -32,7 +32,7 @@ #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" - +#include "kernel/vsi_nn_kernel.h" static vsi_status op_compute ( @@ -41,22 +41,23 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status; - status = VSI_FAILURE; + vsi_status status = VSI_SUCCESS; + vsi_nn_kernel_param_t * param; + vsi_nn_kernel_node_t n; + param = vsi_nn_kernel_param_create(); - self->n = vxActivationLayer( - self->graph->g, - inputs[0]->t, - VX_CONVOLUTIONAL_NETWORK_ACTIVATION_HYPERBOLIC_TAN, - self->nn_param.tanh.scale_a, - self->nn_param.tanh.scale_b, - outputs[0]->t - ); + vsi_nn_kernel_param_add_float32( param, "scale_a", self->nn_param.tanh.scale_a ); + vsi_nn_kernel_param_add_float32( param, "scale_b", self->nn_param.tanh.scale_b ); - if( NULL != self->n ) + n = vsi_nn_kernel_selector( self->graph, "tanh", inputs, 1, outputs, 1, param ); + if( n == NULL ) { - status = VSI_SUCCESS; + vsi_nn_kernel_param_release( ¶m ); + status = VSI_FAILURE; } + self->n = (vx_node)n; + vsi_nn_kernel_param_release( ¶m ); + return status; } /* op_compute() */ @@ -93,4 +94,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_unstack.c b/src/tim/vx/internal/src/ops/vsi_nn_op_unstack.c index f62ac51..676326b 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_unstack.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_unstack.c @@ -168,15 +168,15 @@ static vsi_bool op_setup vsi_nn_internal_init_tensor_attr(&attr, &inputs[0]->attr.dtype, use_virtual_tensor); input_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE, 0, 0 ); + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); reshape_input_size = (vsi_size_t*)vsi_nn_internal_new_node_param(curr, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); reshape_input_size[0] = block_size; reshape_input_size[1] = tensor_num; reshape_input_size[2] = block_num; - curr->node->nn_param.reshape.size = reshape_input_size; - curr->node->nn_param.reshape.dim_num = 3; + curr->node->nn_param.reshape2.size = reshape_input_size; + curr->node->nn_param.reshape2.dim_num = 3; curr->inputs[0] = inputs[0]; curr->outputs[0] = input_tensor->t; vsi_nn_internal_setup_node( self, curr ); @@ -208,9 +208,9 @@ static vsi_bool op_setup memcpy(output_size, outputs[i]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE, 0, 0 ); - curr->node->nn_param.reshape.size = output_size; - curr->node->nn_param.reshape.dim_num = outputs[i]->attr.dim_num; + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); + curr->node->nn_param.reshape2.size = output_size; + curr->node->nn_param.reshape2.dim_num = outputs[i]->attr.dim_num; curr->inputs[0] = output_tensors[i]->t; curr->outputs[0] = outputs[i]; vsi_nn_internal_setup_node( self, curr ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_variable.c b/src/tim/vx/internal/src/ops/vsi_nn_op_variable.c 
index 8879471..d213bb9 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_variable.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_variable.c @@ -101,14 +101,11 @@ static vsi_status op_optimize vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype)) { VSILOGD("Optimize %s, uid %u", vsi_nn_OpGetName(self->op), self->uid); -#ifdef VSI_40BIT_VA_SUPPORT - outputs[0]->t = vxReshapeTensor(inputs[0]->t, outputs[0]->attr.size, outputs[0]->attr.dim_num); -#else - outputs[0]->t = vxReshapeTensor(inputs[0]->t, (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num); -#endif + outputs[0]->t = vsi_nn_safe_reshape_tensor(inputs[0]->t, (void*)outputs[0]->attr.size, + (vsi_size_t)outputs[0]->attr.dim_num, sizeof(outputs[0]->attr.size[0])); if( NULL == outputs[0]->t ) { - VSILOGE("Call vxReshapeTensor fail"); + VSILOGE("Call vsi_nn_safe_reshape_tensor fail"); free(local); local = NULL; return VSI_FAILURE; diff --git a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c index 2e043b6..9f8ca77 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c @@ -111,7 +111,7 @@ static void _try_pack_tensor_data cnt = fwrite( data, (size_t)bytes, 1, s_dfile_hndl ); if( cnt != 1 ) { - VSILOGW( "Write tensor bytes(%"VSI_SIZE_T_SPECIFIER"/%d)", (vsi_size_t)cnt, 1 ); + VSILOGW( "Write tensor bytes(%"SIZE_T_SPECIFIER"/%d)", cnt, 1 ); } if( cnt > 0 ) { @@ -435,6 +435,8 @@ static _op_param_gen_t s_op_gen[] = /* GRU */ NULL, /* GRUCELL */ NULL, /* GRUCELL_ACTIVATION */ NULL, + /* RESHAPE2 */ NULL, + /* CONV3D */ NULL, }; _compiler_assert( _cnt_of_array(s_op_gen) == VSI_NN_OP_NUM, vsi_nn_code_generator_c ); diff --git a/src/tim/vx/internal/src/utils/vsi_nn_dtype.c b/src/tim/vx/internal/src/utils/vsi_nn_dtype.c index 384981b..acca854 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_dtype.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_dtype.c @@ -156,6 +156,8 @@ static inline void _convert_float_to_bfloat16 return TRUE; \ } +DEF_DTYPE_CONVERT_QUANTIZE( asymmi4, int8_t, vsi_rtne, -8, 7 ) +DEF_DTYPE_CONVERT_QUANTIZE( asymm4, uint8_t, vsi_rtne, 0, 0xF ) DEF_DTYPE_CONVERT_QUANTIZE( symm8, int8_t, vsi_rtne, SCHAR_MIN, SCHAR_MAX ) DEF_DTYPE_CONVERT_QUANTIZE( symm16, int16_t, vsi_rtne, SHRT_MIN, SHRT_MAX ) DEF_DTYPE_CONVERT_QUANTIZE( symm32, int32_t, vsi_rtne, INT_MIN, INT_MAX ) @@ -256,6 +258,12 @@ vsi_bool vsi_nn_dtype_convert_float_to_quantize_asymm { switch( dtype ) { + case I4: + return vsi_nn_dtype_convert_float_to_quantize_asymmi4( + buffer, size, scale, zero_point, (int8_t*)out_buffer ); + case U4: + return vsi_nn_dtype_convert_float_to_quantize_asymm4( + buffer, size, scale, zero_point, (uint8_t*)out_buffer ); case U8: return vsi_nn_dtype_convert_float_to_quantize_asymm8( buffer, size, scale, zero_point, (uint8_t*)out_buffer ); @@ -396,6 +404,12 @@ vsi_bool vsi_nn_dtype_convert_quantize_asymm_to_float { switch( dtype ) { + case I4: + return vsi_nn_dtype_convert_quantize_asymmi4_to_float( + (const int8_t *)buffer, size, scale, zero_point, out_buffer ); + case U4: + return vsi_nn_dtype_convert_quantize_asymm4_to_float( + (const uint8_t *)buffer, size, scale, zero_point, out_buffer ); case U8: return vsi_nn_dtype_convert_quantize_asymm8_to_float( (const uint8_t *)buffer, size, scale, zero_point, out_buffer ); @@ -481,4 +495,3 @@ vsi_bool vsi_nn_dtype_convert_quantize_symm_perchannel_to_float } return TRUE; } /* vsi_nn_dtype_convert_quantize_symm_perchannel_to_float() */ - diff --git 
a/src/tim/vx/internal/src/utils/vsi_nn_dtype_util.c b/src/tim/vx/internal/src/utils/vsi_nn_dtype_util.c index 6144845..3c45846 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_dtype_util.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_dtype_util.c @@ -329,6 +329,23 @@ uint32_t vsi_nn_TypeGetBytes return type_get_bytes( type ); } /* vsi_nn_TypeGetBytes() */ +uint32_t vsi_nn_TypeGetBytesExt + ( + const vsi_nn_type_e type + ) +{ + uint32_t bits_num = 0; + bits_num = vsi_nn_TypeGetBits(type); + if(bits_num < BITS_PER_BYTE) + { + return 1; + } + else + { + return bits_num / BITS_PER_BYTE; + } +} + /* * Deprecated: use vsi_nn_TypeGetBytes() insteatd. */ @@ -340,6 +357,14 @@ uint32_t vsi_nn_GetTypeBytes return type_get_bytes( type ); } /* vsi_nn_GetTypeBytes() */ +uint32_t vsi_nn_TypeGetBits + ( + const vsi_nn_type_e type + ) +{ + return type_get_bits(type); +} /* vsi_nn_GetTypeBits() */ + vsi_bool vsi_nn_QuantCheck ( vsi_nn_tensor_t *input, @@ -386,6 +411,7 @@ vsi_bool vsi_nn_QuantCheck bias->attr.dtype.fl); } break; + case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: if (weight->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC) { @@ -437,7 +463,8 @@ vsi_bool vsi_nn_DtypeCompare return FALSE; } } - else if(dtype0->qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) + else if( dtype0->qnt_type == VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC || + dtype0->qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC ) { const float diff = (float)1e-5; if(dtype0->zero_point != dtype1->zero_point) diff --git a/src/tim/vx/internal/src/utils/vsi_nn_util.c b/src/tim/vx/internal/src/utils/vsi_nn_util.c index 4b6fded..b05fdab 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_util.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_util.c @@ -188,7 +188,6 @@ vsi_size_t vsi_nn_GetStrideSize vsi_size_t * stride ) { - if( NULL == attr || NULL == stride ) { return 0; @@ -207,20 +206,45 @@ vsi_size_t vsi_nn_GetStrideSizeBySize { vsi_size_t total_bytes; vsi_size_t i; + vsi_size_t type_bits; if( NULL == size || NULL == stride ) { return 0; } - - stride[0] = vsi_nn_GetTypeBytes( type ); + type_bits = vsi_nn_TypeGetBits( type); + stride[0] = type_bits / BITS_PER_BYTE; total_bytes = stride[0]; - for( i = 1; i < dim_num; i ++ ) + if( type_bits < BITS_PER_BYTE ) { - stride[i] = size[i - 1] * stride[i - 1]; - total_bytes *= size[i]; + total_bytes = 1; + if( size[0] % (BITS_PER_BYTE / type_bits) == 0 ) + { + stride[1] = size[0] * type_bits / BITS_PER_BYTE; + } + else + { + stride[1] = size[0] * type_bits / BITS_PER_BYTE + 1; + } + + total_bytes *= stride[1]; + for(i = 2; i < dim_num; i++) + { + stride[i] = size[i-1] * stride[i-1]; + total_bytes *= size[i]; + } + total_bytes *= size[1]; } - total_bytes *= size[0]; + else + { + for( i = 1; i < dim_num; i ++ ) + { + stride[i] = size[i - 1] * stride[i - 1]; + total_bytes *= size[i]; + } + total_bytes *= size[0]; + } + for( i = dim_num; i < VSI_NN_MAX_DIM_NUM; i ++ ) { stride[i] = total_bytes; @@ -254,6 +278,8 @@ float vsi_nn_DataAsFloat32 case VSI_NN_TYPE_BOOL8: val = (float)((int8_t*)data)[0]; break; + case VSI_NN_TYPE_INT4: + case VSI_NN_TYPE_INT8: val = (float)((int8_t*)data)[0]; break; @@ -327,7 +353,6 @@ void vsi_nn_UpdateTensorDims } } /* vsi_nn_UpdateTensorDims() */ - vsi_size_t vsi_nn_ComputeFilterSize ( vsi_size_t i_size, @@ -380,6 +405,26 @@ vsi_size_t vsi_nn_compute_filter_shape } } /* vsi_nn_compute_filter_shape() */ +void vsi_nn_compute_padding_per_axis + ( + vsi_size_t in_shape, + vsi_size_t ksize, + uint32_t stride, + uint32_t dilation, + vsi_nn_pad_e 
pad_type, + vsi_size_t out_pad[2] + ) +{ + vsi_size_t out_size; + vsi_size_t total_pads; + if(dilation == 0) dilation = 1; + out_size = vsi_nn_compute_filter_shape(pad_type, in_shape, ksize, stride, dilation); + total_pads = _compute_padding(in_shape, ksize, stride, dilation, out_size); + + out_pad[0] = total_pads / 2; + out_pad[1] = total_pads - out_pad[0]; +} + void vsi_nn_compute_padding ( vsi_size_t * in_shape, @@ -390,8 +435,6 @@ void vsi_nn_compute_padding vsi_size_t * out_pad ) { - vsi_size_t out_w, out_h; - vsi_size_t pad_w, pad_h; uint32_t dilation_w, dilation_h; if (NULL == in_shape || NULL == ksize || NULL == stride || NULL == out_pad) @@ -413,16 +456,48 @@ void vsi_nn_compute_padding dilation_h = dilation[1]; } - out_w = vsi_nn_compute_filter_shape(pad_type, in_shape[0], ksize[0], stride[0], dilation_w); - out_h = vsi_nn_compute_filter_shape(pad_type, in_shape[1], ksize[1], stride[1], dilation_h); - pad_w = _compute_padding(in_shape[0], ksize[0], stride[0], dilation_w, out_w); - pad_h = _compute_padding(in_shape[1], ksize[1], stride[1], dilation_h, out_h); - out_pad[0] = pad_w / 2; - out_pad[1] = pad_w - out_pad[0]; - out_pad[2] = pad_h / 2; - out_pad[3] = pad_h - out_pad[2]; + vsi_nn_compute_padding_per_axis(in_shape[0], ksize[0], stride[0], dilation_w, pad_type, out_pad); + vsi_nn_compute_padding_per_axis(in_shape[1], ksize[1], stride[1], dilation_h, pad_type, out_pad + 2); } /* vsi_nn_compute_padding() */ +void vsi_nn_compute_padding_3d + ( + const vsi_size_t in_shape[3], + const vsi_size_t ksize[3], + const uint32_t stride[3], + const uint32_t dilation[3], + const vsi_nn_pad_e pad_type, + vsi_size_t out_pad[6] + ) +{ + uint32_t dilation_w, dilation_h, dilation_d; + if (NULL == in_shape || NULL == ksize + || NULL == stride || NULL == out_pad) + { + return; + } + if (pad_type == VSI_NN_PAD_AUTO) + { + return; + } + if (NULL == dilation || (dilation[0] == 0 && dilation[1] == 0 && dilation[2] == 0)) + { + dilation_w = 1; + dilation_h = 1; + dilation_d = 1; + } + else + { + dilation_w = dilation[0]; + dilation_h = dilation[1]; + dilation_d = dilation[2]; + } + + vsi_nn_compute_padding_per_axis(in_shape[0], ksize[0], stride[0], dilation_w, pad_type, out_pad); + vsi_nn_compute_padding_per_axis(in_shape[1], ksize[1], stride[1], dilation_h, pad_type, out_pad + 2); + vsi_nn_compute_padding_per_axis(in_shape[2], ksize[2], stride[2], dilation_d, pad_type, out_pad + 4); +} + void vsi_nn_ComputePadWithPadType ( vsi_size_t * in_shape, @@ -792,10 +867,12 @@ void vsi_nn_FormatToString { switch(tensor->attr.dtype.vx_type) { + case VSI_NN_TYPE_INT4:strncpy(buf, "i4 ", buf_sz);break; case VSI_NN_TYPE_INT8:strncpy(buf, "i8 ", buf_sz);break; case VSI_NN_TYPE_INT16:strncpy(buf, "i16", buf_sz);break; case VSI_NN_TYPE_INT32:strncpy(buf, "i32", buf_sz);break; case VSI_NN_TYPE_INT64:strncpy(buf, "i64", buf_sz);break; + case VSI_NN_TYPE_UINT4:strncpy(buf, "u4 ", buf_sz);break; case VSI_NN_TYPE_UINT8:strncpy(buf, "u8 ", buf_sz);break; case VSI_NN_TYPE_UINT16:strncpy(buf, "u16", buf_sz);break; case VSI_NN_TYPE_UINT32:strncpy(buf, "u32", buf_sz);break; @@ -1003,7 +1080,7 @@ vsi_bool vsi_nn_is_same_quant_type( result = TRUE; } break; - + case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: if (src->attr.dtype.scale == dst->attr.dtype.scale && src->attr.dtype.zero_point == dst->attr.dtype.zero_point) @@ -1050,3 +1127,220 @@ vsi_bool vsi_nn_is_same_type { return (vsi_nn_is_same_data_type(src, dst) && vsi_nn_is_same_quant_type(src, dst)); } + +vsi_bool vsi_nn_is_broadcast_operaton + 
( + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t * output + ) +{ + vsi_size_t out_rank = output->attr.dim_num; + vsi_size_t i = 0; + + for (i = 0; i < out_rank; i++) + { + size_t j = 0; + vsi_size_t dst_size = output->attr.size[i]; + + for (j = 0; j < input_num; j++) + { + vsi_size_t src_size = i < inputs[j]->attr.dim_num ? inputs[j]->attr.size[i] : 1; + + if (dst_size != src_size) + { + return TRUE; + } + } + } + return FALSE; +} + +float vsi_nn_get_tensor_scale + ( + vsi_nn_tensor_t * tensor + ) +{ + float scale = 1.0f; + + switch (tensor->attr.dtype.qnt_type) + { + case VSI_NN_QNT_TYPE_DFP: + { + int8_t fl = tensor->attr.dtype.fl; + if (fl >= 0) + { + scale = 1.0f / ( (float) ( (int64_t)1 << fl )); + } + else + { + scale = (float) ( (int64_t)1 << -fl ); + } + } + break; + case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: + case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: + scale = tensor->attr.dtype.scale; + break; + default: + break; + } + + return scale; +} + +int32_t vsi_nn_get_tensor_zero_point + ( + vsi_nn_tensor_t * tensor + ) +{ + int32_t zero_point = 0; + + switch (tensor->attr.dtype.qnt_type) + { + case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: + case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: + zero_point = tensor->attr.dtype.zero_point; + break; + default: + break; + } + + return zero_point; +} + +void vsi_nn_get_tensor_clamp_min_max + ( + vsi_nn_tensor_t * input, + float *clampMin, + float *clampMax + ) +{ + float zero_point = (float)vsi_nn_get_tensor_zero_point(input); + vsi_nn_type_e vx_type = input->attr.dtype.vx_type; + + if (vx_type == VSI_NN_TYPE_UINT8) + { + *clampMin = - zero_point; + *clampMax = 255 - zero_point; + } + else if (vx_type == VSI_NN_TYPE_INT8) + { + *clampMin = -128 - zero_point; + *clampMax = 127 - zero_point; + } + else if (vx_type == VSI_NN_TYPE_INT16) + { + *clampMin = -32768 - zero_point; + *clampMax = 32767 - zero_point; + } + else if (vx_type == VSI_NN_TYPE_UINT16) + { + *clampMin = - zero_point; + *clampMax = 65535 - zero_point; + } + else + { + uint32_t f32_min = 0xff800000; + uint32_t f32_max = 0x7f800000; + + *clampMin = *(float*)&f32_min; + *clampMax = *(float*)&f32_max; + } +} + +vsi_status vsi_nn_Pack4bitData + ( + vsi_nn_tensor_t * tensor, + uint8_t * src, + uint8_t * dest + ) +{ + vsi_status status; + uint32_t i = 0, j = 0; + uint8_t high = 0, low = 0; + vsi_size_t src_size; + + status = VSI_SUCCESS; + src_size = vsi_nn_GetElementNum( tensor ); + for( i = 0; i < src_size; i++ ) + { + if( (i+1) % tensor->attr.size[0] == 0) + { + high = 0; + low = src[i]; + } + else + { + high = src[i+1]; + low = src[i]; + i++; + } + dest[j] = (high << 4) | (low & 0xF); + j++; + } + return status; +} /* vsi_nn_Pack4bitData() */ + +vsi_status vsi_nn_Unpack4bitData + ( + vsi_nn_tensor_t * tensor, + uint8_t * src, + uint8_t * dest, + vsi_nn_type_e type + ) +{ + vsi_status status; + uint32_t i = 0, j = 0; + uint8_t high = 0, low = 0; + vsi_size_t stride[VSI_NN_MAX_DIM_NUM] = {0}; + vsi_size_t src_size; + + status = VSI_SUCCESS; + src_size = vsi_nn_GetStrideSize(&tensor->attr, stride); + for( i = 0 ; i < src_size; i++) + { + high = src[i] >> 4; + low = src[i] & 0x0F; + if( type == VSI_NN_TYPE_INT4 ) + { + if( high > 7) + { + high = high | 0xF0; + } + if( low > 7) + { + low = low | 0xF0; + } + } + if( tensor->attr.size[0] % stride[1] == 0 ) + { + if( tensor->attr.size[0] == 1 ) + { + dest[j] = low; + j++; + } + else + { + dest[j] = low; + dest[j+1] = high; + j += 2; + } + } + else + { + if( (i+1) % stride[1] == 0 ) + { + dest[j] = low; + j++; + } + else + { + dest[j] = low; + 
dest[j+1] = high; + j += 2; + } + } + } + return status; +} /* vsi_nn_Unpack4bitData() */ diff --git a/src/tim/vx/internal/src/vsi_nn_context.c b/src/tim/vx/internal/src/vsi_nn_context.c index eb7f494..f453b32 100644 --- a/src/tim/vx/internal/src/vsi_nn_context.c +++ b/src/tim/vx/internal/src/vsi_nn_context.c @@ -34,7 +34,9 @@ static vsi_status query_hardware_caps { vsi_status status = VSI_FAILURE; vx_hardware_caps_params_t param; - +#if VX_STREAM_PROCESSOR_SUPPORT + vx_hardware_caps_params_ext2_t paramExt2; +#endif #if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT vx_hardware_caps_params_ext_t paramExt; @@ -51,9 +53,16 @@ static vsi_status query_hardware_caps #if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT context->config.subGroupSize = paramExt.subGroupSize; -#if VX_VA40_EXT_SUPPORT +#ifdef VSI_40BIT_VA_SUPPORT context->config.use_40bits_va = paramExt.supportVA40; #endif +#if VX_STREAM_PROCESSOR_SUPPORT + memset(&paramExt2, 0, sizeof(vx_hardware_caps_params_ext2_t)); + status = vxQueryHardwareCaps(context->c, (vx_hardware_caps_params_t*)(&paramExt2), + sizeof(vx_hardware_caps_params_ext2_t)); + context->config.support_stream_processor = paramExt.supportStreamProcessor; + context->config.sp_exec_count = paramExt2.streamProcessorExecCount; +#endif #endif diff --git a/src/tim/vx/internal/src/vsi_nn_graph_optimization.c b/src/tim/vx/internal/src/vsi_nn_graph_optimization.c index c1551a6..fb17d8b 100644 --- a/src/tim/vx/internal/src/vsi_nn_graph_optimization.c +++ b/src/tim/vx/internal/src/vsi_nn_graph_optimization.c @@ -521,10 +521,27 @@ static vx_tensor _create_const_raw_tensor vx_tensor_create_params_t params; float * scales = NULL; int32_t * zeroPoints = NULL; + vx_size size_vxsize[VSI_NN_MAX_DIM_NUM] = {0}; + vx_uint32 size_u32[VSI_NN_MAX_DIM_NUM] = {0}; + size_t i = 0; memset( &params, 0, sizeof( vx_tensor_create_params_t ) ); params.num_of_dims = attr.dim_num; - params.sizes = attr.size; + for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + size_vxsize[i] = -1 == attr.size[i] ? -1 : (vx_size)attr.size[i]; + } + for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + size_u32[i] = -1 == attr.size[i] ? -1 : (vx_uint32)attr.size[i]; + } +#ifdef VSI_40BIT_VA_SUPPORT + params.sizes = size_vxsize; + (void)size_u32; +#else + params.sizes = size_u32; + (void)size_vxsize; +#endif params.data_format = (vsi_enum)attr.dtype.vx_type; params.quant_format = (vsi_enum)attr.dtype.qnt_type; switch( attr.dtype.qnt_type ) @@ -593,20 +610,31 @@ static vx_tensor _create_const_raw_tensor if( data ) { #ifdef VSI_40BIT_VA_SUPPORT - addr = vxCreateTensorAddressing(graph->ctx->c, - attr.size, stride_size, (vsi_size_t)attr.dim_num); + { + vx_size size[_cnt_of_array(attr.size)] = {0}; + vx_size stride_size_vxsize[_cnt_of_array(stride_size)] = {0}; + for(i = 0; i < _cnt_of_array(attr.size); i++) + { + size[i] = -1 == attr.size[i] ? -1 : (vx_size)attr.size[i]; + } + for(i = 0; i < _cnt_of_array(stride_size); i++) + { + stride_size_vxsize[i] = -1 == stride_size[i] ? -1 : (vx_size)stride_size[i]; + } + addr = vxCreateTensorAddressing(graph->ctx->c, + size, stride_size_vxsize, (vx_size)attr.dim_num); + } #else { - vsi_size_t i; uint32_t size_32bit[_cnt_of_array(attr.size)] = {0}; uint32_t stride_size_32bit[_cnt_of_array(stride_size)] = {0}; for(i = 0; i < _cnt_of_array(attr.size); i++) { - size_32bit[i] = (uint32_t)attr.size[i]; + size_32bit[i] = -1 == attr.size[i] ? -1 : (uint32_t)attr.size[i]; } for(i = 0; i < _cnt_of_array(stride_size); i++) { - stride_size_32bit[i] = (uint32_t)stride_size[i]; + stride_size_32bit[i] = -1 == stride_size[i] ?
-1 : (uint32_t)stride_size[i]; } addr = vxCreateTensorAddressing(graph->ctx->c, size_32bit, stride_size_32bit, (vx_uint8)attr.dim_num); diff --git a/src/tim/vx/internal/src/vsi_nn_internal_node.c b/src/tim/vx/internal/src/vsi_nn_internal_node.c index 50b9d62..4962dbc 100644 --- a/src/tim/vx/internal/src/vsi_nn_internal_node.c +++ b/src/tim/vx/internal/src/vsi_nn_internal_node.c @@ -213,6 +213,7 @@ vsi_nn_internal_tensor_t* vsi_nn_internal_create_zero_bias_tensor switch(input_attr->dtype.qnt_type) { + case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: scale = input_attr->dtype.scale; break; @@ -233,10 +234,11 @@ vsi_nn_internal_tensor_t* vsi_nn_internal_create_zero_bias_tensor switch(weight_attr->dtype.qnt_type) { + case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: attr.dtype.scale = weight_attr->dtype.scale * scale; attr.dtype.zero_point = 0; - attr.dtype.qnt_type = VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC; + attr.dtype.qnt_type = weight_attr->dtype.qnt_type; break; case VSI_NN_QNT_TYPE_DFP: diff --git a/src/tim/vx/internal/src/vsi_nn_pre_post_process.c b/src/tim/vx/internal/src/vsi_nn_pre_post_process.c index fe05e1e..06dd052 100644 --- a/src/tim/vx/internal/src/vsi_nn_pre_post_process.c +++ b/src/tim/vx/internal/src/vsi_nn_pre_post_process.c @@ -118,8 +118,7 @@ static void _set_preproc_node_rect_params ( vsi_nn_node_t* node, vsi_nn_preprocess_crop_t* crop, - vsi_nn_tensor_attr_t* attr, - vsi_nn_preprocess_source_layout_e* source_layout + vsi_nn_preprocess_image_size_t* input_size ) { if(crop != NULL) @@ -133,13 +132,8 @@ static void _set_preproc_node_rect_params { node->nn_param.pre_process.rect.left = 0; node->nn_param.pre_process.rect.top = 0; - node->nn_param.pre_process.rect.width = (uint32_t)attr->size[0]; - node->nn_param.pre_process.rect.height = (uint32_t)attr->size[1]; - if(*source_layout == VSI_NN_SOURCE_LAYOUT_NHWC) - { - node->nn_param.pre_process.rect.width = (uint32_t)attr->size[1]; - node->nn_param.pre_process.rect.height = (uint32_t)attr->size[2]; - } + node->nn_param.pre_process.rect.width = input_size->w; + node->nn_param.pre_process.rect.height = input_size->h; } } /* _set_preproc_node_rect_params() */ @@ -496,7 +490,7 @@ vsi_status vsi_nn_add_single_preproc_node status = _set_preproc_node_type(node, source_format); TEST_CHECK_STATUS(status, final); - _set_preproc_node_rect_params(node, crop, &org_norm_tensor->attr, source_layout); + _set_preproc_node_rect_params(node, crop, input_size); _set_preproc_node_norm_params(node, mean_and_scale, &org_norm_tensor->attr); if(permute != NULL) diff --git a/src/tim/vx/internal/src/vsi_nn_rnn_helper.c b/src/tim/vx/internal/src/vsi_nn_rnn_helper.c index 9ba72ad..8fa073c 100644 --- a/src/tim/vx/internal/src/vsi_nn_rnn_helper.c +++ b/src/tim/vx/internal/src/vsi_nn_rnn_helper.c @@ -84,7 +84,6 @@ vsi_bool vsi_nn_rnn_find_best_kernel_size } kernel_w = 1; } - } VSILOGD("Use kernel_h: %d, kernel_w: %d to convert FC", kernel_h, kernel_w); @@ -122,7 +121,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_process_input_for_nn_fc vsi_nn_internal_init_tensor_attr(&attr, &input->attr.dtype, use_virtual_tensor); tensor1 = vsi_nn_internal_new_tensor(self, &attr, 0.0f); - tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_RESHAPE, 0, 0 ); + tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_RESHAPE2, 0, 0 ); reshape_in_size = (vsi_size_t*)vsi_nn_internal_new_node_param(tmp_inode, 4 * sizeof(vsi_size_t)); reshape_in_size[3] = input->attr.size[1]; @@ -130,8 +129,8 @@ vsi_nn_internal_tensor_t* 
vsi_nn_rnn_process_input_for_nn_fc reshape_in_size[1] = kernel_h; reshape_in_size[0] = kernel_w; - tmp_inode->node->nn_param.reshape.size = reshape_in_size; - tmp_inode->node->nn_param.reshape.dim_num = 4; + tmp_inode->node->nn_param.reshape2.size = reshape_in_size; + tmp_inode->node->nn_param.reshape2.dim_num = 4; tmp_inode->inputs[0] = input; tmp_inode->outputs[0] = tensor1->t; vsi_nn_internal_setup_node(self, tmp_inode); @@ -231,14 +230,14 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_process_output_for_nn_fc } tensor2 = vsi_nn_internal_new_tensor(self, &attr, 0.0f); - tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_RESHAPE, 0, 0 ); + tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_RESHAPE2, 0, 0 ); reshape_in_size = (vsi_size_t *)vsi_nn_internal_new_node_param(tmp_inode, 4 * sizeof(vsi_size_t)); reshape_in_size[1] = tensor->attr.size[3]; reshape_in_size[0] = tensor->attr.size[2]; - tmp_inode->node->nn_param.reshape.size = reshape_in_size; - tmp_inode->node->nn_param.reshape.dim_num = 2; + tmp_inode->node->nn_param.reshape2.size = reshape_in_size; + tmp_inode->node->nn_param.reshape2.dim_num = 2; tmp_inode->inputs[0] = tensor; tmp_inode->outputs[0] = tensor2->t; vsi_nn_internal_setup_node(self, tmp_inode); @@ -303,14 +302,14 @@ vsi_bool vsi_nn_rnn_process_output_for_nn_fc2 tensor = tensor0->t; } - tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_RESHAPE, 0, 0 ); + tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_RESHAPE2, 0, 0 ); reshape_in_size = (vsi_size_t*)vsi_nn_internal_new_node_param(tmp_inode, 4 * sizeof(vsi_size_t)); reshape_in_size[1] = tensor->attr.size[3]; reshape_in_size[0] = tensor->attr.size[2]; - tmp_inode->node->nn_param.reshape.size = reshape_in_size; - tmp_inode->node->nn_param.reshape.dim_num = 2; + tmp_inode->node->nn_param.reshape2.size = reshape_in_size; + tmp_inode->node->nn_param.reshape2.dim_num = 2; tmp_inode->inputs[0] = tensor; tmp_inode->outputs[0] = output; vsi_nn_internal_setup_node(self, tmp_inode); @@ -694,7 +693,7 @@ void vsi_nn_rnn_data_check_aligned vsi_size_t tensor_size = vsi_nn_GetTensorSize( input[i]->attr.size, input[i]->attr.dim_num, input[i]->attr.dtype.vx_type ); - if( ofst & 0x3f ) + if( ofst & 0x3f && !self->graph->ctx->config.support_stream_processor) { vsi_nn_internal_init_tensor_attr(&attr, &input[i]->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); @@ -729,14 +728,14 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_reshape_split_output vsi_nn_internal_init_tensor_attr(&attr, &input->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE, 0, 0 ); + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); reshape_split_size = (vsi_size_t *)vsi_nn_internal_new_node_param(curr, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); reshape_split_size[0] = -1; reshape_split_size[1] = batch_size; - curr->node->nn_param.reshape.size = reshape_split_size; - curr->node->nn_param.reshape.dim_num = 2; + curr->node->nn_param.reshape2.size = reshape_split_size; + curr->node->nn_param.reshape2.dim_num = 2; curr->inputs[0] = input; curr->outputs[0] = output_tensor->t; vsi_nn_internal_setup_node( self, curr ); @@ -763,15 +762,15 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_reshape_cell_output vsi_nn_internal_init_tensor_attr(&attr, &input->attr.dtype, use_virtual_tensor); output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE, 0, 0 ); + curr 
= vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE2, 0, 0 ); reshape_grucell_output_size = (vsi_size_t*)vsi_nn_internal_new_node_param(curr, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); reshape_grucell_output_size[0] = -1; reshape_grucell_output_size[1] = batch_size; reshape_grucell_output_size[2] = 1; - curr->node->nn_param.reshape.size = reshape_grucell_output_size; - curr->node->nn_param.reshape.dim_num = 3; + curr->node->nn_param.reshape2.size = reshape_grucell_output_size; + curr->node->nn_param.reshape2.dim_num = 3; curr->inputs[0] = input; curr->outputs[0] = output_tensor->t; vsi_nn_internal_setup_node( self, curr ); @@ -918,16 +917,15 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_reshape vsi_bool use_virtual_tensor ) { - vsi_nn_internal_node_t* curr = NULL; vsi_nn_internal_tensor_t* tensor0 = NULL; vsi_size_t* reshape_in_size = NULL; - curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RESHAPE, 0, 0 ); + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RESHAPE2, 0, 0 ); reshape_in_size = (vsi_size_t*)vsi_nn_internal_new_node_param(curr, dim_num * sizeof(vsi_size_t)); memcpy(reshape_in_size, size, dim_num * sizeof(vsi_size_t)); - curr->node->nn_param.reshape.size = reshape_in_size; - curr->node->nn_param.reshape.dim_num = (uint32_t)dim_num; + curr->node->nn_param.reshape2.size = reshape_in_size; + curr->node->nn_param.reshape2.dim_num = (uint32_t)dim_num; curr->inputs[0] = input_tensor; curr->outputs[0] = output_tensor; diff --git a/src/tim/vx/internal/src/vsi_nn_tensor.c b/src/tim/vx/internal/src/vsi_nn_tensor.c index 3f662b6..e82e537 100644 --- a/src/tim/vx/internal/src/vsi_nn_tensor.c +++ b/src/tim/vx/internal/src/vsi_nn_tensor.c @@ -25,6 +25,7 @@ #include #include +#include "vsi_nn_platform.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" #include "vsi_nn_graph.h" @@ -74,11 +75,11 @@ static vsi_size_t get_tensor_elements_num { vsi_size_t num; vsi_size_t sz; - uint32_t dsize; + vsi_size_t dsize; sz = vsi_nn_GetTensorSize( shape, dim_num, type ); - dsize = vsi_nn_GetTypeBytes( type ); + dsize = vsi_nn_TypeGetBytesExt( type ); num = sz / dsize; return num; } /* get_tensor_elements_num() */ @@ -128,6 +129,14 @@ static void print_tensor tensor->attr.dtype.channel_dim, tensor->attr.dtype.scale_dim ); ext_attr[count] = 0; break; + case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_ASYMMETRIC: + count = snprintf(&ext_attr[0], + _EXT_ATTR_BUF_SZ, + "ASYM PERCHANNEL axis=%d, count=%d", + tensor->attr.dtype.channel_dim, + tensor->attr.dtype.scale_dim); + ext_attr[count] = 0; + break; #endif default: strncpy(ext_attr, "NONE", _EXT_ATTR_BUF_SZ); @@ -308,25 +317,50 @@ static vsi_bool _init_tensor vsi_bool ret; vx_tensor_create_params_t params; float * scales = NULL; + int32_t * zeroPoints = NULL; int32_t * null_zp = NULL; + vx_size size_vxsize[VSI_NN_MAX_DIM_NUM] = {0}; + vx_uint32 size_u32[VSI_NN_MAX_DIM_NUM] = {0}; + size_t i = 0; ret = TRUE; memset( &params, 0, sizeof( vx_tensor_create_params_t ) ); params.num_of_dims = tensor->attr.dim_num; - params.sizes = tensor->attr.size; + for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + size_vxsize[i] = -1 == tensor->attr.size[i] ? -1 : (vx_size)tensor->attr.size[i]; + } + for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + size_u32[i] = -1 == tensor->attr.size[i] ?
-1 : (vx_uint32)tensor->attr.size[i]; + } +#ifdef VSI_40BIT_VA_SUPPORT + params.sizes = size_vxsize; + (void)size_u32; +#else + params.sizes = size_u32; + (void)size_vxsize; +#endif params.data_format = (vsi_enum)tensor->attr.dtype.vx_type; - params.quant_format = (vsi_enum)tensor->attr.dtype.qnt_type; switch( tensor->attr.dtype.qnt_type ) { case VSI_NN_QNT_TYPE_DFP: + params.quant_format = (vsi_enum)VX_QUANT_DYNAMIC_FIXED_POINT; params.quant_data.dfp.fixed_point_pos = (uint8_t)tensor->attr.dtype.fl; break; + case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: + params.quant_format = (vsi_enum)VX_QUANT_AFFINE_SCALE; params.quant_data.affine.scale = tensor->attr.dtype.scale; params.quant_data.affine.zeroPoint = (int32_t)tensor->attr.dtype.zero_point; break; case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC: #ifdef VSI_PERCHANNEL_QUANTIZATION_SUPPORT + #ifdef VX_QUANT_AFFINE_SCALE_PER_CHANNEL + params.quant_format = (vsi_enum)VX_QUANT_AFFINE_SCALE_PER_CHANNEL; + #else + params.quant_format = (vsi_enum)VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC; + #endif // This is a hack that driver doesn't support const scales scales = (float*)malloc(sizeof(float) * tensor->attr.dtype.scale_dim); memcpy(scales, tensor->attr.dtype.scales, tensor->attr.dtype.scale_dim * sizeof(float)); @@ -345,6 +379,35 @@ static vsi_bool _init_tensor break; #else VSILOGE( "can't support qnt_type VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC." ); +#endif + case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_ASYMMETRIC: +#ifdef VSI_PERCHANNEL_QUANTIZATION_SUPPORT + #ifdef VX_QUANT_AFFINE_SCALE_PER_CHANNEL + params.quant_format = (vsi_enum)VX_QUANT_AFFINE_SCALE_PER_CHANNEL; + #else + params.quant_format = (vsi_enum)VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC; + #endif + // This is a hack that driver doesn't support const scales + scales = (float*)malloc(sizeof(float) * tensor->attr.dtype.scale_dim); + memcpy(scales, + tensor->attr.dtype.scales, + tensor->attr.dtype.scale_dim * sizeof(float)); + zeroPoints = (int32_t*)malloc(sizeof(int32_t) * tensor->attr.dtype.zero_points_dim); + memcpy(zeroPoints, + tensor->attr.dtype.zero_points, + tensor->attr.dtype.zero_points_dim * sizeof(int32_t)); + params.quant_data.affinePerChannel.channelDim = + tensor->attr.dtype.channel_dim; + params.quant_data.affinePerChannel.scaleCount = + tensor->attr.dtype.scale_dim; + params.quant_data.affinePerChannel.scales = scales; + params.quant_data.affinePerChannel.zeroPoint = zeroPoints; + params.quant_data.affinePerChannel.zeroPointCount = tensor->attr.dtype.zero_points_dim; + break; +#else + VSILOGE( + "can't support qnt_type " + "VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_ASYMMETRIC."); #endif default: break; @@ -359,6 +422,13 @@ static vsi_bool _init_tensor vxReleaseWeightsBiasesParameter( &tensor->wb ); } +#if VX_STREAM_PROCESSOR_SUPPORT + if ( TRUE == tensor->attr.is_dummy ) + { + tensor->t = vxCreateDummyTensor( graph->ctx->c, + (vsi_size_t)tensor->attr.dim_num, tensor->attr.size, (vsi_enum)tensor->attr.dtype.vx_type ); + } else +#endif if( TRUE == tensor->attr.is_created_from_handle ) { vx_tensor_addressing addr; @@ -389,6 +459,10 @@ static vsi_bool _init_tensor { free(scales); } + if( zeroPoints ) + { + free(zeroPoints); + } if(null_zp) { free(null_zp); @@ -400,19 +474,31 @@ static vsi_bool _init_tensor if( data ) { #ifdef VSI_40BIT_VA_SUPPORT - addr = vxCreateTensorAddressing(graph->ctx->c, - tensor->attr.size, stride_size, (vsi_size_t)tensor->attr.dim_num); -#else { - uint32_t i, size_32bit[_cnt_of_array(tensor->attr.size)] = {0}; - 
uint32_t stride_size_32bit[_cnt_of_array(stride_size)] = {0}; + vx_size size_vxsize[_cnt_of_array(tensor->attr.size)] = {0}; + vx_size stride_size_vxsize[_cnt_of_array(stride_size)] = {0}; for(i = 0; i < _cnt_of_array(tensor->attr.size); i++) { - size_32bit[i] = (uint32_t)tensor->attr.size[i]; + size_vxsize[i] = -1 == tensor->attr.size[i] ? -1 : (vx_size)tensor->attr.size[i]; } for(i = 0; i < _cnt_of_array(stride_size); i++) { - stride_size_32bit[i] = (uint32_t)stride_size[i]; + stride_size_vxsize[i] = -1 == stride_size[i] ? -1 : (vx_size)stride_size[i]; + } + addr = vxCreateTensorAddressing(graph->ctx->c, + size_vxsize, stride_size_vxsize, (vx_size)tensor->attr.dim_num); + } +#else + { + uint32_t size_32bit[_cnt_of_array(tensor->attr.size)] = {0}; + uint32_t stride_size_32bit[_cnt_of_array(stride_size)] = {0}; + for(i = 0; i < _cnt_of_array(tensor->attr.size); i++) + { + size_32bit[i] = -1 == tensor->attr.size[i] ? -1 : (uint32_t)tensor->attr.size[i]; + } + for(i = 0; i < _cnt_of_array(stride_size); i++) + { + stride_size_32bit[i] = -1 == stride_size[i] ? -1 : (uint32_t)stride_size[i]; } addr = vxCreateTensorAddressing(graph->ctx->c, size_32bit, stride_size_32bit, (uint8_t)tensor->attr.dim_num); @@ -481,6 +567,10 @@ static vsi_bool _init_tensor { free(scales); } + if (zeroPoints) + { + free(zeroPoints); + } if(null_zp) { free(null_zp); @@ -588,15 +678,23 @@ vsi_nn_tensor_t * vsi_nn_CreateTensorWithDefault uint8_t* data = NULL; size = vsi_nn_GetStrideSize( &t->attr, stride ); + if( stride[0] == 0 ) + { + size = vsi_nn_GetElementNum(t); + } data = (uint8_t *)malloc( size ); if( data ) { vsi_size_t i = 0, j = 0; - vsi_size_t elements = size / stride[0]; + vsi_size_t elements = 0; vsi_status status = VSI_FAILURE; + if(stride[0] != 0) + { + elements = size / stride[0]; + } status = vsi_nn_Float32ToDtype( defualt_value, &data[0], &t->attr.dtype ); - if(stride[0] == 1) + if(stride[0] == 1 || stride[0] == 0) { memset(data, data[0], size); } @@ -639,14 +737,22 @@ vsi_status vsi_nn_FillTensorWithValue uint8_t* data = NULL; size = vsi_nn_GetStrideSize( &tensor->attr, stride ); + if( stride[0] == 0) + { + size = vsi_nn_GetElementNum(tensor); + } data = (uint8_t *)malloc( size ); if( data ) { vsi_size_t i = 0, j = 0; - vsi_size_t elements = size / stride[0]; + vsi_size_t elements = 0; + if(stride[0] != 0) + { + elements = size / stride[0]; + } status = vsi_nn_Float32ToDtype( value, &data[0], &tensor->attr.dtype ); - if(stride[0] == 1) + if(stride[0] == 1 || stride[0] == 0) { memset(data, data[0], size); } @@ -826,7 +932,7 @@ float * vsi_nn_ConvertTensorToFloat32Data uint8_t *tensor_data = NULL; vsi_size_t elements; vsi_size_t i; - uint32_t stride; + vsi_size_t stride; float *data; if(NULL == graph || NULL == tensor) @@ -835,7 +941,7 @@ float * vsi_nn_ConvertTensorToFloat32Data } elements = vsi_nn_GetElementNum(tensor); - stride = vsi_nn_TypeGetBytes(tensor->attr.dtype.vx_type); + stride = vsi_nn_TypeGetBytesExt(tensor->attr.dtype.vx_type); data = NULL; data = (float *)malloc(elements * sizeof(float)); @@ -883,6 +989,7 @@ uint8_t * vsi_nn_ConvertTensorToData ) { uint8_t * data; + uint8_t * new_data; vsi_size_t buf_sz; vsi_size_t stride_size[VSI_NN_MAX_DIM_NUM]; vsi_status status; @@ -929,8 +1036,23 @@ uint8_t * vsi_nn_ConvertTensorToData data = NULL; } } - return data; - + if(tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT4 || + tensor->attr.dtype.vx_type == VSI_NN_TYPE_UINT4) + { + vsi_size_t dest_size = vsi_nn_GetElementNum(tensor); + new_data = (uint8_t*)malloc(dest_size); + status = 
vsi_nn_Unpack4bitData(tensor, data, new_data, tensor->attr.dtype.vx_type); + if(data) + { + free(data); + data = NULL; + } + return new_data; + } + else + { + return data; + } } /* vsi_nn_ConvertTensorToData() */ /* @@ -1032,6 +1154,7 @@ uint8_t * vsi_nn_ConvertRawTensorToData2 status = vxQueryTensor(tensor, VX_TENSOR_FIXED_POINT_POS, &(attr->dtype.fl), sizeof(int8_t)); break; + case VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: status = vxQueryTensor(tensor, VX_TENSOR_ZERO_POINT, &(attr->dtype.zero_point), sizeof(int32_t)); @@ -1077,7 +1200,7 @@ void vsi_nn_SaveTensorToTextByFp32 const float c_flush_th = 0.7f; uint8_t * data; uint8_t * ptr; - uint32_t type_bytes; + vsi_size_t stride; uint8_t buf[_TENSOR_TMPBUF_SZ]; FILE * fp; float write_data; @@ -1108,14 +1231,13 @@ void vsi_nn_SaveTensorToTextByFp32 return; } sz = vsi_nn_GetElementNum( tensor ); - ptr = data; - type_bytes = vsi_nn_TypeGetBytes( tensor->attr.dtype.vx_type ); + stride = vsi_nn_TypeGetBytesExt( tensor->attr.dtype.vx_type ); count = 0; for( i = 0; i < sz; i ++ ) { vsi_nn_DtypeToFloat32( ptr, &write_data, &tensor->attr.dtype ); - ptr += type_bytes; + ptr += stride; count += snprintf( (char *)&buf[count], _TENSOR_TMPBUF_SZ - count, "%f%s", write_data, seperator ); @@ -1173,7 +1295,7 @@ void vsi_nn_SaveDataToText uint8_t buf[_TENSOR_TMPBUF_SZ]; FILE * fp; float write_data; - uint32_t type_bytes; + vsi_size_t stride; vsi_size_t i; uint32_t count; @@ -1197,14 +1319,15 @@ void vsi_nn_SaveDataToText VSILOGW( "Write file %s fail. Please check...", filename ); return; } - type_bytes = vsi_nn_GetTypeBytes( type ); + stride = vsi_nn_TypeGetBytesExt( type ); count = 0; for( i = 0; i < data_size; i ++ ) { - write_data = vsi_nn_DataAsFloat32( &data[type_bytes * i], + write_data = vsi_nn_DataAsFloat32( &data[stride * i], type ); - if( type == VSI_NN_TYPE_UINT8 || type == VSI_NN_TYPE_INT8 ) + if( type == VSI_NN_TYPE_UINT8 || type == VSI_NN_TYPE_INT8 || + type == VSI_NN_TYPE_UINT4 || type == VSI_NN_TYPE_INT4 ) { count += snprintf( (char *)&buf[count], _TENSOR_TMPBUF_SZ - count, "%d%s", (int32_t)write_data, seperator ); @@ -1285,7 +1408,6 @@ vsi_nn_tensor_t * vsi_nn_CreateTensorFromData tensor = vsi_nn_CreateTensor( graph, attr ); status = vsi_nn_CopyDataToTensor( graph, tensor, data ); - if( VSI_SUCCESS != status ) { VSILOGE("Create tensor from data fail."); @@ -1326,11 +1448,31 @@ vsi_status vsi_nn_CopyDataToTensor } else { - status = vsi_nn_copy_tensor_patch(tensor->t, &tensor->attr, data, VX_WRITE_ONLY); + if( tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT4 || + tensor->attr.dtype.vx_type == VSI_NN_TYPE_UINT4 ) + { + uint8_t* new_data = NULL; + vsi_size_t dest_size = vsi_nn_GetTensorSize( tensor->attr.size, tensor->attr.dim_num, + tensor->attr.dtype.vx_type); + new_data = (uint8_t*)malloc( dest_size ); + status = vsi_nn_Pack4bitData(tensor, (uint8_t*)data, new_data); + status = vsi_nn_copy_tensor_patch( tensor->t, &tensor->attr, new_data, VX_WRITE_ONLY ); + if( new_data ) + { + free( new_data ); + new_data = NULL; + } + } + else + { + status = vsi_nn_copy_tensor_patch( tensor->t, &tensor->attr, data, VX_WRITE_ONLY ); + } } + return status; } /* vsi_nn_CopyDataToTensor() */ + vsi_status vsi_nn_FlushHandle ( const vsi_nn_tensor_t * tensor @@ -1515,18 +1657,7 @@ vsi_bool vsi_nn_ReshapeTensor } /* Create reshape tensor */ -#ifdef VSI_40BIT_VA_SUPPORT - output->t = vxReshapeTensor( input->t, new_shape, dim_num ); -#else - { - uint32_t i, new_shape_32bit[VSI_NN_MAX_DIM_NUM] = {0}; - for(i = 0; i < VSI_NN_MAX_DIM_NUM; 
i++) - { - new_shape_32bit[i] = (uint32_t)new_shape[i]; - } - output->t = vxReshapeTensor( input->t, (int32_t *)new_shape_32bit, (uint32_t)dim_num ); - } -#endif + output->t = vsi_nn_safe_reshape_tensor( input->t, (void*)new_shape, (vsi_size_t)dim_num, sizeof(new_shape[0]) ); if( NULL == output->t ) { ret = FALSE; @@ -1596,6 +1727,55 @@ void vsi_nn_TransposeTensor free( dst ); } /* vsi_nn_TransposeTensor() */ +vx_tensor vsi_nn_safe_reshape_tensor + ( + vx_tensor tensor, + void * num_of_dims, + vsi_size_t sizes, + vsi_size_t size_of_shape_element + ) +{ + if(sizeof(vx_size) == size_of_shape_element) + { + vx_size* num_of_dims_vxsize = (vx_size*)num_of_dims; + #ifdef VSI_40BIT_VA_SUPPORT + return vxReshapeTensor( tensor, num_of_dims_vxsize, (vx_size)sizes ); + #else + { + int32_t new_shape_int32[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t i = 0; + for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + new_shape_int32[i] = -1 == num_of_dims_vxsize[i] ? -1 : (int32_t)num_of_dims_vxsize[i]; + } + return vxReshapeTensor( tensor, new_shape_int32, (uint32_t)sizes ); + } + #endif + } + else if(sizeof(int32_t) == size_of_shape_element) + { + int32_t* num_of_dims_int32 = (int32_t*)num_of_dims; + #ifdef VSI_40BIT_VA_SUPPORT + { + vx_size new_shape_vxsize[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t i = 0; + for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + new_shape_vxsize[i] = -1 == num_of_dims_int32[i] ? -1 : (vx_size)num_of_dims_int32[i]; + } + return vxReshapeTensor( tensor, new_shape_vxsize, (vx_size)sizes ); + } + #else + return vxReshapeTensor( tensor, num_of_dims_int32, (uint32_t)sizes ); + #endif + } + else + { + VSILOGE("couldn't handle tensor shape element with length of %"VSI_SIZE_T_SPECIFIER"", size_of_shape_element); + return NULL; + } +} /* vsi_nn_safe_reshape_tensor() */ + void vsi_nn_PermuteTensor ( vsi_nn_graph_t * graph, @@ -1649,11 +1829,8 @@ void vsi_nn_PermuteTensor } vsi_nn_Permute( dst, buf, shape_ptr, dim_num, perm, tensor->attr.dtype.vx_type ); memcpy(tensor->attr.size, dst_shape, sizeof(dst_shape)); -#ifdef VSI_40BIT_VA_SUPPORT - tensor->t = vxReshapeTensor(tensor->t, tensor->attr.size, tensor->attr.dim_num); -#else - tensor->t = vxReshapeTensor(tensor->t, (int32_t*)tensor->attr.size, tensor->attr.dim_num); -#endif + tensor->t = vsi_nn_safe_reshape_tensor(tensor->t, (void*)tensor->attr.size, + (vsi_size_t)tensor->attr.dim_num, sizeof(tensor->attr.size[0])); status = vsi_nn_CopyDataToTensor( graph, tensor, dst ); if( VSI_SUCCESS != status ) { @@ -1674,8 +1851,7 @@ vsi_size_t vsi_nn_GetElementNum return 0; } - return get_tensor_elements_num(tensor->attr.size, - tensor->attr.dim_num, tensor->attr.dtype.vx_type); + return vsi_nn_ShapeProduct((vsi_size_t*)tensor->attr.size, tensor->attr.dim_num); } /* vsi_nn_GetElementNum() */ vsi_size_t vsi_nn_GetTensorSize @@ -1687,17 +1863,32 @@ vsi_size_t vsi_nn_GetTensorSize { vsi_size_t sz; vsi_size_t i; + vsi_size_t bits_num; sz = 0; if( NULL == shape || 0 == dim_num ) { return sz; } - sz = 1; - for( i = 0; i < dim_num; i ++ ) + bits_num = vsi_nn_TypeGetBits( type ); + if( bits_num < BITS_PER_BYTE ) + { + if(shape[0] % 2 == 0) + { + sz = shape[0] / 2; + } + else + { + sz = shape[0] / 2 + shape[0] % 2; + } + } + else + { + sz = shape[0] * bits_num / BITS_PER_BYTE; + } + for( i = 1; i < dim_num; i ++ ) { sz *= shape[i]; } - sz *= vsi_nn_GetTypeBytes( type ); return sz; } /* vsi_nn_GetTensorSize() */ @@ -2040,6 +2231,7 @@ vsi_status vsi_nn_vxGetTensorAttr &(attr->dtype.fl), sizeof(int8_t)); TEST_CHECK_STATUS( status, final ); break; + case 
VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC: case VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC: status = vxQueryTensor(tensor, VX_TENSOR_ZERO_POINT, &(attr->dtype.zero_point), sizeof(int32_t)); @@ -2154,7 +2346,36 @@ vsi_status vsi_nn_copy_tensor_veiw_patch } #ifdef USE_OPENVX_1_2 + +#ifdef VX_TENSOR_STRIDE_X_BITS_SUPPORT + { + vx_trensor_addressing addr = NULL; + vx_size dim_sizes[VSI_NN_MAX_DIM_NUM], strides[VSI_NN_MAX_DIM_NUM]; + addr = (vx_trensor_addressing)malloc(sizeof(vx_tensorpatch_addressing_t)); + addr->num_of_dims = (vx_uint32)attr->dim_num; + for(i = 0; i < dim; i++) + { + strides[i] = (vx_size)vstride[i]; + dim_sizes[i] = (vx_size)attr->size[i]; + } + addr->strides = strides; + addr->dim_sizes = dim_sizes; + if(attr->dtype.vx_type == VSI_NN_TYPE_INT4 || attr->dtype.vx_type == VSI_NN_TYPE_UINT4) + { + addr->strides[0] = 0; + addr->stride_x_bits = 4; + } + status = vxCopyTensorPatch2(tensor, dim, vstart, vend, addr,sizeof(vx_tensorpatch_addressing_t), + user_ptr, usage, user_memory_type); + if(addr) + { + free(addr); + addr = NULL; + } + } +#else status = vxCopyTensorPatch(tensor, dim, vstart, vend, vstride, user_ptr, usage, user_memory_type); +#endif #else { vx_context context = NULL; @@ -2455,3 +2676,48 @@ vsi_bool vsi_nn_ConvertTensor return ret; } + +vsi_nn_tensor_t * vsi_nn_dropout_tensor + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t * input, + float rate + ) +{ + vsi_nn_tensor_t *output = NULL; + vsi_size_t size = 0; + vsi_size_t i = 0; + float* data = NULL; + + if (NULL == input || NULL == graph) + { + return NULL; + } + + output = vsi_nn_CreateTensor(graph, &input->attr); + if ( !output ) + { + VSILOGE("create tensor failed."); + goto final; + } + + data = vsi_nn_ConvertTensorToFloat32Data(graph, input); + if (NULL == data) + { + goto final; + } + + size = vsi_nn_vxGetTensorElementNum(&input->attr); + + for (i = 0; i < size; i++) + { + data[i] = data[i] * rate; + } + + vsi_nn_CopyRawDataToTensor( graph, (uint8_t *)data, &input->attr.dtype, output ); + +final: + vsi_nn_safe_free(data); + + return output; +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/vsi_nn_version.c b/src/tim/vx/internal/src/vsi_nn_version.c index d7abca3..ab94abe 100644 --- a/src/tim/vx/internal/src/vsi_nn_version.c +++ b/src/tim/vx/internal/src/vsi_nn_version.c @@ -50,4 +50,26 @@ uint32_t vsi_nn_GetVersionMinor(void) uint32_t vsi_nn_GetVersionPatch(void) { return VSI_NN_VERSION_PATCH; -} \ No newline at end of file +} + +const char **vsi_nn_get_feature_config(void) +{ + static const char *p[10]; + int i = 0; + #ifdef VSI_PERCHANNEL_QUANTIZATION_SUPPORT + { + static const char *perchannel_quantization = MACRO_TO_STRING(VSI_PERCHANNEL_QUANTIZATION_SUPPORT); + (void)perchannel_quantization; + p[i++] = perchannel_quantization; + } + #endif + + #ifdef VSI_40BIT_VA_SUPPORT + { + static const char *va40bit = MACRO_TO_STRING(VSI_40BIT_VA_SUPPORT); + (void)va40bit; + p[i++] = va40bit; + } + #endif + return p; +} diff --git a/src/tim/vx/internal/tim_internal.cmake b/src/tim/vx/internal/tim_internal.cmake index b52a034..6f93896 100644 --- a/src/tim/vx/internal/tim_internal.cmake +++ b/src/tim/vx/internal/tim_internal.cmake @@ -15,12 +15,12 @@ aux_source_directory(./vx/internal/src/quantization INTERNAL_QUANTIZATION) aux_source_directory(./vx/internal/src/custom/ops INTERNAL_CUSTOM_OPS) aux_source_directory(./vx/internal/src/custom/ops/kernel INTERNAL_CUSTOM_OPS_KERNEL) aux_source_directory(./vx/internal/src/utils INTERNAL_UTILS) +aux_source_directory(./vx/internal/src/POST POST) list(APPEND ${TARGET_NAME}_SRCS 
${INTERNAL_SRC} ${INTERNAL_KERNEL} ${INTERNAL_KERNEL_CL} - ${INTERNAL_KERNEL_CPU} ${INTERNAL_KERNEL_EVIS} ${INTERNAL_KERNEL_VX} ${INTERNAL_OPS} @@ -29,4 +29,5 @@ list(APPEND ${TARGET_NAME}_SRCS ${INTERNAL_CUSTOM_OPS} ${INTERNAL_CUSTOM_OPS_KERNEL} ${INTERNAL_UTILS} + ${POST} )
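
Editor's note (illustrative sketch, not part of the patch): the INT4/UINT4 support introduced above packs two 4-bit elements into one byte with the lower-indexed element in the low nibble, keeps the innermost dimension byte-aligned (see the sub-byte branches added to vsi_nn_GetStrideSizeBySize and vsi_nn_GetTensorSize), and sign-extends INT4 nibbles when unpacking (vsi_nn_Pack4bitData / vsi_nn_Unpack4bitData). The standalone C sketch below mirrors that nibble layout for a single row; the helper names pack_int4_row and unpack_int4_row are hypothetical and do not exist in ovxlib, and the patch's own row-boundary handling is more involved than shown here.

/* Minimal sketch of the 4-bit packing scheme, under the assumptions stated above. */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Pack one row of 4-bit values (stored one value per byte in src) into dst.
 * Returns the number of packed bytes: (row_len * 4 + 7) / 8, i.e. two
 * elements per byte with the even-indexed element in the low nibble. */
static size_t pack_int4_row(const uint8_t *src, size_t row_len, uint8_t *dst)
{
    size_t i, j = 0;
    for (i = 0; i < row_len; i += 2)
    {
        uint8_t low  = (uint8_t)(src[i] & 0x0F);
        uint8_t high = (uint8_t)((i + 1 < row_len) ? (src[i + 1] & 0x0F) : 0);
        dst[j++] = (uint8_t)((high << 4) | low);
    }
    return j;
}

/* Unpack one packed row back to one value per byte; when is_signed is nonzero
 * (INT4), nibbles 0x8..0xF are sign-extended to -8..-1. */
static void unpack_int4_row(const uint8_t *src, size_t row_len,
                            uint8_t *dst, int is_signed)
{
    size_t i;
    for (i = 0; i < row_len; i++)
    {
        uint8_t nibble = (i & 1) ? (uint8_t)(src[i / 2] >> 4)
                                 : (uint8_t)(src[i / 2] & 0x0F);
        if (is_signed && nibble > 7)
        {
            nibble = (uint8_t)(nibble | 0xF0); /* sign-extend into int8 range */
        }
        dst[i] = nibble;
    }
}

int main(void)
{
    /* A 3-element INT4 row: 1, -2, 7 (two's-complement nibbles 0x1, 0xE, 0x7). */
    uint8_t row[3] = { 0x1, 0xE, 0x7 };
    uint8_t packed[2] = { 0 };
    uint8_t unpacked[3] = { 0 };

    size_t bytes = pack_int4_row(row, 3, packed);   /* packed: 0xE1, 0x07 */
    unpack_int4_row(packed, 3, unpacked, 1);        /* back to 0x01, 0xFE, 0x07 */

    printf("%zu bytes: %02X %02X -> %d %d %d\n", bytes,
           (unsigned)packed[0], (unsigned)packed[1],
           (int)(int8_t)unpacked[0], (int)(int8_t)unpacked[1],
           (int)(int8_t)unpacked[2]);
    return 0;
}

Compiled with any C99 compiler, this prints "2 bytes: E1 07 -> 1 -2 7": a 3-element row occupies ceil(3 * 4 / 8) = 2 packed bytes, matching the rounded-up stride the patch computes for sub-byte types.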