Update internal ovxlib to rel1.2.6 (#690)
Internal ovxlib commit hash: c5d3e69356579fc7b595a7c0939fc7e4e0aaab5a
Type: Code Improvement
Signed-off-by: Feiyue Chen <Feiyue.Chen@verisilicon.com>
parent 3ea908ca6d
commit 3b80968fb1
@@ -197,3 +197,5 @@ DEF_OP(RESIZE_3D)
DEF_OP(REDUCEL2)
DEF_OP(CROP_AND_RESIZE)
DEF_OP(TAN)
DEF_OP(RMSNORM)
DEF_OP(SHAPE)
@@ -26,6 +26,8 @@
#define _VSI_NN_KERNEL_H

#include <stdint.h>
#include <stdarg.h>

#include "vsi_nn_log.h"
#include "vsi_nn_ops.h"
#include "vsi_nn_graph.h"
@@ -81,6 +83,7 @@ typedef enum
    U4,
    FP8_E4M3,
    FP8_E5M2,
    INVALID_DTYPE,
} VSI_PUBLIC_TYPE vsi_nn_kernel_dtype_e;

typedef enum
@@ -532,9 +535,8 @@ static VSI_INLINE_API vsi_nn_kernel_dtype_e vsi_nn_kernel_map_dtype
            return FP8_E5M2;
        default:
            VSILOGE("error data type %d", dtype);
            break;
            return INVALID_DTYPE;
    }
    return I8;
} /* vsi_nn_kernel_map_dtype() */

static VSI_INLINE_API vsi_nn_type_e vsi_nn_dtype_map_kernel
@@ -43,6 +43,7 @@ typedef struct _vsi_nn_resize_internal_param
    vsi_bool half_pixel_centers;
    float factor;
    vsi_enum layout;
    vsi_enum type;
} vsi_nn_resize_internal_param;

#ifdef __cplusplus
@@ -0,0 +1,54 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/

#ifndef _VSI_NN_OP_RMSNORM_H
#define _VSI_NN_OP_RMSNORM_H

#include "vsi_nn_types.h"

#ifdef __cplusplus
extern "C" {
#endif


typedef struct _rmsnorm_local_data_t {
    int32_t placeholder;
} rmsnorm_local_data_t;

typedef struct _vsi_nn_rmsnorm_param
{
    struct _rmsnorm_local_data_t* local;
    float eps;
    int32_t axis;
} vsi_nn_rmsnorm_param;

_compiler_assert(offsetof(vsi_nn_rmsnorm_param, local) == 0, \
    vsi_nn_rmsnorm_h );

#ifdef __cplusplus
}
#endif

#endif
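For orientation, the new vsi_nn_rmsnorm_param only carries eps and axis: RMS normalization scales each element by the reciprocal root-mean-square taken along the normalized axis, with eps guarding the square root. A minimal CPU sketch of the math (illustration only, not part of ovxlib; treating the op's second input as the gamma/weight vector is an assumption):

/* --- illustrative example, not part of the diff --- */
#include <math.h>
#include <stddef.h>

/* y[i] = x[i] / sqrt(mean(x^2) + eps) * gamma[i], along one axis of length n */
static void rmsnorm_ref(const float* x, const float* gamma, float* y,
                        size_t n, float eps)
{
    float sum_sq = 0.0f;
    for (size_t i = 0; i < n; i++)
    {
        sum_sq += x[i] * x[i];
    }
    float inv_rms = 1.0f / sqrtf(sum_sq / (float)n + eps);
    for (size_t i = 0; i < n; i++)
    {
        y[i] = x[i] * inv_rms * gamma[i];
    }
}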
@@ -0,0 +1,47 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/

#ifndef _VSI_NN_OP_SHAPE_H
#define _VSI_NN_OP_SHAPE_H

#include "vsi_nn_types.h"

#ifdef __cplusplus
extern "C" {
#endif

typedef struct _vsi_nn_shape_param
{
    struct _shape_local_data_t* local;
    // Add parameters here
} vsi_nn_shape_param;
_compiler_assert(offsetof(vsi_nn_shape_param, local) == 0, \
    vsi_nn_shape_h );

#ifdef __cplusplus
}
#endif

#endif
@@ -33,6 +33,14 @@
extern "C" {
#endif

/*
 * A helper union for fp32 bit casting.
 */
typedef union {
    float val;
    uint32_t data;
} fp32_bit_cast_t;

static VSI_INLINE_API vsi_bool type_is_integer
    (
    const vsi_nn_type_e type
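The helper union replaces pointer type punning such as *(uint32_t*)&val, which violates C strict-aliasing rules; writing one union member and reading the other is the portable idiom used throughout the hunks below. A self-contained sketch of the round trip (illustration only, not part of the diff):

/* --- illustrative example, not part of the diff --- */
#include <stdint.h>
#include <stdio.h>

typedef union {
    float    val;
    uint32_t data;
} fp32_bit_cast_t;

int main(void)
{
    fp32_bit_cast_t cast;

    /* float -> raw IEEE-754 bits */
    cast.val = 1.0f;
    printf("bits of 1.0f : 0x%08X\n", cast.data);   /* 0x3F800000 */

    /* raw bits -> float */
    cast.data = 0x40490FDBu;                        /* ~3.1415927f */
    printf("value        : %f\n", cast.val);

    return 0;
}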
@@ -203,9 +211,11 @@ static VSI_INLINE_API vsi_bool fp32_is_inf
    float val
    )
{
    uint32_t u_value = *(uint32_t*)&val;
    fp32_bit_cast_t fp32_bit_cast;
    fp32_bit_cast.val = val;
    uint32_t fp32_data = fp32_bit_cast.data;

    if ((u_value & (uint32_t)VSI_NN_INT32_MAX) == (uint32_t)VSI_NN_FLOAT32_INF)
    if ((fp32_data & (uint32_t)VSI_NN_INT32_MAX) == (uint32_t)VSI_NN_FLOAT32_INF)
    {
        return TRUE;
    }
@@ -232,7 +242,9 @@ static VSI_INLINE_API int32_t fp32_to_affine

    if (fp32_is_inf(in) != 0)
    {
        uint32_t sign = (*(uint32_t*)&in) >> 31;
        fp32_bit_cast_t fp32_bit_cast;
        fp32_bit_cast.val = in;
        uint32_t sign = fp32_bit_cast.data >> 31;
        data = sign == 1 ? (int32_t)min_range : (int32_t)max_range;
    }
@@ -277,7 +289,9 @@ static VSI_INLINE_API int32_t fp32_to_dfp

    if (fp32_is_inf(in) != 0)
    {
        uint32_t sign = (*(uint32_t*)&in) >> 31;
        fp32_bit_cast_t fp32_bit_cast;
        fp32_bit_cast.val = in;
        uint32_t sign = fp32_bit_cast.data >> 31;
        data = sign == 1 ? (int32_t)min_range : (int32_t) max_range;
    }
@@ -373,8 +387,9 @@ static VSI_INLINE_API float bfp16_to_fp32
    int16_t in
    )
{
    int32_t t1, t2, t3;
    uint32_t t1, t2, t3;
    float out;
    fp32_bit_cast_t fp32_bit_cast;

    t1 = in & 0x00FF; // Mantissa
    t2 = in & 0xFF00; // Sign bit + Exponent
@@ -384,9 +399,10 @@ static VSI_INLINE_API float bfp16_to_fp32
    t2 <<= 16; // Shift (sign + Exponent) bit into position
    t1 |= t2;  // Re-insert (sign + Exponent) bit

    *((uint32_t*)&out) = t1;
    fp32_bit_cast.data = t1;
    out = fp32_bit_cast.val;

    return t3 == 0 ? 0 : out;
    return t3 == 0 ? 0.0f : out;
} /* bfp16_to_fp32() */

static VSI_INLINE_API uint16_t fp32_to_fp16
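A bfloat16 value is simply the upper 16 bits of the corresponding fp32 pattern, so widening amounts to re-attaching 16 zero mantissa bits. A simplified sketch of that widening, without the library's t3 zero check (illustration only):

/* --- illustrative example, not part of the diff --- */
#include <stdint.h>
#include <stdio.h>

/* bf16 -> fp32 by placing the 16 stored bits in the high half of the word */
static float bf16_bits_to_fp32(uint16_t bf16)
{
    union { float val; uint32_t data; } cast;
    cast.data = (uint32_t)bf16 << 16;
    return cast.val;
}

int main(void)
{
    printf("%f\n", bf16_bits_to_fp32(0x3F80)); /* 1.000000  */
    printf("%f\n", bf16_bits_to_fp32(0xC040)); /* -3.000000 */
    return 0;
}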
@@ -394,10 +410,12 @@ static VSI_INLINE_API uint16_t fp32_to_fp16
    float in
    )
{
    uint32_t fp32 = *((uint32_t *) &in);
    uint32_t t1 = (fp32 & 0x80000000u) >> 16; /* sign bit. */
    uint32_t t2 = (fp32 & 0x7F800000u) >> 13; /* Exponent bits */
    uint32_t t3 = (fp32 & 0x007FE000u) >> 13; /* Mantissa bits, no rounding */
    fp32_bit_cast_t fp32_bit_cast;
    fp32_bit_cast.val = in;
    uint32_t fp32_data = fp32_bit_cast.data;
    uint32_t t1 = (fp32_data & 0x80000000u) >> 16; /* sign bit. */
    uint32_t t2 = (fp32_data & 0x7F800000u) >> 13; /* Exponent bits */
    uint32_t t3 = (fp32_data & 0x007FE000u) >> 13; /* Mantissa bits, no rounding */
    uint32_t fp16 = 0u;
    if( t2 >= 0x023c00u )
    {
@@ -420,8 +438,10 @@ static VSI_INLINE_API uint16_t fp32_to_bfp16
    float in
    )
{
    uint32_t fp32 = *((unsigned int *) &in);
    uint32_t t1 = fp32 >> 16;
    fp32_bit_cast_t fp32_bit_cast;
    fp32_bit_cast.val = in;
    uint32_t fp32_data = fp32_bit_cast.data;
    uint32_t t1 = fp32_data >> 16;

    return (uint16_t) t1;
} /* fp32_to_bfp16() */
@@ -435,10 +455,12 @@ static VSI_INLINE_API uint16_t fp32_to_bfp16_rtne
    Convert a float point to bfloat16, with round-nearest-to-even as rounding method.
    */

    uint32_t fp32 = *((unsigned int *) &in);
    fp32_bit_cast_t fp32_bit_cast;
    fp32_bit_cast.val = in;
    uint32_t fp32_data = fp32_bit_cast.data;
    uint16_t out;

    uint32_t lsb = (fp32 >> 16) & 1; /* Least significant bit of resulting bfloat. */
    uint32_t lsb = (fp32_data >> 16) & 1; /* Least significant bit of resulting bfloat. */
    uint32_t rounding_bias = 0x7fff + lsb;

    if ( VSI_NN_FLOAT32_NAN == in )
@@ -447,8 +469,8 @@ static VSI_INLINE_API uint16_t fp32_to_bfp16_rtne
    }
    else
    {
        fp32 += rounding_bias;
        out = (uint16_t) (fp32 >> 16);
        fp32_data += rounding_bias;
        out = (uint16_t) (fp32_data >> 16);
    }

    return out;
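The rounding bias (0x7fff plus the would-be least significant bit of the bf16 result) implements round-to-nearest-even: exact halfway cases land on the even mantissa pattern. A small worked check (illustration only):

/* --- illustrative example, not part of the diff --- */
#include <stdint.h>
#include <stdio.h>

/* Round an fp32 bit pattern to bf16 with round-to-nearest-even. */
static uint16_t fp32_bits_to_bf16_rtne(uint32_t fp32_data)
{
    uint32_t lsb = (fp32_data >> 16) & 1;   /* LSB of the bf16 result */
    uint32_t rounding_bias = 0x7fff + lsb;  /* ties round toward even */
    return (uint16_t)((fp32_data + rounding_bias) >> 16);
}

int main(void)
{
    /* 0x3F808000 is exactly halfway between bf16 0x3F80 and 0x3F81;
     * the bias keeps the even pattern 0x3F80. */
    printf("0x%04X\n", fp32_bits_to_bf16_rtne(0x3F808000u)); /* 0x3F80 */
    /* 0x3F818000 is halfway between 0x3F81 and 0x3F82; rounds to even 0x3F82. */
    printf("0x%04X\n", fp32_bits_to_bf16_rtne(0x3F818000u)); /* 0x3F82 */
    return 0;
}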
@@ -466,7 +488,9 @@ static VSI_INLINE_API uint16_t fp32_to_bfp16_rtne

static VSI_INLINE_API uint8_t fp32_to_fp8_e4m3(float in, const float scale) {
    float fp8_f32 = in / scale;
    int32_t in_val = *((int32_t*)&fp8_f32);
    fp32_bit_cast_t fp32_bit_cast;
    fp32_bit_cast.val = fp8_f32;
    uint32_t in_val = fp32_bit_cast.data;

    uint32_t in_sign = (in_val >> (FLOAT_EXPONENT_SIZE + FLOAT_MANTISSA_SIZE)) & 0x1; /* bit 31 is sign */
    uint32_t in_exp = (in_val >> FLOAT_MANTISSA_SIZE) & 0xFF; /* bit[30: 24] is exp */
@@ -512,7 +536,9 @@ static VSI_INLINE_API uint8_t fp32_to_fp8_e4m3(float in, const float scale) {

static VSI_INLINE_API uint8_t fp32_to_fp8_e5m2(float in, const float scale) {
    float fp8_f32 = in / scale;
    int32_t in_val = *((int32_t*)&fp8_f32);
    fp32_bit_cast_t fp32_bit_cast;
    fp32_bit_cast.val = fp8_f32;
    uint32_t in_val = fp32_bit_cast.data;
    uint32_t in_sign = (in_val >> (FLOAT_EXPONENT_SIZE + FLOAT_MANTISSA_SIZE)) & 0x1; /* bit 31 is sign */
    uint32_t in_exp = (in_val >> FLOAT_MANTISSA_SIZE) & 0xFF; /* bit[30: 24] is exp */
    uint32_t in_man = (in_val & 0x7FFFFF); /* low 23 bits is man */
@@ -561,6 +587,7 @@ static VSI_INLINE_API float fp8_e4m3_to_fp32(uint8_t in, const float scale) {
    uint32_t exponentOut = 0;
    uint32_t mantissaOut = 0;
    uint32_t out_u = 0;
    fp32_bit_cast_t fp32_bit_cast;

    {
        uint32_t signIn;
@@ -610,7 +637,8 @@ static VSI_INLINE_API float fp8_e4m3_to_fp32(uint8_t in, const float scale) {
    }
final:
    out_u = signOut << 31 | exponentOut << 23 | mantissaOut;
    val_fp32 = *((float*)&out_u);
    fp32_bit_cast.data = out_u;
    val_fp32 = fp32_bit_cast.val;

    return val_fp32 * scale;
} /* fp8_e4m3_to_fp32() */
@@ -621,6 +649,7 @@ static VSI_INLINE_API float fp8_e5m2_to_fp32(int8_t in, const float scale) {
    uint32_t exponentOut = 0;
    uint32_t mantissaOut = 0;
    uint32_t out_u = 0;
    fp32_bit_cast_t fp32_bit_cast;

    {
        uint32_t signIn;
@@ -670,7 +699,8 @@ static VSI_INLINE_API float fp8_e5m2_to_fp32(int8_t in, const float scale) {
    }
final:
    out_u = signOut << 31 | exponentOut << 23 | mantissaOut;
    val_fp32 = *((float*)&out_u);
    fp32_bit_cast.data = out_u;
    val_fp32 = fp32_bit_cast.val;
    return val_fp32 * scale;
} /* fp8_e5m2_to_fp32() */
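Both FP8 formats are decoded by expanding into an fp32 bit pattern and then applying the per-tensor scale: E4M3 uses 4 exponent bits (bias 7) and 3 mantissa bits, E5M2 uses 5 and 2 (bias 15). A small decode check for normal (non-denormal, non-NaN) E4M3 values (illustration only, not the library routine):

/* --- illustrative example, not part of the diff --- */
#include <stdint.h>
#include <stdio.h>

/* Decode a normal FP8 E4M3 value: 1 sign bit, 4 exponent bits (bias 7), 3 mantissa bits. */
static float fp8_e4m3_decode(uint8_t in)
{
    uint32_t sign = (in >> 7) & 0x1;
    uint32_t exp  = (in >> 3) & 0xF;
    uint32_t man  = in & 0x7;
    /* (1 + man/8) * 2^(exp - 7) */
    float value = (1.0f + (float)man / 8.0f) * (float)(1 << exp) / 128.0f;
    return sign ? -value : value;
}

int main(void)
{
    printf("%f\n", fp8_e4m3_decode(0x3C)); /* 1.500000  */
    printf("%f\n", fp8_e4m3_decode(0xC0)); /* -2.000000 */
    return 0;
}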
@@ -60,9 +60,7 @@ typedef struct _vsi_nn_hw_config_t
{
    char target_name[VSI_NN_MAX_TARGET_NAME];
    vsi_nn_hw_evis_t evis;
#if VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT
    uint32_t subGroupSize;
#endif
    uint32_t use_40bits_va;
    uint32_t support_stream_processor;
    uint32_t sp_exec_count;
@@ -210,6 +210,8 @@
#include "ops/vsi_nn_op_resize_3d.h"
#include "ops/vsi_nn_op_reducel2.h"
#include "ops/vsi_nn_op_crop_and_resize.h"
#include "ops/vsi_nn_op_rmsnorm.h"
#include "ops/vsi_nn_op_shape.h"
/* custom node head define define */
#include "custom/vsi_nn_custom_node_type.h"
#include "ops/vsi_nn_op_inverse_sigmoid.h"
@@ -408,6 +410,8 @@ typedef union _vsi_nn_nn_param
    vsi_nn_resize_3d_param resize_3d;
    vsi_nn_reducel2_param reducel2;
    vsi_nn_crop_and_resize_param crop_and_resize;
    vsi_nn_rmsnorm_param rmsnorm;
    vsi_nn_shape_param shape;
    void* client_param;

    /* custom node data struct define */
@@ -33,7 +33,7 @@ extern "C"{

#define VSI_NN_VERSION_MAJOR 1
#define VSI_NN_VERSION_MINOR 2
#define VSI_NN_VERSION_PATCH 2
#define VSI_NN_VERSION_PATCH 6
#define VSI_NN_VERSION \
    (VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH)
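With the patch level bumped from 2 to 6, the packed constant evaluates to 1 * 10000 + 2 * 100 + 6 = 10206, so downstream code can gate on the release (illustration only, assuming the translation unit already includes the version header):

/* --- illustrative example, not part of the diff --- */
/* VSI_NN_VERSION packs major/minor/patch into one integer:
 * 1 * 10000 + 2 * 100 + 6 = 10206 for rel1.2.6 */
#if VSI_NN_VERSION >= 10206
/* safe to rely on the RMSNORM / SHAPE ops added in this release */
#endif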
@@ -267,7 +267,7 @@ static vsi_nn_kernel_node_t _setup
    vsi_status status = VSI_FAILURE;
    vsi_nn_kernel_node_param_t node_params[_TINY_YOLOV4_POSTPROCESS_CONFIDENCE_PARAM_NUM];
    vsi_nn_kernel_node_t node = NULL;
    vsi_size_t shape[2][VSI_NN_MAX_DIM_NUM] = { 0 };
    vsi_size_t shape[2][VSI_NN_MAX_DIM_NUM] = { { 0 } };
    vsi_nn_tensor_t* reshape_tensors[2] = { NULL };

    VSI_UNREFERENCED(params);
@@ -55,8 +55,17 @@ static vsi_status op_compute
    vsi_nn_kernel_param_t * param = NULL;
    vsi_nn_custom_warp_affine_param * p;
    p = &(self->nn_param.custom_warp_affine);

    float matrix_shape[6] = { 1.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f};
    param = vsi_nn_kernel_param_create();
    //Unlike OpenCV, we use the coordinate of dst and matrix to calculate the coordinate of src in custom_warp_affine.
    //Therefore, matrix M_ovx in custom_warp_affine is different from matrix M_cv in OpenCV.
    //We get M_ovx by transposing the inverse of M_cv.
    //inv_M = cv2.invertAffineTransform(M_cv); M_ovx=inv_M.transpose(1,0)
    if (p->matrix == NULL)
    {
        p->matrix = matrix_shape;
    }

    vsi_nn_kernel_param_add_const_buffer( param, "matrix", p->matrix, 6 );
    vsi_nn_kernel_param_add_int32( param, "type", p->type);
    vsi_nn_kernel_param_add_int32( param, "rgb_type", p->rgb_type);
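The new comments describe the dst-to-src convention: the six floats passed as "matrix" are the transposed inverse of OpenCV's forward 2x3 affine matrix. A host-side helper sketching that conversion (illustration only; the exact storage order of the six floats is an assumption based on the comment above):

/* --- illustrative example, not part of the diff --- */
/* Convert an OpenCV-style forward affine matrix
 *     [a b tx]
 *     [c d ty]
 * into the inverse-transposed 6-float form described in the comments above. */
static int warp_affine_cv_to_ovx(const float m_cv[6], float m_ovx[6])
{
    float a = m_cv[0], b = m_cv[1], tx = m_cv[2];
    float c = m_cv[3], d = m_cv[4], ty = m_cv[5];
    float det = a * d - b * c;
    if (det == 0.0f)
    {
        return -1; /* not invertible */
    }
    /* inverse of the forward mapping */
    float ia =  d / det, ib = -b / det;
    float ic = -c / det, id =  a / det;
    float itx = -(ia * tx + ib * ty);
    float ity = -(ic * tx + id * ty);
    /* transpose(1, 0): store column by column (assumed layout) */
    m_ovx[0] = ia;  m_ovx[1] = ic;
    m_ovx[2] = ib;  m_ovx[3] = id;
    m_ovx[4] = itx; m_ovx[5] = ity;
    return 0;
}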
@@ -47,18 +47,26 @@ typedef enum
} _internal_kernel_e;

#define _BILINEAR_GRID_SAMPLE_KERNEL_SOURCE() "bilinear_grid_sample"
#define _BILINEAR_GRID_SAMPLE_REFLECT_KERNEL_SOURCE() "bilinear_grid_sample_reflect"

#define STR(a) #a

// Add kernel hashtable here
#define BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \
    ((IN1_DTYPE << 20) | (IN0_DTYPE << 8) | (OUT_DTYPE))
#define BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, REFLECT) \
    ((IN1_DTYPE << 24) | (IN0_DTYPE << 16) | (OUT_DTYPE << 8) | (REFLECT))

#define PACK_KERNEL_MAP(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \
    { \
        BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE), \
        CVIVANTE_NAMESPACE("cl.bilinear_grid_sample_" STR(IN0_DTYPE) "_" STR(IN1_DTYPE) "to" STR(OUT_DTYPE)), \
        _BILINEAR_GRID_SAMPLE_KERNEL_SOURCE() \
        BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0), \
        CVIVANTE_NAMESPACE("cl.bilinear_grid_sample_" STR(IN0_DTYPE) "_" STR(IN1_DTYPE) "to" STR(OUT_DTYPE)), \
        _BILINEAR_GRID_SAMPLE_KERNEL_SOURCE() \
    }

#define PACK_REFLECT_KERNEL_MAP(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \
    { \
        BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1), \
        CVIVANTE_NAMESPACE("cl.bilinear_grid_sample_reflect_" STR(IN0_DTYPE) "_" STR(IN1_DTYPE) "to" STR(OUT_DTYPE)), \
        _BILINEAR_GRID_SAMPLE_REFLECT_KERNEL_SOURCE() \
    }

typedef struct
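The widened hash key packs the grid dtype, input dtype, output dtype and the new reflect flag into disjoint bit fields, so the reflect and non-reflect kernel variants always map to distinct table entries. A tiny check of that property (the dtype value used is illustrative only):

/* --- illustrative example, not part of the diff --- */
#include <assert.h>

enum { U8 = 3 };   /* illustrative enum value, not the real vsi_nn_kernel_dtype_e */

#define BILINEAR_GRID_SAMPLE_HASH_KEY(IN0, IN1, OUT, REFLECT) \
    ((IN1 << 24) | (IN0 << 16) | (OUT << 8) | (REFLECT))

int main(void)
{
    /* the reflect bit keeps the two kernel variants at distinct keys */
    assert(BILINEAR_GRID_SAMPLE_HASH_KEY(U8, U8, U8, 0) !=
           BILINEAR_GRID_SAMPLE_HASH_KEY(U8, U8, U8, 1));
    return 0;
}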
@ -73,6 +81,8 @@ static const _kernel_map_type _bilinear_grid_sample_kernel_map[] =
|
|||
// Register kernel here
|
||||
PACK_KERNEL_MAP(F32, F32, F32 ),
|
||||
PACK_KERNEL_MAP(U8, U8, U8),
|
||||
PACK_REFLECT_KERNEL_MAP(F32, F32, F32),
|
||||
PACK_REFLECT_KERNEL_MAP(U8, U8, U8),
|
||||
};
|
||||
|
||||
|
||||
|
|
@ -95,23 +105,24 @@ static vx_param_description_t _bilinear_grid_sample_kernel_param_def[] =
|
|||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
};
|
||||
|
||||
#define _BILINEAR_GRID_SAMPLE_PARAM_NUM 8
|
||||
#define _BILINEAR_GRID_SAMPLE_PARAM_QUANT_NUM \
|
||||
#define _BILINEAR_GRID_SAMPLE_PARAM_QUANT_NUM 14
|
||||
#define _BILINEAR_GRID_SAMPLE_REFLECT_PARAM_NUM 12
|
||||
#define _BILINEAR_GRID_SAMPLE_REFLECT_PARAM_QUANT_NUM \
|
||||
_cnt_of_array(_bilinear_grid_sample_kernel_param_def)
|
||||
|
||||
|
||||
#define SCALAR_HALF_INPUT0_W (3)
|
||||
#define SCALAR_HALF_INPUT0_H (4)
|
||||
#define SCALAR_ADD_VALUE_W (5)
|
||||
#define SCALAR_ADD_VALUE_H (6)
|
||||
#define SCALAR_DEPTH (7)
|
||||
#define SCALAR_INPUT0_SCALE (8)
|
||||
#define SCALAR_INPUT0_TAIL (9)
|
||||
#define SCALAR_INPUT1_SCALE (10)
|
||||
#define SCALAR_INPUT1_TAIL (11)
|
||||
#define SCALAR_OUTPUT_SCALE (12)
|
||||
#define SCALAR_OUTPUT_TAIL (13)
|
||||
|
||||
/*
|
||||
* Kernel initializer
|
||||
|
|
@ -170,7 +181,8 @@ static vsi_status _query_kernel
|
|||
vsi_nn_kernel_t * kernel,
|
||||
vsi_nn_tensor_t * const * const inputs,
|
||||
vsi_nn_tensor_t * const * const outputs,
|
||||
vsi_bool* is_use_u8_kernel
|
||||
vsi_bool* is_use_u8_kernel,
|
||||
int32_t is_reflect_mode
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
|
|
@ -199,14 +211,29 @@ static vsi_status _query_kernel
|
|||
out_dtype = F32;
|
||||
}
|
||||
if ((U8 == in0_dtype) || (U8 == out_dtype)) {
|
||||
param_def_size = _BILINEAR_GRID_SAMPLE_PARAM_QUANT_NUM;
|
||||
if (is_reflect_mode)
|
||||
{
|
||||
param_def_size = _BILINEAR_GRID_SAMPLE_REFLECT_PARAM_QUANT_NUM;
|
||||
}
|
||||
else{
|
||||
param_def_size = _BILINEAR_GRID_SAMPLE_PARAM_QUANT_NUM;
|
||||
}
|
||||
|
||||
*is_use_u8_kernel = TRUE;
|
||||
} else {
|
||||
param_def_size = _BILINEAR_GRID_SAMPLE_PARAM_NUM;
|
||||
if (is_reflect_mode)
|
||||
{
|
||||
param_def_size = _BILINEAR_GRID_SAMPLE_REFLECT_PARAM_NUM;
|
||||
}
|
||||
else
|
||||
{
|
||||
param_def_size = _BILINEAR_GRID_SAMPLE_PARAM_NUM;
|
||||
}
|
||||
|
||||
*is_use_u8_kernel = FALSE;
|
||||
}
|
||||
|
||||
key = BILINEAR_GRID_SAMPLE_HASH_KEY(in0_dtype, in1_dtype, out_dtype);
|
||||
key = BILINEAR_GRID_SAMPLE_HASH_KEY(in0_dtype, in1_dtype, out_dtype, is_reflect_mode);
|
||||
|
||||
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
|
||||
{
|
||||
|
|
@ -245,7 +272,7 @@ static vsi_nn_kernel_node_t _setup
|
|||
{
|
||||
vsi_nn_kernel_node_t node = NULL;
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_node_param_t node_params[_BILINEAR_GRID_SAMPLE_PARAM_QUANT_NUM];
|
||||
vsi_nn_kernel_node_param_t node_params[_BILINEAR_GRID_SAMPLE_REFLECT_PARAM_QUANT_NUM];
|
||||
vsi_size_t final_shape[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1};
|
||||
uint32_t final_in1_rank = 0;
|
||||
vsi_nn_tensor_t* rs_tensors = NULL;
|
||||
|
|
@ -263,11 +290,14 @@ static vsi_nn_kernel_node_t _setup
|
|||
vsi_bool is_use_u8_kernel = FALSE;
|
||||
int32_t align_corners =
|
||||
vsi_nn_kernel_param_get_int32(params, "align_corners");
|
||||
int32_t pad_mode = vsi_nn_kernel_param_get_int32(params, "padding_mode");
|
||||
uint32_t pad_val = 0;
|
||||
int32_t depth = 0;
|
||||
vsi_nn_kernel_dtype_e in0_dtype;
|
||||
|
||||
float half_input0_w, half_input0_h, add_float_value_w, add_float_value_h;
|
||||
int32_t is_reflect_mode = 0;
|
||||
float min_val_w, span_w, min_val_h, span_h;
|
||||
|
||||
// Check if gpu can support the size
|
||||
if (!vsi_nn_kernel_gpu_check_shape(inputs[0]->attr.size,
|
||||
|
|
@ -280,6 +310,11 @@ static vsi_nn_kernel_node_t _setup
|
|||
return NULL;
|
||||
}
|
||||
|
||||
if (pad_mode == VSI_NN_PAD_MODE_REFLECT)
|
||||
{
|
||||
is_reflect_mode = 1;
|
||||
}
|
||||
|
||||
final_tensors[0] = inputs[0];
|
||||
|
||||
if (inputs[1]->attr.dim_num >= 3) {
|
||||
|
|
@@ -313,12 +348,35 @@
        add_float_value_h = half_input0_h - 0.5f;
    }

    if (is_reflect_mode)
    {
        float low_w, low_h, high_w, high_h;
        if (align_corners)
        {
            low_w = 0;
            low_h = 0;
            high_w = 2 * (float)(in0_width - 1);
            high_h = 2 * (float)(in0_height - 1);
        }
        else
        {
            low_w = -1;
            low_h = -1;
            high_w = 2 * (float)in0_width - 1;
            high_h = 2 * (float)in0_height - 1;
        }
        min_val_w = low_w / 2;
        span_w = (high_w - low_w) / 2;
        min_val_h = low_h / 2;
        span_h = (high_h - low_h) / 2;
    }

    depth = (int32_t)inputs[0]->attr.size[2];
    in0_dtype = vsi_nn_kernel_map_dtype(inputs[0]->attr.dtype.vx_type);
    if (U8 == in0_dtype) {
        pad_val = inputs[0]->attr.dtype.zero_point;
    }
    status = _query_kernel(kernel, inputs, outputs, &is_use_u8_kernel);
    status = _query_kernel(kernel, inputs, outputs, &is_use_u8_kernel, is_reflect_mode);
    if ( VSI_SUCCESS == status)
    {
        node = vsi_nn_kernel_create_node( graph, kernel );
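min_val_* and span_* describe the band the source coordinate is folded into under reflect padding: [0, size-1] with align_corners, [-0.5, size-0.5] otherwise, repeating with period 2 * span. A host-side sketch of the folding the GPU kernel is expected to apply (an assumption for illustration; the OpenCL source itself is not part of this hunk):

/* --- illustrative example, not part of the diff --- */
#include <math.h>

/* Fold a source coordinate into [min_val, min_val + span] by reflection,
 * matching the min_val/span parameters computed above. */
static float reflect_coord(float x, float min_val, float span)
{
    if (span <= 0.0f)
    {
        return min_val;
    }
    float r = fmodf(fabsf(x - min_val), 2.0f * span);
    return min_val + (r > span ? 2.0f * span - r : r);
}

/* e.g. with align_corners and in0_width = 5: min_val_w = 0, span_w = 4,
 * so x = 5.5 reflects to 2.5 and x = -1.0 reflects to 1.0. */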
@ -326,7 +384,7 @@ static vsi_nn_kernel_node_t _setup
|
|||
{
|
||||
size_t node_params_num = _BILINEAR_GRID_SAMPLE_PARAM_NUM;
|
||||
/* Set inputs and outputs */
|
||||
vsi_nn_kernel_node_pack_io( node_params, _BILINEAR_GRID_SAMPLE_PARAM_QUANT_NUM,
|
||||
vsi_nn_kernel_node_pack_io( node_params, _BILINEAR_GRID_SAMPLE_REFLECT_PARAM_QUANT_NUM,
|
||||
final_tensors, input_num, &final_tensors[2], output_num );
|
||||
node_params[SCALAR_HALF_INPUT0_W] = vsi_nn_kernel_scalar_create( graph, F32, &half_input0_w );
|
||||
node_params[SCALAR_HALF_INPUT0_H] = vsi_nn_kernel_scalar_create( graph, F32, &half_input0_h );
|
||||
|
|
@ -335,13 +393,19 @@ static vsi_nn_kernel_node_t _setup
|
|||
node_params[SCALAR_DEPTH] = vsi_nn_kernel_scalar_create( graph, I32, &depth );
|
||||
if (is_use_u8_kernel)
|
||||
{
|
||||
node_params[SCALAR_INPUT0_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input0_scale );
|
||||
node_params[SCALAR_INPUT0_TAIL] = vsi_nn_kernel_scalar_create( graph, F32, &input0_tail );
|
||||
node_params[SCALAR_INPUT1_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &input1_scale );
|
||||
node_params[SCALAR_INPUT1_TAIL] = vsi_nn_kernel_scalar_create( graph, F32, &input1_tail );
|
||||
node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale );
|
||||
node_params[SCALAR_OUTPUT_TAIL] = vsi_nn_kernel_scalar_create( graph, F32, &output_zp );
|
||||
node_params_num = _BILINEAR_GRID_SAMPLE_PARAM_QUANT_NUM;
|
||||
node_params[node_params_num++] = vsi_nn_kernel_scalar_create( graph, F32, &input0_scale );
|
||||
node_params[node_params_num++] = vsi_nn_kernel_scalar_create( graph, F32, &input0_tail );
|
||||
node_params[node_params_num++] = vsi_nn_kernel_scalar_create( graph, F32, &input1_scale );
|
||||
node_params[node_params_num++] = vsi_nn_kernel_scalar_create( graph, F32, &input1_tail );
|
||||
node_params[node_params_num++] = vsi_nn_kernel_scalar_create( graph, F32, &output_scale );
|
||||
node_params[node_params_num++] = vsi_nn_kernel_scalar_create( graph, F32, &output_zp );
|
||||
}
|
||||
if (is_reflect_mode)
|
||||
{
|
||||
node_params[node_params_num++] = vsi_nn_kernel_scalar_create(graph, F32, &min_val_w);
|
||||
node_params[node_params_num++] = vsi_nn_kernel_scalar_create(graph, F32, &span_w);
|
||||
node_params[node_params_num++] = vsi_nn_kernel_scalar_create(graph, F32, &min_val_h);
|
||||
node_params[node_params_num++] = vsi_nn_kernel_scalar_create(graph, F32, &span_h);
|
||||
}
|
||||
/* Pass parameters to node. */
|
||||
status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num );
|
||||
|
|
@ -351,19 +415,34 @@ static vsi_nn_kernel_node_t _setup
|
|||
vsi_nn_kernel_scalar_release(&node_params[SCALAR_ADD_VALUE_W]);
|
||||
vsi_nn_kernel_scalar_release(&node_params[SCALAR_ADD_VALUE_H]);
|
||||
vsi_nn_kernel_scalar_release(&node_params[SCALAR_DEPTH]);
|
||||
node_params_num = _BILINEAR_GRID_SAMPLE_PARAM_NUM;
|
||||
if (is_use_u8_kernel) {
|
||||
vsi_nn_kernel_scalar_release(&node_params[SCALAR_INPUT0_SCALE]);
|
||||
vsi_nn_kernel_scalar_release(&node_params[SCALAR_INPUT0_TAIL]);
|
||||
vsi_nn_kernel_scalar_release(&node_params[SCALAR_INPUT1_SCALE]);
|
||||
vsi_nn_kernel_scalar_release(&node_params[SCALAR_INPUT1_TAIL]);
|
||||
vsi_nn_kernel_scalar_release(&node_params[SCALAR_OUTPUT_SCALE]);
|
||||
vsi_nn_kernel_scalar_release(&node_params[SCALAR_OUTPUT_TAIL]);
|
||||
vsi_nn_kernel_scalar_release(&node_params[node_params_num++]);
|
||||
vsi_nn_kernel_scalar_release(&node_params[node_params_num++]);
|
||||
vsi_nn_kernel_scalar_release(&node_params[node_params_num++]);
|
||||
vsi_nn_kernel_scalar_release(&node_params[node_params_num++]);
|
||||
vsi_nn_kernel_scalar_release(&node_params[node_params_num++]);
|
||||
vsi_nn_kernel_scalar_release(&node_params[node_params_num++]);
|
||||
}
|
||||
if (is_reflect_mode)
|
||||
{
|
||||
vsi_nn_kernel_scalar_release(&node_params[node_params_num++]);
|
||||
vsi_nn_kernel_scalar_release(&node_params[node_params_num++]);
|
||||
vsi_nn_kernel_scalar_release(&node_params[node_params_num++]);
|
||||
vsi_nn_kernel_scalar_release(&node_params[node_params_num++]);
|
||||
}
|
||||
{
|
||||
// Set default border mode.
|
||||
vx_border_t border;
|
||||
border.mode = VX_BORDER_CONSTANT;
|
||||
border.constant_value.U32 = pad_val;
|
||||
if (pad_mode == VSI_NN_PAD_MODE_CONSTANT)
|
||||
{
|
||||
border.mode = VX_BORDER_CONSTANT;
|
||||
border.constant_value.U32 = pad_val;
|
||||
}
|
||||
else
|
||||
{
|
||||
border.mode = VX_BORDER_REPLICATE;
|
||||
}
|
||||
status = vxSetNodeAttribute(
|
||||
(vx_node)node, VX_NODE_BORDER, &border, sizeof(border));
|
||||
CHECK_STATUS(status);
|
||||
|
|
|
|||
|
|
@@ -244,7 +244,10 @@ DEF_KERNEL_INITIALIZER(_gather_elements_initializer)

final:
#define SAFE_FREE_TENSOR_ATTR(_PTR) if( _PTR ) { vsi_nn_kernel_tensor_attr_release( &_PTR ); _PTR = NULL; }
    SAFE_FREE_TENSOR_ATTR(input_attr0);
    SAFE_FREE_TENSOR_ATTR(input_attr1);
    SAFE_FREE_TENSOR_ATTR(output_attr);

    return status;
} /* _gather_elements_initializer() */
@ -34,6 +34,7 @@
|
|||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "utils/vsi_nn_dtype_util_prv.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||
|
||||
__BEGIN_DECLS
|
||||
|
|
@ -1489,8 +1490,8 @@ static vsi_nn_kernel_node_t _setup
|
|||
float twoLogE = 2 * logE;
|
||||
uint32_t uint_min = 0xFBFFFFFF;
|
||||
uint32_t uint_max = 0x7BFFFFFF;
|
||||
float float_min = *(vx_float32 *)&uint_min;
|
||||
float float_max = *(vx_float32 *)&uint_max;
|
||||
float float_min = 0.0f;
|
||||
float float_max = 0.0f;
|
||||
float scale_val[9] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
|
||||
float tail_val[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
|
||||
vsi_bool is_u8_type = FALSE;
|
||||
|
|
@ -1499,6 +1500,12 @@ static vsi_nn_kernel_node_t _setup
|
|||
size_t lstm_activation_in_out_num = 0;
|
||||
uint32_t i;
|
||||
|
||||
fp32_bit_cast_t fp32_bit_cast;
|
||||
fp32_bit_cast.data = uint_min;
|
||||
float_min = fp32_bit_cast.val;
|
||||
fp32_bit_cast.data = uint_max;
|
||||
float_max = fp32_bit_cast.val;
|
||||
|
||||
_is_ln = vsi_nn_kernel_param_get_int32( params, "_is_ln" );
|
||||
_is_cifg = vsi_nn_kernel_param_get_int32( params, "_is_cifg" );
|
||||
_is_proj = vsi_nn_kernel_param_get_int32( params, "_is_proj" );
|
||||
|
|
|
|||
|
|
@ -266,6 +266,7 @@ static vsi_nn_kernel_node_t _setup
|
|||
vsi_bool is_use_u8_kernel = FALSE;
|
||||
int32_t align_corners =
|
||||
vsi_nn_kernel_param_get_int32(params, "align_corners");
|
||||
int32_t pad_mode = vsi_nn_kernel_param_get_int32(params, "padding_mode");
|
||||
uint32_t pad_val = 0;
|
||||
int32_t depth = 0;
|
||||
vsi_nn_kernel_dtype_e in0_dtype;
|
||||
|
|
@ -282,6 +283,11 @@ static vsi_nn_kernel_node_t _setup
|
|||
return NULL;
|
||||
}
|
||||
|
||||
if (pad_mode == VSI_NN_PAD_MODE_REFLECT)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
final_tensors[0] = inputs[0];
|
||||
if (inputs[1]->attr.dim_num >= 3) {
|
||||
final_shape[0] = inputs[1]->attr.size[1] * inputs[1]->attr.size[0];
|
||||
|
|
@ -382,8 +388,15 @@ static vsi_nn_kernel_node_t _setup
|
|||
{
|
||||
// Set default border mode.
|
||||
vx_border_t border;
|
||||
border.mode = VX_BORDER_CONSTANT;
|
||||
border.constant_value.U32 = pad_val;
|
||||
if (pad_mode == VSI_NN_PAD_MODE_CONSTANT)
|
||||
{
|
||||
border.mode = VX_BORDER_CONSTANT;
|
||||
border.constant_value.U32 = pad_val;
|
||||
}
|
||||
else
|
||||
{
|
||||
border.mode = VX_BORDER_REPLICATE;
|
||||
}
|
||||
status = vxSetNodeAttribute(
|
||||
(vx_node)node, VX_NODE_BORDER, &border, sizeof(border));
|
||||
CHECK_STATUS(status);
|
||||
|
|
|
|||
|
|
@ -36,6 +36,7 @@
|
|||
#include "utils/vsi_nn_util.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||
#include "utils/vsi_nn_dtype_util.h"
|
||||
#include "utils/vsi_nn_dtype_util_prv.h"
|
||||
|
||||
__BEGIN_DECLS
|
||||
|
||||
|
|
@ -242,6 +243,7 @@ static vsi_nn_kernel_node_t _setup
|
|||
vsi_size_t suffix_dim_size = 0;
|
||||
int32_t depth = vsi_nn_kernel_param_get_int32( params, "depth" );
|
||||
vsi_nn_kernel_dtype_e out_dtype;
|
||||
fp32_bit_cast_t fp32_bit_cast;
|
||||
uint32_t data[2] = {0};
|
||||
float on_value = vsi_nn_kernel_param_get_float32( params, "on_value" );
|
||||
float off_value = vsi_nn_kernel_param_get_float32( params, "off_value" );
|
||||
|
|
@ -258,8 +260,11 @@ static vsi_nn_kernel_node_t _setup
|
|||
}
|
||||
else
|
||||
{
|
||||
data[0] = *(uint32_t*)&on_value;
|
||||
data[1] = *(uint32_t*)&off_value;
|
||||
fp32_bit_cast.val = on_value;
|
||||
data[0] = fp32_bit_cast.data;
|
||||
|
||||
fp32_bit_cast.val = off_value;
|
||||
data[1] = fp32_bit_cast.data;
|
||||
}
|
||||
|
||||
axis = axis == -1 ? (int32_t)inputs[0]->attr.dim_num : (int32_t)inputs[0]->attr.dim_num - axis;
|
||||
|
|
|
|||
|
|
@@ -34,20 +34,24 @@
#include "vsi_nn_tensor_util.h"
#include "utils/vsi_nn_util.h"
#include "kernel/vsi_nn_kernel.h"
#include "libnnext/vx_lib_nnext.h"

__BEGIN_DECLS

#define _TOPK_KERNEL_SOURCE "topk"
#define STR(a) #a
// Add kernel hashtable here
#define TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES ) \
    ( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (STAGES << 16) )
#define TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES, SECTION ) \
    ( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (STAGES << 16) | (SECTION << 26))
#define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, STAGES ) \
    { TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES ), \
    { TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES, 0 ), \
      CVIVANTE_NAMESPACE("cl.topk_stage"STR(STAGES)"_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_I32"), \
      _TOPK_KERNEL_SOURCE }

#define PACK_MERGE_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \
    { TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0, 1 ), \
      CVIVANTE_NAMESPACE("cl.topk_stage_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_I32"), \
      "topk2" }

#define TOPK_ODD_EVEN_SORT_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \
    ( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) )
#define PACK_ODD_EVEN_SORT_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \
@ -111,6 +115,9 @@ static const _kernel_map_type _topk_kernel_map[] =
|
|||
PACK_KERNEL_MAP( F32, I32, 4 ),
|
||||
PACK_KERNEL_MAP( F32, I32, 5 ),
|
||||
PACK_KERNEL_MAP( F32, I32, 6 ),
|
||||
|
||||
PACK_MERGE_KERNEL_MAP(U32, U32),
|
||||
PACK_MERGE_KERNEL_MAP(I32, I32),
|
||||
};
|
||||
|
||||
static const _kernel_map_type _topk_odd_even_sort_kernel_map[] =
|
||||
|
|
@ -254,7 +261,8 @@ static vsi_status _query_kernel
|
|||
vsi_nn_kernel_t * kernel,
|
||||
vsi_nn_tensor_t * const * const inputs,
|
||||
vsi_nn_tensor_t * const * const outputs,
|
||||
int32_t num_stages
|
||||
int32_t num_stages,
|
||||
vsi_bool is_bitnoic_segment
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
|
|
@ -272,21 +280,23 @@ static vsi_status _query_kernel
|
|||
in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
|
||||
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
|
||||
|
||||
num_stages = is_bitnoic_segment ? 0 : num_stages;
|
||||
|
||||
switch (_PACK_SELECT_KEY(in_dtype, out_dtype))
|
||||
{
|
||||
case _PACK_SELECT_KEY(F32, F32):
|
||||
case _PACK_SELECT_KEY(F16, F16):
|
||||
key = TOPK_HASH_KEY( F32, F32, num_stages );
|
||||
key = TOPK_HASH_KEY( F32, F32, num_stages, is_bitnoic_segment );
|
||||
break;
|
||||
case _PACK_SELECT_KEY(U32, U32):
|
||||
case _PACK_SELECT_KEY(U16, U16):
|
||||
case _PACK_SELECT_KEY(U8, U8):
|
||||
key = TOPK_HASH_KEY( U32, U32, num_stages );
|
||||
key = TOPK_HASH_KEY( U32, U32, num_stages, is_bitnoic_segment );
|
||||
break;
|
||||
case _PACK_SELECT_KEY(I32, I32):
|
||||
case _PACK_SELECT_KEY(I16, I16):
|
||||
case _PACK_SELECT_KEY(I8, I8):
|
||||
key = TOPK_HASH_KEY( I32, I32, num_stages );
|
||||
key = TOPK_HASH_KEY( I32, I32, num_stages, is_bitnoic_segment );
|
||||
break;
|
||||
case _PACK_SELECT_KEY(F32, U32):
|
||||
case _PACK_SELECT_KEY(F16, U32):
|
||||
|
|
@ -294,7 +304,7 @@ static vsi_status _query_kernel
|
|||
case _PACK_SELECT_KEY(F16, U16):
|
||||
case _PACK_SELECT_KEY(F32, U8):
|
||||
case _PACK_SELECT_KEY(F16, U8):
|
||||
key = TOPK_HASH_KEY( F32, U32, num_stages );
|
||||
key = TOPK_HASH_KEY( F32, U32, num_stages, is_bitnoic_segment );
|
||||
break;
|
||||
case _PACK_SELECT_KEY(F32, I32):
|
||||
case _PACK_SELECT_KEY(F16, I32):
|
||||
|
|
@ -302,7 +312,7 @@ static vsi_status _query_kernel
|
|||
case _PACK_SELECT_KEY(F16, I16):
|
||||
case _PACK_SELECT_KEY(F32, I8):
|
||||
case _PACK_SELECT_KEY(F16, I8):
|
||||
key = TOPK_HASH_KEY( F32, I32, num_stages );
|
||||
key = TOPK_HASH_KEY( F32, I32, num_stages, is_bitnoic_segment );
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
|
|
@ -440,7 +450,12 @@ static vsi_nn_kernel_node_t _setup
|
|||
int32_t top_k = vsi_nn_kernel_param_get_int32(params, "top_k");
|
||||
int32_t num_stages = (int32_t)vsi_nn_max(ceil(log10(block_size / 2.0f) / log10(2.0f)), 0);
|
||||
vsi_bool is_odd_even_sort = FALSE;
|
||||
vsi_bool is_bitnoic_segment = FALSE;
|
||||
size_t param_num = _TOPK_PARAM_NUM;
|
||||
int32_t max_stages = 7 + (int32_t)log2(graph->ctx->config.subGroupSize >> 2);
|
||||
vsi_nn_kernel_dtype_e type0 = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type );
|
||||
vsi_nn_kernel_dtype_e type1 = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
|
||||
|
||||
float inputScale = vsi_nn_get_tensor_scale(inputs[0]);
|
||||
float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]);
|
||||
float outputScale = vsi_nn_get_tensor_scale(outputs[0]);
|
||||
|
|
@@ -471,9 +486,14 @@ static vsi_nn_kernel_node_t _setup
    rs_tensors[0] = vsi_nn_reshape_tensor( graph,
        inputs[0], shape[0], 2 );

    if (num_stages < 7)
    is_bitnoic_segment = (num_stages >= 9) && (top_k <= 512 && max_stages > 9) &&
        type0 == type1 && (type0 == U8 || type0 == I8 || type0 == I16 || type0 == U16 || type0 == I32 || type0 == U32);
    num_stages = is_bitnoic_segment ? 9 : num_stages;
    max_stages = is_bitnoic_segment ? max_stages : 7;

    if (num_stages < max_stages || is_bitnoic_segment)
    {
        status = _query_kernel( kernel, inputs, outputs, num_stages );
        status = _query_kernel( kernel, inputs, outputs, num_stages, is_bitnoic_segment );

        rs_tensors[1] = vsi_nn_reshape_tensor( graph,
            outputs[0], shape[1], 2 );
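num_stages is ceil(log2(block_size / 2)), as computed earlier in _setup(), and the new segmented-bitonic path takes over once a single bitonic kernel would need nine or more stages. A quick check of the stage arithmetic (illustration only):

/* --- illustrative example, not part of the diff --- */
#include <math.h>
#include <stdio.h>

int main(void)
{
    /* num_stages = ceil(log2(block_size / 2)), written with log10 as in _setup() */
    int block_sizes[] = { 128, 1024, 4096 };
    for (int i = 0; i < 3; i++)
    {
        double block_size = (double)block_sizes[i];
        int num_stages = (int)fmax(ceil(log10(block_size / 2.0) / log10(2.0)), 0.0);
        printf("block_size=%4d -> num_stages=%d\n", block_sizes[i], num_stages);
        /* 128 -> 6 (single-kernel path), 1024 -> 9, 4096 -> 11
         * (>= 9 is a candidate for the segmented bitonic merge path) */
    }
    return 0;
}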
@ -52,15 +52,26 @@ typedef enum
|
|||
#define _BILINEAR_GRID_SAMPLE_KERNEL_SOURCE(_input_type, _output_type) \
|
||||
"bilinear_grid_sample_" #_input_type "_to_" #_output_type
|
||||
|
||||
#define _BILINEAR_GRID_SAMPLE_REFLECT_KERNEL_SOURCE(_input_type, _output_type) \
|
||||
"bilinear_grid_sample_reflect_" #_input_type "_to_" #_output_type
|
||||
|
||||
// Add kernel hashtable here
|
||||
#define BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \
|
||||
((IN1_DTYPE << 20) | (IN0_DTYPE << 8) | (OUT_DTYPE))
|
||||
#define BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, REFLECT) \
|
||||
((IN1_DTYPE << 24) | (IN0_DTYPE << 16) | (OUT_DTYPE << 8) | (REFLECT))
|
||||
|
||||
#define PACK_KERNEL_MAP(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \
|
||||
{ \
|
||||
BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE), \
|
||||
CVIVANTE_NAMESPACE("evis.bilinear_grid_sample_" STR(IN0_DTYPE) "_" STR(IN1_DTYPE) "to" STR(OUT_DTYPE)), \
|
||||
_BILINEAR_GRID_SAMPLE_KERNEL_SOURCE(IN0_DTYPE, OUT_DTYPE) \
|
||||
}
|
||||
{ \
|
||||
BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 0), \
|
||||
CVIVANTE_NAMESPACE("evis.bilinear_grid_sample_" STR(IN0_DTYPE) "_" STR(IN1_DTYPE) "to" STR(OUT_DTYPE)), \
|
||||
_BILINEAR_GRID_SAMPLE_KERNEL_SOURCE(IN0_DTYPE, OUT_DTYPE) \
|
||||
}
|
||||
|
||||
#define PACK_REFLECT_KERNEL_MAP(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE) \
|
||||
{ \
|
||||
BILINEAR_GRID_SAMPLE_HASH_KEY(IN0_DTYPE, IN1_DTYPE, OUT_DTYPE, 1), \
|
||||
CVIVANTE_NAMESPACE("evis.bilinear_grid_sample_reflect_" STR(IN0_DTYPE) "_" STR(IN1_DTYPE) "to" STR(OUT_DTYPE)), \
|
||||
_BILINEAR_GRID_SAMPLE_REFLECT_KERNEL_SOURCE(IN0_DTYPE, OUT_DTYPE) \
|
||||
}
|
||||
|
||||
typedef struct
|
||||
{
|
||||
|
|
@ -83,6 +94,18 @@ static const _kernel_map_type _bilinear_grid_sample_kernel_map[] =
|
|||
PACK_KERNEL_MAP(I16, I16, I16),
|
||||
PACK_KERNEL_MAP(I8, I8, I8),
|
||||
PACK_KERNEL_MAP(BF16, BF16, BF16),
|
||||
PACK_REFLECT_KERNEL_MAP(F16, F32, F16),
|
||||
PACK_REFLECT_KERNEL_MAP(F16, U8, F16),
|
||||
PACK_REFLECT_KERNEL_MAP(F16, F16, F16),
|
||||
PACK_REFLECT_KERNEL_MAP(F16, F32, U8),
|
||||
PACK_REFLECT_KERNEL_MAP(F16, F16, U8),
|
||||
PACK_REFLECT_KERNEL_MAP(F16, U8, U8),
|
||||
PACK_REFLECT_KERNEL_MAP(U8, U8, U8),
|
||||
PACK_REFLECT_KERNEL_MAP(U8, F16, U8),
|
||||
PACK_REFLECT_KERNEL_MAP(U8, F32, U8),
|
||||
PACK_REFLECT_KERNEL_MAP(I16, I16, I16),
|
||||
PACK_REFLECT_KERNEL_MAP(I8, I8, I8),
|
||||
PACK_REFLECT_KERNEL_MAP(BF16, BF16, BF16),
|
||||
};
|
||||
|
||||
|
||||
|
|
@ -96,18 +119,20 @@ static vx_param_description_t _bilinear_grid_sample_kernel_param_def[] =
|
|||
{VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED},
|
||||
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED},
|
||||
};
|
||||
#define _BILINEAR_GRID_SAMPLE_PARAM_NUM _cnt_of_array( _bilinear_grid_sample_kernel_param_def )
|
||||
#define _BILINEAR_GRID_SAMPLE_PARAM_NUM \
|
||||
_cnt_of_array( _bilinear_grid_sample_kernel_param_def )
|
||||
|
||||
#define SCALAR_ALIGN_CORNERS (3)
|
||||
|
||||
/*
|
||||
* Kernel initializer
|
||||
*/
|
||||
DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer)
|
||||
static vsi_status _bilinear_grid_sample_initializer_base
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
|
||||
const vsi_nn_kernel_node_param_t * param,
|
||||
size_t param_size
|
||||
size_t param_size,
|
||||
vsi_bool is_reflect_mode
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
|
|
@ -135,6 +160,8 @@ DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer)
|
|||
int32_t input1ZP = 0;
|
||||
float output_scale = 1.0;
|
||||
int32_t outputZP = 0;
|
||||
float min_val_wh[4] = { 0 };
|
||||
float span_wh[4] = { 0 };
|
||||
|
||||
VSI_UNREFERENCED(param_size);
|
||||
|
||||
|
|
@ -156,6 +183,7 @@ DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer)
|
|||
(vsi_nn_kernel_scalar_t)param[SCALAR_ALIGN_CORNERS], &(align_corners));
|
||||
CHECK_STATUS_FAIL_GOTO(status, final);
|
||||
|
||||
|
||||
out_shape = output_attr->shape;
|
||||
in0_shape = input_attr[0]->shape;
|
||||
input0_dtype = input_attr[0]->dtype;
|
||||
|
|
@ -193,6 +221,35 @@ DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer)
|
|||
status |= vsi_nn_kernel_gpu_add_param(node, "add_float_value", add_float_value);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "depth", &depth);
|
||||
|
||||
if (is_reflect_mode)
|
||||
{
|
||||
float low_w, low_h, high_w, high_h;
|
||||
if (align_corners)
|
||||
{
|
||||
low_w = 0;
|
||||
low_h = 0;
|
||||
high_w = 2 * (float)(in0_width - 1);
|
||||
high_h = 2 * (float)(in0_height - 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
low_w = -1;
|
||||
low_h = -1;
|
||||
high_w = 2 * (float)in0_width - 1;
|
||||
high_h = 2 * (float)in0_height - 1;
|
||||
}
|
||||
min_val_wh[0] = low_w / 2;
|
||||
span_wh[0] = (high_w - low_w) / 2;
|
||||
min_val_wh[1] = low_h / 2;
|
||||
span_wh[1] = (high_h - low_h) / 2;
|
||||
min_val_wh[2] = min_val_wh[0];
|
||||
min_val_wh[3] = min_val_wh[1];
|
||||
span_wh[2] = span_wh[0];
|
||||
span_wh[3] = span_wh[1];
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "span_wh", span_wh);
|
||||
status |= vsi_nn_kernel_gpu_add_param(node, "min_val_wh", min_val_wh);
|
||||
}
|
||||
|
||||
{
|
||||
gpu_dp_inst_t uniFp16toFp32_part0_4x4 = {
|
||||
{
|
||||
|
|
@ -538,6 +595,28 @@ DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer)
|
|||
|
||||
|
||||
|
||||
DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_initializer)
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
|
||||
const vsi_nn_kernel_node_param_t* param,
|
||||
size_t param_size
|
||||
)
|
||||
{
|
||||
return _bilinear_grid_sample_initializer_base(
|
||||
node, param, param_size, vx_false_e);
|
||||
}
|
||||
|
||||
DEF_KERNEL_INITIALIZER(_bilinear_grid_sample_reflect_initializer)
|
||||
(
|
||||
vsi_nn_kernel_node_t node,
|
||||
const vsi_nn_kernel_node_param_t* param,
|
||||
size_t param_size
|
||||
)
|
||||
{
|
||||
return _bilinear_grid_sample_initializer_base(
|
||||
node, param, param_size, vx_true_e);
|
||||
}
|
||||
|
||||
/*
|
||||
* Query kernel
|
||||
*/
|
||||
|
|
@ -545,7 +624,8 @@ static vsi_status _query_kernel
|
|||
(
|
||||
vsi_nn_kernel_t * kernel,
|
||||
vsi_nn_tensor_t * const * const inputs,
|
||||
vsi_nn_tensor_t * const * const outputs
|
||||
vsi_nn_tensor_t * const * const outputs,
|
||||
int32_t is_reflect_mode
|
||||
)
|
||||
{
|
||||
vsi_status status = VSI_FAILURE;
|
||||
|
|
@ -563,7 +643,16 @@ static vsi_status _query_kernel
|
|||
in1_dtype = vsi_nn_kernel_map_dtype(inputs[1]->attr.dtype.vx_type);
|
||||
out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type );
|
||||
|
||||
key = BILINEAR_GRID_SAMPLE_HASH_KEY(in0_dtype, in1_dtype, out_dtype);
|
||||
key = BILINEAR_GRID_SAMPLE_HASH_KEY(in0_dtype, in1_dtype, out_dtype, is_reflect_mode);
|
||||
|
||||
if (is_reflect_mode)
|
||||
{
|
||||
initializer = _bilinear_grid_sample_reflect_initializer;
|
||||
}
|
||||
else
|
||||
{
|
||||
initializer = _bilinear_grid_sample_initializer;
|
||||
}
|
||||
|
||||
for ( i = 0; i < (uint32_t)kernel_map_size; i ++ )
|
||||
{
|
||||
|
|
@ -605,13 +694,21 @@ static vsi_nn_kernel_node_t _setup
|
|||
vsi_nn_kernel_node_param_t node_params[_BILINEAR_GRID_SAMPLE_PARAM_NUM];
|
||||
vsi_nn_kernel_node_t node = NULL;
|
||||
vsi_size_t final_shape[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1};
|
||||
uint32_t final_in1_rank = 0;
|
||||
vsi_size_t final_out_shape[VSI_NN_MAX_DIM_NUM] = { 1, 1, 1, 1 };
|
||||
uint32_t final_in1_rank = 0, final_out_rank = 0;
|
||||
vsi_nn_tensor_t* rs_tensors = NULL;
|
||||
vsi_nn_tensor_t* rs_out_tensors = NULL;
|
||||
vsi_nn_tensor_t* final_tensors[3] = {NULL};
|
||||
vsi_nn_kernel_dtype_e in0_dtype;
|
||||
uint32_t pad_val = 0;
|
||||
int32_t align_corners =
|
||||
vsi_nn_kernel_param_get_int32(params, "align_corners");
|
||||
int32_t pad_mode = vsi_nn_kernel_param_get_int32(params, "padding_mode");
|
||||
int32_t is_reflect_mode = 0;
|
||||
vsi_size_t in_size_x = inputs[1]->attr.size[1];
|
||||
vsi_size_t in_size_y = inputs[1]->attr.dim_num >= 3 ? inputs[1]->attr.size[2] : 1;
|
||||
vsi_size_t new_size_x = in_size_x, new_size_y = in_size_y;
|
||||
vsi_bool is_reshape_out = vx_false_e;
|
||||
|
||||
// Check if gpu can support the size
|
||||
if (!vsi_nn_kernel_gpu_check_shape(inputs[0]->attr.size,
|
||||
|
|
@ -624,12 +721,63 @@ static vsi_nn_kernel_node_t _setup
|
|||
return NULL;
|
||||
}
|
||||
|
||||
if (pad_mode == VSI_NN_PAD_MODE_REFLECT)
|
||||
{
|
||||
is_reflect_mode = 1;
|
||||
}
|
||||
|
||||
final_tensors[0] = inputs[0];
|
||||
|
||||
is_reshape_out = vx_false_e;
|
||||
if (inputs[1]->attr.dim_num >= 3) {
|
||||
vsi_size_t shape_x[2];
|
||||
vsi_size_t out_shape_x[2];
|
||||
vsi_size_t out_rank_x;
|
||||
shape_x[0] = in_size_x;
|
||||
shape_x[1] = in_size_y;
|
||||
vsi_nn_kernel_optimize_element_shape(shape_x, 2, out_shape_x, &out_rank_x);
|
||||
if (out_rank_x == 2)
|
||||
{
|
||||
new_size_x = out_shape_x[0];
|
||||
new_size_y = out_shape_x[1];
|
||||
}
|
||||
|
||||
final_shape[0] = inputs[1]->attr.size[1] * inputs[1]->attr.size[0];
|
||||
final_shape[1] = inputs[1]->attr.size[2];
|
||||
if ((new_size_x == in_size_x) && (new_size_y == in_size_y))
|
||||
{
|
||||
is_reshape_out = vx_false_e;
|
||||
}
|
||||
else if ((new_size_x * 2) >= GPU_TENSOR_MAX_WIDTH)
|
||||
{
|
||||
is_reshape_out = vx_false_e;
|
||||
}
|
||||
else
|
||||
{
|
||||
is_reshape_out = vx_true_e;
|
||||
}
|
||||
|
||||
if (is_reshape_out == vx_false_e)
|
||||
{
|
||||
new_size_x = in_size_x;
|
||||
new_size_y = in_size_y;
|
||||
if ((new_size_x < new_size_y) && ((new_size_y * 2) < GPU_TENSOR_MAX_WIDTH))
|
||||
{
|
||||
vsi_size_t tmp = new_size_x;
|
||||
new_size_x = new_size_y;
|
||||
new_size_y = tmp;
|
||||
is_reshape_out = vx_true_e;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if (((new_size_x * 2) >= GPU_TENSOR_MAX_WIDTH) || (new_size_y >= GPU_TENSOR_MAX_WIDTH))
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (inputs[1]->attr.dim_num >= 3) {
|
||||
final_shape[0] = new_size_x * inputs[1]->attr.size[0];
|
||||
final_shape[1] = new_size_y;
|
||||
final_shape[2] = 1;
|
||||
final_shape[3] = inputs[1]->attr.dim_num > 3 ? inputs[1]->attr.size[3] : 1;
|
||||
final_in1_rank =
|
||||
|
|
@ -643,14 +791,32 @@ static vsi_nn_kernel_node_t _setup
|
|||
} else {
|
||||
final_tensors[1] = inputs[1];
|
||||
}
|
||||
final_tensors[2] = outputs[0];
|
||||
|
||||
if (is_reshape_out)
|
||||
{
|
||||
final_out_shape[0] = new_size_x;
|
||||
final_out_shape[1] = new_size_y;
|
||||
final_out_shape[2] = outputs[0]->attr.dim_num > 2 ? outputs[0]->attr.size[2] : 1;
|
||||
final_out_shape[3] = outputs[0]->attr.dim_num > 3 ? outputs[0]->attr.size[3] : 1;
|
||||
final_out_rank = outputs[0]->attr.dim_num;
|
||||
if (!vsi_nn_kernel_gpu_check_shape(final_out_shape, final_out_rank)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
rs_out_tensors = vsi_nn_reshape_tensor(graph, outputs[0], final_out_shape, final_out_rank);
|
||||
final_tensors[2] = rs_out_tensors;
|
||||
}
|
||||
else
|
||||
{
|
||||
final_tensors[2] = outputs[0];
|
||||
}
|
||||
|
||||
in0_dtype = vsi_nn_kernel_map_dtype(inputs[0]->attr.dtype.vx_type);
|
||||
if (U8 == in0_dtype) {
|
||||
pad_val = inputs[0]->attr.dtype.zero_point;
|
||||
}
|
||||
|
||||
status = _query_kernel( kernel, inputs, outputs );
|
||||
status = _query_kernel( kernel, inputs, outputs, is_reflect_mode);
|
||||
if ( VSI_SUCCESS == status)
|
||||
{
|
||||
node = vsi_nn_kernel_create_node( graph, kernel );
|
||||
|
|
@ -662,14 +828,22 @@ static vsi_nn_kernel_node_t _setup
|
|||
node_params[SCALAR_ALIGN_CORNERS] =
|
||||
vsi_nn_kernel_scalar_create(graph, I32, &align_corners);
|
||||
/* Pass parameters to node. */
|
||||
status = vsi_nn_kernel_node_pass_param( node, node_params, _BILINEAR_GRID_SAMPLE_PARAM_NUM );
|
||||
status = vsi_nn_kernel_node_pass_param(
|
||||
node, node_params, _BILINEAR_GRID_SAMPLE_PARAM_NUM );
|
||||
VSI_ASSERT(status == VSI_SUCCESS);
|
||||
vsi_nn_kernel_scalar_release(&node_params[SCALAR_ALIGN_CORNERS]);
|
||||
{
|
||||
// Set default border mode.
|
||||
vx_border_t border;
|
||||
border.mode = VX_BORDER_CONSTANT;
|
||||
border.constant_value.U32 = pad_val;
|
||||
if (pad_mode == VSI_NN_PAD_MODE_CONSTANT)
|
||||
{
|
||||
border.mode = VX_BORDER_CONSTANT;
|
||||
border.constant_value.U32 = pad_val;
|
||||
}
|
||||
else
|
||||
{
|
||||
border.mode = VX_BORDER_REPLICATE;
|
||||
}
|
||||
status = vxSetNodeAttribute(
|
||||
(vx_node)node, VX_NODE_BORDER, &border, sizeof(border));
|
||||
CHECK_STATUS(status);
|
||||
|
|
@ -678,6 +852,7 @@ static vsi_nn_kernel_node_t _setup
|
|||
}
|
||||
|
||||
vsi_safe_release_tensor(rs_tensors);
|
||||
vsi_safe_release_tensor(rs_out_tensors);
|
||||
|
||||
return node;
|
||||
} /* _setup() */
|
||||
|
|
|
|||
|
|
@ -513,6 +513,8 @@ static vsi_nn_kernel_node_t _setup
|
|||
vsi_nn_kernel_scalar_release( &node_params[4] );
|
||||
vsi_nn_kernel_scalar_release( &node_params[5] );
|
||||
}
|
||||
|
||||
if ( node )
|
||||
{
|
||||
// Set default border mode.
|
||||
vx_border_t border;
|
||||
|
|
|
|||
|
|
@ -34,6 +34,7 @@
|
|||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "utils/vsi_nn_dtype_util_prv.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||
#include "libnnext/vx_lib_nnext.h"
|
||||
|
||||
|
|
@ -1002,8 +1003,8 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_initializer)
|
|||
float twoLogE = 2 * logE;
|
||||
uint32_t uint_min = 0xFBFFFFFF;
|
||||
uint32_t uint_max = 0x7BFFFFFF;
|
||||
float float_min = *(float *)&uint_min;
|
||||
float float_max = *(float *)&uint_max;
|
||||
float float_min = 0.0f;
|
||||
float float_max = 0.0f;
|
||||
float clip_Min_F[4] = {0};
|
||||
float clip_Max_F[4] = {0};
|
||||
uint32_t i = 0;
|
||||
|
|
@ -1017,6 +1018,12 @@ DEF_KERNEL_INITIALIZER(_lstmunit_activation_initializer)
|
|||
vsi_nn_kernel_tensor_attr_t* input_attr[9] = {NULL};
|
||||
vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL};
|
||||
|
||||
fp32_bit_cast_t fp32_bit_cast;
|
||||
fp32_bit_cast.data = uint_min;
|
||||
float_min = fp32_bit_cast.val;
|
||||
fp32_bit_cast.data = uint_max;
|
||||
float_max = fp32_bit_cast.val;
|
||||
|
||||
status = vsi_nn_kernel_scalar_read_int32( (vsi_nn_kernel_scalar_t)param[param_size - 5], &_is_ln );
|
||||
CHECK_STATUS_FAIL_GOTO(status, final );
|
||||
status = vsi_nn_kernel_scalar_read_int32( (vsi_nn_kernel_scalar_t)param[param_size - 4], &_is_cifg );
|
||||
|
|
|
|||
|
|
@ -539,6 +539,7 @@ static vsi_nn_kernel_node_t _setup
|
|||
uint32_t pad_val = 0;
|
||||
int32_t align_corners =
|
||||
vsi_nn_kernel_param_get_int32(params, "align_corners");
|
||||
int32_t pad_mode = vsi_nn_kernel_param_get_int32(params, "padding_mode");
|
||||
|
||||
// Check if gpu can support the size
|
||||
if (!vsi_nn_kernel_gpu_check_shape(inputs[0]->attr.size,
|
||||
|
|
@ -551,6 +552,11 @@ static vsi_nn_kernel_node_t _setup
|
|||
return NULL;
|
||||
}
|
||||
|
||||
if (pad_mode == VSI_NN_PAD_MODE_REFLECT)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
final_tensors[0] = inputs[0];
|
||||
|
||||
if (inputs[1]->attr.dim_num >= 3) {
|
||||
|
|
@ -596,8 +602,15 @@ static vsi_nn_kernel_node_t _setup
|
|||
{
|
||||
// Set default border mode.
|
||||
vx_border_t border;
|
||||
border.mode = VX_BORDER_CONSTANT;
|
||||
border.constant_value.U32 = pad_val;
|
||||
if (pad_mode == VSI_NN_PAD_MODE_CONSTANT)
|
||||
{
|
||||
border.mode = VX_BORDER_CONSTANT;
|
||||
border.constant_value.U32 = pad_val;
|
||||
}
|
||||
else
|
||||
{
|
||||
border.mode = VX_BORDER_REPLICATE;
|
||||
}
|
||||
status = vxSetNodeAttribute(
|
||||
(vx_node)node, VX_NODE_BORDER, &border, sizeof(border));
|
||||
CHECK_STATUS(status);
|
||||
|
|
|
|||
|
|
@ -754,7 +754,7 @@ static vsi_nn_kernel_node_t _setup
|
|||
if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensor->attr.size,
|
||||
outputs[0]->attr.dim_num ) )
|
||||
{
|
||||
return NULL;
|
||||
goto final;
|
||||
}
|
||||
|
||||
if ( width == (int32_t)inputs[0]->attr.size[0] && height == (int32_t)inputs[0]->attr.size[1] &&
|
||||
|
|
|
|||
|
|
@ -915,7 +915,7 @@ static vsi_nn_kernel_node_t _setup
|
|||
if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensor->attr.size,
|
||||
outputs[0]->attr.dim_num ) )
|
||||
{
|
||||
return NULL;
|
||||
goto final;
|
||||
}
|
||||
|
||||
if ( width == (int32_t)inputs[0]->attr.size[0] && height == (int32_t)inputs[0]->attr.size[1] &&
|
||||
|
|
|
|||
|
|
@ -290,13 +290,14 @@ static vsi_nn_tensor_t* _create_scale_tensor
|
|||
vsi_nn_tensor_t* scale = NULL;
|
||||
vsi_size_t i = 0;
|
||||
float *scale_data_ptr = NULL;
|
||||
int *index_data_ptr = NULL;
|
||||
int32_t *index_data_ptr = NULL;
|
||||
float scale_value = 0;
|
||||
vsi_ssize_t data = 0;
|
||||
int idx = 0;
|
||||
int32_t idx = 0;
|
||||
float delta_v = 0;
|
||||
float cubic_coeff_a = -0.5f;
|
||||
vsi_size_t item_count = 4 * output_size;
|
||||
|
||||
scale_data_ptr = (float *)malloc(item_count * sizeof(float));
|
||||
if (scale_data_ptr == NULL)
|
||||
{
|
||||
|
|
@ -316,7 +317,7 @@ static vsi_nn_tensor_t* _create_scale_tensor
|
|||
scale_value = ((float)i + half_pixel_value) * scale_factor - half_pixel_value;
|
||||
data = (vsi_ssize_t)scale_value;
|
||||
delta_v = scale_value - (float)data;
|
||||
idx = (int)data - 1;
|
||||
idx = (int32_t)data - 1;
|
||||
|
||||
index_data_ptr[i] = idx;
|
||||
scale_data_ptr[i * 4 + 0] = cubic_coeff_a * (((delta_v - 4) * (delta_v + 1) + 8) * (delta_v + 1) - 4);
|
||||
|
|
@ -331,11 +332,6 @@ static vsi_nn_tensor_t* _create_scale_tensor
|
|||
attr.vtl = FALSE;
|
||||
|
||||
scale = vsi_nn_CreateTensorFromData(graph, (uint8_t *)scale_data_ptr, &attr);
|
||||
if (scale_data_ptr)
|
||||
{
|
||||
free (scale_data_ptr);
|
||||
scale_data_ptr = NULL;
|
||||
}
|
||||
|
||||
attr.size[0] = output_size;
|
||||
attr.dim_num = 1;
|
||||
|
|
@ -343,13 +339,11 @@ static vsi_nn_tensor_t* _create_scale_tensor
|
|||
attr.vtl = FALSE;
|
||||
|
||||
*index = vsi_nn_CreateTensorFromData(graph, (uint8_t *)index_data_ptr, &attr);
|
||||
if (index_data_ptr)
|
||||
{
|
||||
free (index_data_ptr);
|
||||
index_data_ptr = NULL;
|
||||
}
|
||||
|
||||
OnError:
|
||||
vsi_nn_safe_free(scale_data_ptr);
|
||||
vsi_nn_safe_free(index_data_ptr);
|
||||
|
||||
return scale;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1218,6 +1218,11 @@ vsi_nn_kernel_node_t vsi_nn_kernel_selector
|
|||
status = backend->select( graph, inputs, input_num, outputs, output_num,
|
||||
params, &selector );
|
||||
VSI_ASSERT( status == VSI_SUCCESS );
|
||||
|
||||
if ( status != VSI_SUCCESS ) {
|
||||
VSILOGW("Failed to select kernel \"%s\"", kernel_name);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
|
|
|
|||
|
|
@ -171,7 +171,9 @@ vsi_status vsi_nn_kernel_copy_tensor_patch
|
|||
|
||||
vsi_nn_kernel_tensor_attr_get_stride( attr, stride );
|
||||
memset(start, 0, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM);
|
||||
for (i = 0; i < VSI_NN_MAX_DIM_NUM; i++)
|
||||
memset(end, 0, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM);
|
||||
|
||||
for (i = 0; i < (uint32_t)attr->shape->size; i++)
|
||||
{
|
||||
end[i] = attr->shape->data[i];
|
||||
if ( attr->dtype != I4 && attr->dtype != U4 )
|
||||
|
|
@ -490,7 +492,7 @@ vsi_status vsi_nn_kernel_scalar_get_dtype
|
|||
( vsi_nn_kernel_scalar_t scalar, DTYPE * ptr ) \
|
||||
{ \
|
||||
vsi_status status; \
|
||||
vsi_nn_kernel_dtype_e dtype; \
|
||||
vsi_nn_kernel_dtype_e dtype = INVALID_DTYPE; \
|
||||
if( !ptr ) \
|
||||
{ \
|
||||
VSILOGE("Pointer to store scalar is null"); \
|
||||
|
|
|
|||
|
|
@ -0,0 +1,94 @@
|
|||
/****************************************************************************
|
||||
*
|
||||
* Copyright (c) 2021 Vivante Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*****************************************************************************/
|
||||
|
||||
#include "vsi_nn_types.h"
|
||||
#include "vsi_nn_tensor.h"
|
||||
#include "vsi_nn_node.h"
|
||||
#include "vsi_nn_log.h"
|
||||
#include "vsi_nn_prv.h"
|
||||
#include "vsi_nn_tensor_util.h"
|
||||
#include "kernel/vsi_nn_kernel.h"
|
||||
|
||||
|
||||
#define REGISTER_RMS_NORM_OPENVX_KERNEL( kernel_name ) \
|
||||
static vsi_nn_kernel_node_t _##kernel_name##setup \
|
||||
( \
|
||||
vsi_nn_graph_t * graph, \
|
||||
vsi_nn_tensor_t ** inputs, \
|
||||
size_t input_num, \
|
||||
vsi_nn_tensor_t ** outputs, \
|
||||
size_t output_num,\
|
||||
const vsi_nn_kernel_param_t * params, \
|
||||
vsi_nn_kernel_t * kernel \
|
||||
); \
|
||||
REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \
|
||||
static vsi_nn_kernel_node_t _##kernel_name##setup \
|
||||
( \
|
||||
vsi_nn_graph_t * graph, \
|
||||
vsi_nn_tensor_t ** inputs, \
|
||||
size_t input_num, \
|
||||
vsi_nn_tensor_t ** outputs, \
|
||||
size_t output_num,\
|
||||
const vsi_nn_kernel_param_t * params, \
|
||||
vsi_nn_kernel_t * kernel \
|
||||
)
|
||||
|
||||
REGISTER_RMS_NORM_OPENVX_KERNEL(rms_norm)
|
||||
{
|
||||
vx_node node = NULL;
|
||||
|
||||
#if (VX_RMS_NORM_VX_SUPPORT)
|
||||
float eps = vsi_nn_kernel_param_get_float32( params, "eps" );
|
||||
int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" );
|
||||
vx_tensor inputs_tensor[2] = {NULL};
|
||||
vx_tensor output_tensor = NULL;
|
||||
|
||||
VSI_UNREFERENCED(kernel);
|
||||
VSI_UNREFERENCED(output_num);
|
||||
|
||||
inputs_tensor[0] = inputs[0]->t;
|
||||
inputs_tensor[1] = inputs[1]->t;
|
||||
output_tensor = outputs[0]->t;
|
||||
|
||||
node = vxRMSNormalizationLayer(
|
||||
graph->g,
|
||||
eps,
|
||||
axis,
|
||||
inputs_tensor,
|
||||
(uint32_t)input_num,
|
||||
output_tensor
|
||||
);
|
||||
#else
|
||||
VSI_UNREFERENCED(output_num);
|
||||
VSI_UNREFERENCED(graph);
|
||||
VSI_UNREFERENCED(inputs);
|
||||
VSI_UNREFERENCED(input_num);
|
||||
VSI_UNREFERENCED(outputs);
|
||||
VSI_UNREFERENCED(output_num);
|
||||
VSI_UNREFERENCED(params);
|
||||
VSI_UNREFERENCED(kernel);
|
||||
#endif
|
||||
|
||||
return (vsi_nn_kernel_node_t)node;
|
||||
} /* rms_norm() */
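Note: vxRMSNormalizationLayer is only reachable when the driver defines VX_RMS_NORM_VX_SUPPORT. For reference, a minimal C sketch of the RMS normalization the new RMSNORM op computes along one axis, assuming inputs[1] is the per-element scale (gamma) tensor and eps comes from vsi_nn_rmsnorm_param:

#include <math.h>
#include <stddef.h>

/* Minimal sketch of RMS normalization over one axis of length n:
 *     y[i] = x[i] / sqrt(mean(x*x) + eps) * gamma[i]                        */
static void rms_norm_1d(const float *x, const float *gamma,
                        float *y, size_t n, float eps)
{
    float sum_sq = 0.0f;
    for (size_t i = 0; i < n; i++)
    {
        sum_sq += x[i] * x[i];
    }
    float inv_rms = 1.0f / sqrtf(sum_sq / (float)n + eps);
    for (size_t i = 0; i < n; i++)
    {
        y[i] = x[i] * inv_rms * gamma[i];
    }
}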
@ -0,0 +1,169 @@
|
|||
__kernel void bilinear_grid_sample_reflect_F32_F32toF32(
|
||||
__read_only image2d_array_t input0,
|
||||
__read_only image2d_t input1,
|
||||
__write_only image2d_array_t output,
|
||||
float half_input0_w,
|
||||
float half_input0_h,
|
||||
float add_float_value_w,
|
||||
float add_float_value_h,
|
||||
int depth,
|
||||
float min_val_w,
|
||||
float span_w,
|
||||
float min_val_h,
|
||||
float span_h
|
||||
)
|
||||
{
|
||||
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
|
||||
int2 coord_in1 = (int2)(get_global_id(0) * 2, get_global_id(1));
|
||||
int2 coord_add = (int2)(-1, 1);
|
||||
|
||||
float fx = read_imagef(input1, coord_in1).x;
|
||||
coord_in1.x = coord_in1.x + 1;
|
||||
float fy = read_imagef(input1, coord_in1).x;
|
||||
|
||||
fx = fx * half_input0_w + add_float_value_w;
|
||||
fy = fy * half_input0_h + add_float_value_h;
|
||||
|
||||
if (span_w > 0)
|
||||
{
|
||||
fx = fabs(fx - min_val_w);
|
||||
int flips_x = (int)(fx / span_w);
|
||||
float extra_x = fx - flips_x * span_w;
|
||||
fx = (flips_x & 0x01) ? min_val_w + (span_w - extra_x) : min_val_w + extra_x ;
|
||||
}
|
||||
else
|
||||
{
|
||||
fx = 0;
|
||||
}
|
||||
|
||||
if (span_h > 0)
|
||||
{
|
||||
fy = fabs(fy - min_val_h);
|
||||
int flips_y = (int)(fy / span_h);
|
||||
float extra_y = fy - flips_y * span_h;
|
||||
fy = (flips_y & 0x01) ? min_val_h + (span_h - extra_y) : min_val_h + extra_y ;
|
||||
}
|
||||
else
|
||||
{
|
||||
fy = 0;
|
||||
}
|
||||
|
||||
float x_f = floor(fx);
|
||||
float y_f = floor(fy);
|
||||
float x_lerp = fx - x_f;
|
||||
float y_lerp = fy - y_f;
|
||||
int x_index = convert_int(x_f);
|
||||
int y_index = convert_int(y_f);
|
||||
int4 coord_in = (int4)(x_index, y_index, 0, 0);
|
||||
|
||||
float4 top_l, top_r, bottom_l, bottom_r, top, bottom, dst;
|
||||
|
||||
while (coord_in.z < depth){
|
||||
top_l = read_imagef(input0, coord_in);
|
||||
coord_in.y++;
|
||||
bottom_l = read_imagef(input0, coord_in);
|
||||
coord_in.x++;
|
||||
bottom_r = read_imagef(input0, coord_in);
|
||||
coord_in.y--;
|
||||
top_r = read_imagef(input0, coord_in);
|
||||
top_r = top_r - top_l;
|
||||
top = top_l + x_lerp * top_r;
|
||||
bottom_r = bottom_r - bottom_l;
|
||||
bottom = bottom_l + x_lerp * bottom_r;
|
||||
bottom = bottom - top;
|
||||
dst = top + y_lerp * bottom;
|
||||
write_imagef(output, coord_out, dst);
|
||||
coord_in.xz = coord_in.xz + coord_add;
|
||||
coord_out.z++;
|
||||
}
|
||||
}
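Note: the reflect handling above folds an out-of-range sampling coordinate back into the valid interval by counting how many full spans it crosses and mirroring on odd crossings. A scalar C sketch of the same folding, assuming min_val/span describe the start and length of the valid range:

#include <math.h>

/* Scalar sketch of reflect padding for one coordinate: distances are measured
 * from min_val; an odd number of full spans mirrors the coordinate back into
 * [min_val, min_val + span].  span <= 0 falls back to 0, as the kernel does. */
static float reflect_coord(float x, float min_val, float span)
{
    if (span <= 0.0f)
    {
        return 0.0f;
    }
    float d = fabsf(x - min_val);
    int flips = (int)(d / span);
    float extra = d - (float)flips * span;
    return (flips & 1) ? min_val + (span - extra) : min_val + extra;
}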
__kernel void bilinear_grid_sample_reflect_U8_U8toU8(
|
||||
__read_only image2d_array_t input0,
|
||||
__read_only image2d_t input1,
|
||||
__write_only image2d_array_t output,
|
||||
float half_input0_w,
|
||||
float half_input0_h,
|
||||
float add_float_value_w,
|
||||
float add_float_value_h,
|
||||
int depth,
|
||||
float in0_scale,
|
||||
float in0_tail,
|
||||
float in1_scale,
|
||||
float in1_tail,
|
||||
float out_scale,
|
||||
float out_tail,
|
||||
float min_val_w,
|
||||
float span_w,
|
||||
float min_val_h,
|
||||
float span_h
|
||||
)
|
||||
{
|
||||
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
|
||||
int2 coord_in1 = (int2)(get_global_id(0) * 2, get_global_id(1));
|
||||
int2 coord_add = (int2)(-1, 1);
|
||||
|
||||
float fx = convert_float4(read_imageui(input1, coord_in1)).x * in1_scale + in1_tail;
|
||||
coord_in1.x = coord_in1.x + 1;
|
||||
float fy = convert_float4(read_imageui(input1, coord_in1)).x * in1_scale + in1_tail;
|
||||
|
||||
fx = fx * half_input0_w + add_float_value_w;
|
||||
fy = fy * half_input0_h + add_float_value_h;
|
||||
|
||||
if (span_w > 0)
|
||||
{
|
||||
fx = fabs(fx - min_val_w);
|
||||
int flips_x = (int)(fx / span_w);
|
||||
float extra_x = fx - flips_x * span_w;
|
||||
fx = (flips_x & 0x01) ? min_val_w + (span_w - extra_x) : min_val_w + extra_x ;
|
||||
}
|
||||
else
|
||||
{
|
||||
fx = 0;
|
||||
}
|
||||
|
||||
if (span_h > 0)
|
||||
{
|
||||
fy = fabs(fy - min_val_h);
|
||||
int flips_y = (int)(fy / span_h);
|
||||
float extra_y = fy - flips_y * span_h;
|
||||
fy = (flips_y & 0x01) ? min_val_h + (span_h - extra_y) : min_val_h + extra_y ;
|
||||
}
|
||||
else
|
||||
{
|
||||
fy = 0;
|
||||
}
|
||||
|
||||
float x_f = floor(fx);
|
||||
float y_f = floor(fy);
|
||||
float x_lerp = fx - x_f;
|
||||
float y_lerp = fy - y_f;
|
||||
int x_index = convert_int(x_f);
|
||||
int y_index = convert_int(y_f);
|
||||
int4 coord_in = (int4)(x_index, y_index, 0, 0);
|
||||
|
||||
float4 top_l, top_r, bottom_l, bottom_r, top, bottom;
|
||||
uint4 dst;
|
||||
|
||||
while (coord_in.z < depth){
|
||||
top_l = convert_float4(read_imageui(input0, coord_in)) * in0_scale + in0_tail;
|
||||
coord_in.y++;
|
||||
bottom_l = convert_float4(read_imageui(input0, coord_in)) * in0_scale + in0_tail;
|
||||
coord_in.x++;
|
||||
bottom_r = convert_float4(read_imageui(input0, coord_in)) * in0_scale + in0_tail;
|
||||
coord_in.y--;
|
||||
top_r = convert_float4(read_imageui(input0, coord_in)) * in0_scale + in0_tail;
|
||||
top_r = top_r - top_l;
|
||||
top = top_l + x_lerp * top_r;
|
||||
bottom_r = bottom_r - bottom_l;
|
||||
bottom = bottom_l + x_lerp * bottom_r;
|
||||
bottom = bottom - top;
|
||||
top = top + y_lerp * bottom;
|
||||
dst = convert_uint4_rte(top * out_scale + out_tail);
|
||||
write_imageui(output, coord_out, dst);
|
||||
coord_in.xz = coord_in.xz + coord_add;
|
||||
coord_out.z++;
|
||||
}
|
||||
|
||||
}
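Note: the U8 variant dequantizes on load and requantizes on store with affine scale/tail pairs. A scalar C sketch of that round trip, assuming the tail terms already fold in the zero points (as the single multiply-add per read suggests):

#include <math.h>
#include <stdint.h>

/* Scalar sketch of the U8 round trip: dequantize with (q * scale + tail),
 * do the interpolation in float, then requantize with round-to-nearest like
 * convert_uint4_rte, clamping to the u8 range. */
static float dequantize_u8(uint8_t q, float in_scale, float in_tail)
{
    return (float)q * in_scale + in_tail;
}

static uint8_t requantize_u8(float interpolated, float out_scale, float out_tail)
{
    float out = interpolated * out_scale + out_tail;
    if (out < 0.0f)   out = 0.0f;
    if (out > 255.0f) out = 255.0f;
    return (uint8_t)lrintf(out);
}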
@ -1,4 +1,4 @@
|
|||
#define VSI_FLOAT32_MIN (1.175494351e-38F)
|
||||
#define VSI_FLOAT32_MIN (-3.40E+38)
|
||||
|
||||
#define MAXPOOL_QINT(in_name, out_name, src_type, dst_type, max_val, read_func, write_func, conv_func) \
|
||||
__kernel void maxpool_##in_name##to##out_name( \
|
||||
|
|
@ -45,7 +45,7 @@ __kernel void maxpool_##in_name##to##out_name( \
|
|||
{ \
|
||||
src0 = read_func(input, coord_in); \
|
||||
coord_in.x += dilation_x; \
|
||||
maxVal = max(src0, maxVal); \
|
||||
maxVal.x = src0.x > maxVal.x ? src0.x : maxVal.x; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
|
|
@ -101,7 +101,7 @@ __kernel void maxpool_F32toF32(
|
|||
{
|
||||
src0 = read_imagef(input, coord_in);
|
||||
coord_in.x += dilation_x;
|
||||
maxVal = max(src0, maxVal);
|
||||
maxVal.x = src0.x > maxVal.x ? src0.x : maxVal.x;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -152,7 +152,7 @@ __kernel void maxpool_U32toF32(
|
|||
{
|
||||
src0 = read_imageui(input, coord_in);
|
||||
coord_in.x += dilation_x;
|
||||
maxVal = max(src0, maxVal);
|
||||
maxVal.x = src0.x > maxVal.x ? src0.x : maxVal.x;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -206,7 +206,7 @@ __kernel void maxpool_F32toU32(
|
|||
{
|
||||
src0 = read_imagef(input, coord_in);
|
||||
coord_in.x += dilation_x;
|
||||
maxVal = max(src0, maxVal);
|
||||
maxVal.x = src0.x > maxVal.x ? src0.x : maxVal.x;
|
||||
}
|
||||
}
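Note: the maxpool hunks replace the vector max() with an explicit compare on the .x lane and redefine VSI_FLOAT32_MIN near -FLT_MAX, so the running maximum starts from a very large negative value rather than the smallest positive normal. A scalar C sketch of the intended accumulation (the macro and helper names here are illustrative only, not part of this commit):

/* Scalar sketch of the max-pooling accumulation: start from a very large
 * negative identity and keep the larger sample. */
#define MAX_IDENTITY_F32 (-3.40E+38f)

static float maxpool_row(const float *src, int kernel_size, int dilation)
{
    float max_val = MAX_IDENTITY_F32;
    for (int i = 0; i < kernel_size; i++)
    {
        float s = src[i * dilation];
        max_val = s > max_val ? s : max_val;   /* mirrors maxVal.x = src0.x > maxVal.x ? ... */
    }
    return max_val;
}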
@ -51,7 +51,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
|
|||
float left_elem = local_data[left_id]; \
|
||||
float right_elem = local_data[right_id]; \
|
||||
\
|
||||
if ((left_elem < right_elem) ^ signo) \
|
||||
if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
|
||||
{ \
|
||||
local_data[left_id] = right_elem; \
|
||||
local_data[right_id] = left_elem; \
|
||||
|
|
@ -139,7 +139,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
|
|||
uint left_elem = local_data[left_id]; \
|
||||
uint right_elem = local_data[right_id]; \
|
||||
\
|
||||
if ((left_elem < right_elem) ^ signo) \
|
||||
if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
|
||||
{ \
|
||||
local_data[left_id] = right_elem; \
|
||||
local_data[right_id] = left_elem; \
|
||||
|
|
@ -227,7 +227,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
|
|||
int left_elem = local_data[left_id]; \
|
||||
int right_elem = local_data[right_id]; \
|
||||
\
|
||||
if ((left_elem < right_elem) ^ signo) \
|
||||
if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
|
||||
{ \
|
||||
local_data[left_id] = right_elem; \
|
||||
local_data[right_id] = left_elem; \
|
||||
|
|
@ -315,7 +315,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
|
|||
float left_elem = local_data[left_id]; \
|
||||
float right_elem = local_data[right_id]; \
|
||||
\
|
||||
if ((left_elem < right_elem) ^ signo) \
|
||||
if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
|
||||
{ \
|
||||
local_data[left_id] = right_elem; \
|
||||
local_data[right_id] = left_elem; \
|
||||
|
|
@ -403,7 +403,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag
|
|||
float left_elem = local_data[left_id]; \
|
||||
float right_elem = local_data[right_id]; \
|
||||
\
|
||||
if ((left_elem < right_elem) ^ signo) \
|
||||
if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
|
||||
{ \
|
||||
local_data[left_id] = right_elem; \
|
||||
local_data[right_id] = left_elem; \
@ -0,0 +1,368 @@
|
|||
|
||||
#define BITONIC_STEP(dtype) \
|
||||
void bitonic_step_##dtype(uint num_stages, int lx, \
|
||||
__local dtype *local_data, __local int *local_indices) \
|
||||
{ \
|
||||
for (uint stage = 0; stage < num_stages + 1; ++stage) \
|
||||
{ \
|
||||
uint signo = (lx >> stage) & 1; \
|
||||
\
|
||||
for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \
|
||||
{ \
|
||||
uint postShift = (stage - passOfStage); \
|
||||
uint pairDistance = 1 << postShift; \
|
||||
\
|
||||
uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \
|
||||
uint right_id = left_id + pairDistance; \
|
||||
\
|
||||
int left_idx = local_indices[left_id]; \
|
||||
int right_idx = local_indices[right_id]; \
|
||||
\
|
||||
dtype left_elem = local_data[left_id]; \
|
||||
dtype right_elem = local_data[right_id]; \
|
||||
\
|
||||
if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
|
||||
{ \
|
||||
local_data[left_id] = right_elem; \
|
||||
local_data[right_id] = left_elem; \
|
||||
\
|
||||
local_indices[left_id] = right_idx; \
|
||||
local_indices[right_id] = left_idx; \
|
||||
} \
|
||||
\
|
||||
barrier(CLK_LOCAL_MEM_FENCE); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
BITONIC_STEP(int)
|
||||
BITONIC_STEP(uint)
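Note: BITONIC_STEP and the related macros below break ties on equal keys by comparing the stored indices, which makes the compare-exchange deterministic when duplicates occur. A scalar C sketch of one such exchange:

/* Scalar sketch of one compare-exchange with the index tie-break; signo
 * flips the direction so the network produces the bitonic pattern. */
static void compare_exchange_int(int *data, int *indices,
                                 unsigned left_id, unsigned right_id, unsigned signo)
{
    int left_elem  = data[left_id];
    int right_elem = data[right_id];
    int left_idx   = indices[left_id];
    int right_idx  = indices[right_id];

    if ((unsigned)(left_elem < right_elem ||
                   (left_elem == right_elem && left_idx < right_idx)) ^ signo)
    {
        data[left_id]     = right_elem;
        data[right_id]    = left_elem;
        indices[left_id]  = right_idx;
        indices[right_id] = left_idx;
    }
}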
#define BITONIC_STEP_ASCEND(dtype) \
|
||||
void bitonic_step_ascend_##dtype(uint num_stages, int lx, \
|
||||
__local dtype *p_share_k, __local int *p_share_v) \
|
||||
{ \
|
||||
for (uint stage = 0; stage < num_stages + 1; ++stage) \
|
||||
{ \
|
||||
uint signo = (lx >> stage) & 1; \
|
||||
\
|
||||
for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \
|
||||
{ \
|
||||
uint postShift = (stage - passOfStage); \
|
||||
uint pairDistance = 1 << postShift; \
|
||||
\
|
||||
uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \
|
||||
uint right_id = left_id + pairDistance; \
|
||||
\
|
||||
int left_idx = p_share_v[left_id]; \
|
||||
int right_idx = p_share_v[right_id]; \
|
||||
\
|
||||
dtype left_elem = p_share_k[left_id]; \
|
||||
dtype right_elem = p_share_k[right_id]; \
|
||||
\
|
||||
if ((left_elem > right_elem || (left_elem == right_elem && left_idx > right_idx)) ^ signo) \
|
||||
{ \
|
||||
p_share_k[left_id] = right_elem; \
|
||||
p_share_k[right_id] = left_elem; \
|
||||
\
|
||||
p_share_v[left_id] = right_idx; \
|
||||
p_share_v[right_id] = left_idx; \
|
||||
} \
|
||||
\
|
||||
barrier(CLK_LOCAL_MEM_FENCE); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
BITONIC_STEP_ASCEND(int)
|
||||
BITONIC_STEP_ASCEND(uint)
|
||||
|
||||
#define BITONIC_MERGE(dtype) \
|
||||
void bitonic_merge_##dtype(uint num_stages, int lx, \
|
||||
__local dtype *local_data, __local int *local_indices) \
|
||||
{ \
|
||||
uint stage = num_stages; \
|
||||
uint signo = (lx >> stage) & 1; \
|
||||
\
|
||||
for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \
|
||||
{ \
|
||||
uint postShift = (stage - passOfStage); \
|
||||
uint pairDistance = 1 << postShift; \
|
||||
\
|
||||
uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \
|
||||
uint right_id = left_id + pairDistance; \
|
||||
\
|
||||
int left_idx = local_indices[left_id]; \
|
||||
int right_idx = local_indices[right_id]; \
|
||||
\
|
||||
dtype left_elem = local_data[left_id]; \
|
||||
dtype right_elem = local_data[right_id]; \
|
||||
\
|
||||
if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \
|
||||
{ \
|
||||
local_data[left_id] = right_elem; \
|
||||
local_data[right_id] = left_elem; \
|
||||
\
|
||||
local_indices[left_id] = right_idx; \
|
||||
local_indices[right_id] = left_idx; \
|
||||
} \
|
||||
\
|
||||
barrier(CLK_LOCAL_MEM_FENCE); \
|
||||
} \
|
||||
}
|
||||
BITONIC_MERGE(int)
|
||||
BITONIC_MERGE(uint)
|
||||
|
||||
#define BLOCK_SIZE (512)
|
||||
|
||||
__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, 1, 1))) void topk_stage_I32toI32_I32
|
||||
(
|
||||
__read_only image2d_t input,
|
||||
__write_only image2d_t output,
|
||||
__write_only image2d_t indices,
|
||||
float input_scale,
|
||||
float input_tail,
|
||||
float output_scale,
|
||||
float output_tail,
|
||||
int _num_stages,
|
||||
int width
|
||||
)
|
||||
{
|
||||
uint lx = get_local_id(0);
|
||||
const int init_k = -2147483647;
|
||||
const int init_v = -2147483647;
|
||||
const int num_stages = 9;
|
||||
const int threads_per_block = BLOCK_SIZE;
|
||||
const int index_minus_1 = threads_per_block * 2 - 1;
|
||||
uint offset = 0;
|
||||
uint lx1 = lx + threads_per_block;
|
||||
|
||||
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
|
||||
|
||||
__local int local_data[1536];
|
||||
__local int local_indices[1536];
|
||||
|
||||
int left = read_imagei(input, coord.xy).x;
|
||||
coord.z += threads_per_block;
|
||||
int right = read_imagei(input, coord.zy).x;
|
||||
|
||||
local_data[lx] = left;
|
||||
local_indices[lx] = coord.x;
|
||||
local_data[lx1] = right;
|
||||
local_indices[lx1] = coord.z;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
bitonic_step_int(num_stages, lx, local_data, local_indices);
|
||||
|
||||
int min_data = local_data[511];
|
||||
|
||||
int *p_share_k = local_data + threads_per_block;
|
||||
int *p_share_v = local_indices + threads_per_block;
|
||||
|
||||
int limit = (width >> 10) << 10;
|
||||
p_share_k[lx] = init_k;
|
||||
p_share_v[lx] = init_v;
|
||||
|
||||
p_share_k[lx1] = init_k;
|
||||
p_share_v[lx1] = init_v;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
for (coord.x = lx + threads_per_block * 2; coord.x < limit; coord.x = coord.x + threads_per_block * 2)
|
||||
{
|
||||
int2 data;
|
||||
coord.z = coord.x + threads_per_block;
|
||||
data.x = read_imagei(input, coord.xy).x;
|
||||
data.y = read_imagei(input, coord.zy).x;
|
||||
|
||||
p_share_k[lx] = data.x;
|
||||
p_share_v[lx] = coord.x;
|
||||
|
||||
p_share_k[lx1] = data.y;
|
||||
p_share_v[lx1] = coord.z;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
bitonic_step_ascend_int(num_stages, lx, p_share_k, p_share_v);
|
||||
|
||||
if (p_share_k[index_minus_1] < min_data)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
p_share_k[lx] = p_share_k[lx1];
|
||||
p_share_v[lx] = p_share_v[lx1];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
bitonic_merge_int(num_stages, lx, local_data, local_indices);
|
||||
|
||||
min_data = local_data[511];
|
||||
p_share_k[lx] = init_k;
|
||||
p_share_v[lx] = init_v;
|
||||
p_share_k[lx1] = init_k;
|
||||
p_share_v[lx1] = init_v;
|
||||
}
|
||||
|
||||
if (width > limit)
|
||||
{
|
||||
if (coord.x < width)
|
||||
{
|
||||
int2 data;
|
||||
data.x = read_imagei(input, coord.xy).x;
|
||||
coord.z = coord.x + threads_per_block;
|
||||
data.y = read_imagei(input, coord.zy).x;
|
||||
|
||||
p_share_k[lx] = data.x;
|
||||
p_share_v[lx] = coord.x;
|
||||
|
||||
p_share_k[lx1] = coord.z < width ? data.y : init_k;
|
||||
p_share_v[lx1] = coord.z < width ? coord.z : init_v;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
bitonic_step_ascend_int(num_stages, lx, p_share_k, p_share_v);
|
||||
|
||||
if (p_share_k[index_minus_1] >= min_data)
|
||||
{
|
||||
p_share_k[lx] = p_share_k[lx1];
|
||||
p_share_v[lx] = p_share_v[lx1];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
bitonic_merge_int(num_stages, lx, local_data, local_indices);
|
||||
}
|
||||
}
|
||||
|
||||
int4 dst;
|
||||
dst.x = local_data[lx];
|
||||
|
||||
coord.x = lx;
|
||||
write_imagei(output, coord.xy, dst.xxxx);
|
||||
|
||||
int4 index;
|
||||
index.x = local_indices[lx];
|
||||
|
||||
write_imagei(indices, coord.xy, index.xxxx);
|
||||
}
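Note: topk_stage_I32toI32_I32 keeps the running top-512 in the first half of local_data: the first 1024 values are bitonic-sorted, each later chunk of 1024 is sorted and skipped outright when its best candidate is below the current 512th value (min_data), otherwise its top half is merged back in with bitonic_merge_int. A host-side C sketch of that selection idea, assuming the input length is a multiple of the chunk size and using a hypothetical sort_desc() helper:

#include <stddef.h>
#include <string.h>

#define TOPK_K     512            /* entries the kernel keeps, one per work-item */
#define TOPK_CHUNK (2 * TOPK_K)   /* values consumed per iteration */

/* Hypothetical descending sort helper (e.g. qsort with a reversed comparator). */
extern void sort_desc(int *v, size_t n);

/* Host-side sketch of the chunked selection; assumes n is a multiple of
 * TOPK_CHUNK and n >= TOPK_CHUNK.  best[0..TOPK_K-1] ends up holding the
 * largest TOPK_K values in descending order. */
static void topk_chunked(const int *src, size_t n, int best[TOPK_CHUNK])
{
    int cand[TOPK_CHUNK];
    memcpy(best, src, TOPK_CHUNK * sizeof(int));
    sort_desc(best, TOPK_CHUNK);
    for (size_t off = TOPK_CHUNK; off + TOPK_CHUNK <= n; off += TOPK_CHUNK)
    {
        memcpy(cand, src + off, TOPK_CHUNK * sizeof(int));
        sort_desc(cand, TOPK_CHUNK);
        if (cand[0] < best[TOPK_K - 1])
        {
            continue;                                      /* chunk cannot improve the top-K */
        }
        memcpy(best + TOPK_K, cand, TOPK_K * sizeof(int)); /* keep only its K best candidates */
        sort_desc(best, TOPK_CHUNK);                       /* re-rank; top-K stays in front */
    }
}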
|
||||
|
||||
__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, 1, 1))) void topk_stage_U32toU32_I32
|
||||
(
|
||||
__read_only image2d_t input,
|
||||
__write_only image2d_t output,
|
||||
__write_only image2d_t indices,
|
||||
float input_scale,
|
||||
float input_tail,
|
||||
float output_scale,
|
||||
float output_tail,
|
||||
int _num_stages,
|
||||
int width
|
||||
)
|
||||
{
|
||||
uint lx = get_local_id(0);
|
||||
const uint init_k = 0;
|
||||
const int init_v = -2147483647;
|
||||
const int num_stages = 9;
|
||||
const int threads_per_block = BLOCK_SIZE;
|
||||
const int index_minus_1 = threads_per_block * 2 - 1;
|
||||
uint offset = 0;
|
||||
uint lx1 = lx + threads_per_block;
|
||||
|
||||
int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));
|
||||
|
||||
__local uint local_data[1536];
|
||||
__local int local_indices[1536];
|
||||
|
||||
uint left = read_imageui(input, coord.xy).x;
|
||||
coord.z += threads_per_block;
|
||||
uint right = read_imageui(input, coord.zy).x;
|
||||
|
||||
local_data[lx] = left;
|
||||
local_indices[lx] = coord.x;
|
||||
local_data[lx1] = right;
|
||||
local_indices[lx1] = coord.z;
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
bitonic_step_uint(num_stages, lx, local_data, local_indices);
|
||||
|
||||
uint min_data = local_data[511];
|
||||
|
||||
uint *p_share_k = local_data + threads_per_block;
|
||||
int *p_share_v = local_indices + threads_per_block;
|
||||
|
||||
int limit = (width >> 10) << 10;
|
||||
p_share_k[lx] = init_k;
|
||||
p_share_v[lx] = init_v;
|
||||
|
||||
p_share_k[lx1] = init_k;
|
||||
p_share_v[lx1] = init_v;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
for (coord.x = lx + threads_per_block * 2; coord.x < limit; coord.x = coord.x + threads_per_block * 2)
|
||||
{
|
||||
uint2 data;
|
||||
coord.z = coord.x + threads_per_block;
|
||||
data.x = read_imageui(input, coord.xy).x;
|
||||
data.y = read_imageui(input, coord.zy).x;
|
||||
|
||||
p_share_k[lx] = data.x;
|
||||
p_share_v[lx] = coord.x;
|
||||
|
||||
p_share_k[lx1] = data.y;
|
||||
p_share_v[lx1] = coord.z;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
bitonic_step_ascend_uint(num_stages, lx, p_share_k, p_share_v);
|
||||
|
||||
if (p_share_k[index_minus_1] < min_data)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
p_share_k[lx] = p_share_k[lx1];
|
||||
p_share_v[lx] = p_share_v[lx1];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
bitonic_merge_uint(num_stages, lx, local_data, local_indices);
|
||||
|
||||
min_data = local_data[511];
|
||||
p_share_k[lx] = init_k;
|
||||
p_share_v[lx] = init_v;
|
||||
p_share_k[lx1] = init_k;
|
||||
p_share_v[lx1] = init_v;
|
||||
}
|
||||
|
||||
if (width > limit)
|
||||
{
|
||||
if (coord.x < width)
|
||||
{
|
||||
uint2 data;
|
||||
data.x = read_imageui(input, coord.xy).x;
|
||||
coord.z = coord.x + threads_per_block;
|
||||
data.y = read_imageui(input, coord.zy).x;
|
||||
|
||||
p_share_k[lx] = data.x;
|
||||
p_share_v[lx] = coord.x;
|
||||
|
||||
p_share_k[lx1] = coord.z < width ? data.y : init_k;
|
||||
p_share_v[lx1] = coord.z < width ? coord.z : init_v;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
bitonic_step_ascend_uint(num_stages, lx, p_share_k, p_share_v);
|
||||
|
||||
if (p_share_k[index_minus_1] >= min_data)
|
||||
{
|
||||
p_share_k[lx] = p_share_k[lx1];
|
||||
p_share_v[lx] = p_share_v[lx1];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
bitonic_merge_uint(num_stages, lx, local_data, local_indices);
|
||||
}
|
||||
}
|
||||
|
||||
uint4 dst;
|
||||
dst.x = local_data[lx];
|
||||
|
||||
coord.x = lx;
|
||||
write_imageui(output, coord.xy, dst.xxxx);
|
||||
|
||||
int4 index;
|
||||
index.x = local_indices[lx];
|
||||
|
||||
write_imagei(indices, coord.xy, index.xxxx);
|
||||
}
|
||||
|
|
@ -28,12 +28,6 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
|
|||
}
|
||||
|
||||
__local int sorted[1];
|
||||
int width_minus_one = width - 1;
|
||||
int num_pixels_per_thread = (width_minus_one + LOCAL_SIZE_X) / LOCAL_SIZE_X;
|
||||
num_pixels_per_thread = num_pixels_per_thread + (num_pixels_per_thread & 1);
|
||||
|
||||
int x_start = lid * num_pixels_per_thread;
|
||||
int x_end = min(lid * num_pixels_per_thread + num_pixels_per_thread, width_minus_one);
|
||||
|
||||
sorted[0] = 0;
|
||||
|
||||
|
|
@ -44,20 +38,21 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
|
|||
*sorted = 0;
|
||||
}
|
||||
int swapped = 0;
|
||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// odd-even
|
||||
coord.x = x_start;
|
||||
coord.z = x_start + 1;
|
||||
for (; coord.x < x_end; )
|
||||
coord.x = lid * 2;
|
||||
coord.z = lid * 2 + 1;
|
||||
for (; coord.z < width; )
|
||||
{
|
||||
float4 left = read_imagef(input_t, coord.xy);
|
||||
float4 right = read_imagef(input_t, coord.zy);
|
||||
int4 l_index = read_imagei(indices_t, coord.xy);
|
||||
int4 r_index = read_imagei(indices_t, coord.zy);
|
||||
|
||||
if (left.x < right.x)
|
||||
if ( (left.x < right.x) ||
|
||||
(left.x == right.x && l_index.x < r_index.x) )
|
||||
{
|
||||
int4 l_index = read_imagei(indices_t, coord.xy);
|
||||
int4 r_index = read_imagei(indices_t, coord.zy);
|
||||
swapped = 1;
|
||||
|
||||
write_imagef(input_t, coord.xy, right);
|
||||
|
|
@ -67,21 +62,23 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
|
|||
write_imagei(indices_t, coord.zy, l_index);
|
||||
}
|
||||
|
||||
coord.xz = coord.xz + 2;
|
||||
coord.xz += 2 * LOCAL_SIZE_X;
|
||||
}
|
||||
|
||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
// even-odd
|
||||
coord.x = x_start + 1;
|
||||
coord.z = x_start + 2;
|
||||
for (; coord.x < x_end; )
|
||||
coord.x = lid * 2 + 1;
|
||||
coord.z = lid * 2 + 2;
|
||||
for (; coord.z < width; )
|
||||
{
|
||||
float4 left = read_imagef(input_t, coord.xy);
|
||||
float4 right = read_imagef(input_t, coord.zy);
|
||||
int4 l_index = read_imagei(indices_t, coord.xy);
|
||||
int4 r_index = read_imagei(indices_t, coord.zy);
|
||||
|
||||
if (left.x < right.x)
|
||||
if ( (left.x < right.x) ||
|
||||
(left.x == right.x && l_index.x < r_index.x) )
|
||||
{
|
||||
int4 l_index = read_imagei(indices_t, coord.xy);
|
||||
int4 r_index = read_imagei(indices_t, coord.zy);
|
||||
swapped = 1;
|
||||
|
||||
write_imagef(input_t, coord.xy, right);
|
||||
|
|
@ -91,11 +88,11 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
|
|||
write_imagei(indices_t, coord.zy, l_index);
|
||||
}
|
||||
|
||||
coord.xz = coord.xz + 2;
|
||||
coord.xz += 2 * LOCAL_SIZE_X;
|
||||
}
|
||||
|
||||
atomic_add(sorted, swapped);
|
||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (*sorted == 0)
|
||||
break;
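Note: the odd-even kernels switch from per-thread contiguous ranges to strided pairs (lid*2 stepping by 2*LOCAL_SIZE_X), use local barriers between phases, and keep iterating until a whole pass makes no exchange. A single-threaded C sketch of the underlying odd-even transposition sort in descending order, with the same index tie-break:

/* Single-threaded sketch of odd-even transposition sort, descending, with the
 * index tie-break; the kernels split each phase across LOCAL_SIZE_X threads in
 * a strided fashion and stop once a whole pass performs no swap. */
static void odd_even_sort_desc(float *val, int *idx, int width)
{
    int swapped = 1;
    while (swapped)
    {
        swapped = 0;
        for (int phase = 0; phase < 2; phase++)          /* 0: even pairs, 1: odd pairs */
        {
            for (int x = phase; x + 1 < width; x += 2)
            {
                if (val[x] < val[x + 1] ||
                    (val[x] == val[x + 1] && idx[x] < idx[x + 1]))
                {
                    float tv = val[x]; val[x] = val[x + 1]; val[x + 1] = tv;
                    int   ti = idx[x]; idx[x] = idx[x + 1]; idx[x + 1] = ti;
                    swapped = 1;
                }
            }
        }
    }
}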
@ -141,13 +138,6 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
|
|||
}
|
||||
|
||||
__local int sorted[1];
|
||||
int width_minus_one = width - 1;
|
||||
int num_pixels_per_thread = (width_minus_one + LOCAL_SIZE_X) / LOCAL_SIZE_X;
|
||||
num_pixels_per_thread = num_pixels_per_thread + (num_pixels_per_thread & 1);
|
||||
|
||||
int x_start = lid * num_pixels_per_thread;
|
||||
int x_end = min(lid * num_pixels_per_thread + num_pixels_per_thread, width_minus_one);
|
||||
|
||||
sorted[0] = 0;
|
||||
|
||||
while (1)
|
||||
|
|
@ -157,20 +147,21 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
|
|||
*sorted = 0;
|
||||
}
|
||||
int swapped = 0;
|
||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// odd-even
|
||||
coord.x = x_start;
|
||||
coord.z = x_start + 1;
|
||||
for (; coord.x < x_end; )
|
||||
coord.x = lid * 2;
|
||||
coord.z = lid * 2 + 1;
|
||||
for (; coord.z < width; )
|
||||
{
|
||||
uint4 left = read_imageui(input_t, coord.xy);
|
||||
uint4 right = read_imageui(input_t, coord.zy);
|
||||
int4 l_index = read_imagei(indices_t, coord.xy);
|
||||
int4 r_index = read_imagei(indices_t, coord.zy);
|
||||
|
||||
if (left.x < right.x)
|
||||
if ( (left.x < right.x) ||
|
||||
(left.x == right.x && l_index.x < r_index.x) )
|
||||
{
|
||||
int4 l_index = read_imagei(indices_t, coord.xy);
|
||||
int4 r_index = read_imagei(indices_t, coord.zy);
|
||||
swapped = 1;
|
||||
|
||||
write_imageui(input_t, coord.xy, right);
|
||||
|
|
@ -180,21 +171,23 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
|
|||
write_imagei(indices_t, coord.zy, l_index);
|
||||
}
|
||||
|
||||
coord.xz = coord.xz + 2;
|
||||
coord.xz += 2 * LOCAL_SIZE_X;
|
||||
}
|
||||
|
||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
// even-odd
|
||||
coord.x = x_start + 1;
|
||||
coord.z = x_start + 2;
|
||||
for (; coord.x < x_end; )
|
||||
coord.x = lid * 2 + 1;
|
||||
coord.z = lid * 2 + 2;
|
||||
for (; coord.z < width; )
|
||||
{
|
||||
uint4 left = read_imageui(input_t, coord.xy);
|
||||
uint4 right = read_imageui(input_t, coord.zy);
|
||||
int4 l_index = read_imagei(indices_t, coord.xy);
|
||||
int4 r_index = read_imagei(indices_t, coord.zy);
|
||||
|
||||
if (left.x < right.x)
|
||||
if ( (left.x < right.x) ||
|
||||
(left.x == right.x && l_index.x < r_index.x) )
|
||||
{
|
||||
int4 l_index = read_imagei(indices_t, coord.xy);
|
||||
int4 r_index = read_imagei(indices_t, coord.zy);
|
||||
swapped = 1;
|
||||
|
||||
write_imageui(input_t, coord.xy, right);
|
||||
|
|
@ -204,11 +197,11 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
|
|||
write_imagei(indices_t, coord.zy, l_index);
|
||||
}
|
||||
|
||||
coord.xz = coord.xz + 2;
|
||||
coord.xz += 2 * LOCAL_SIZE_X;
|
||||
}
|
||||
|
||||
atomic_add(sorted, swapped);
|
||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (*sorted == 0)
|
||||
break;
|
||||
|
|
@ -254,13 +247,6 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
|
|||
}
|
||||
|
||||
__local int sorted[1];
|
||||
int width_minus_one = width - 1;
|
||||
int num_pixels_per_thread = (width_minus_one + LOCAL_SIZE_X) / LOCAL_SIZE_X;
|
||||
num_pixels_per_thread = num_pixels_per_thread + (num_pixels_per_thread & 1);
|
||||
|
||||
int x_start = lid * num_pixels_per_thread;
|
||||
int x_end = min(lid * num_pixels_per_thread + num_pixels_per_thread, width_minus_one);
|
||||
|
||||
sorted[0] = 0;
|
||||
|
||||
while (1)
|
||||
|
|
@ -270,20 +256,21 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
|
|||
*sorted = 0;
|
||||
}
|
||||
int swapped = 0;
|
||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// odd-even
|
||||
coord.x = x_start;
|
||||
coord.z = x_start + 1;
|
||||
for (; coord.x < x_end; )
|
||||
coord.x = lid * 2;
|
||||
coord.z = lid * 2 + 1;
|
||||
for (; coord.z < width; )
|
||||
{
|
||||
int4 left = read_imagei(input_t, coord.xy);
|
||||
int4 right = read_imagei(input_t, coord.zy);
|
||||
int4 l_index = read_imagei(indices_t, coord.xy);
|
||||
int4 r_index = read_imagei(indices_t, coord.zy);
|
||||
|
||||
if (left.x < right.x)
|
||||
if ( (left.x < right.x) ||
|
||||
(left.x == right.x && l_index.x < r_index.x) )
|
||||
{
|
||||
int4 l_index = read_imagei(indices_t, coord.xy);
|
||||
int4 r_index = read_imagei(indices_t, coord.zy);
|
||||
swapped = 1;
|
||||
|
||||
write_imagei(input_t, coord.xy, right);
|
||||
|
|
@ -293,21 +280,23 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
|
|||
write_imagei(indices_t, coord.zy, l_index);
|
||||
}
|
||||
|
||||
coord.xz = coord.xz + 2;
|
||||
coord.xz += 2 * LOCAL_SIZE_X;
|
||||
}
|
||||
|
||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
// even-odd
|
||||
coord.x = x_start + 1;
|
||||
coord.z = x_start + 2;
|
||||
for (; coord.x < x_end; )
|
||||
coord.x = lid * 2 + 1;
|
||||
coord.z = lid * 2 + 2;
|
||||
for (; coord.z < width; )
|
||||
{
|
||||
int4 left = read_imagei(input_t, coord.xy);
|
||||
int4 right = read_imagei(input_t, coord.zy);
|
||||
int4 l_index = read_imagei(indices_t, coord.xy);
|
||||
int4 r_index = read_imagei(indices_t, coord.zy);
|
||||
|
||||
if (left.x < right.x)
|
||||
if ( (left.x < right.x) ||
|
||||
(left.x == right.x && l_index.x < r_index.x) )
|
||||
{
|
||||
int4 l_index = read_imagei(indices_t, coord.xy);
|
||||
int4 r_index = read_imagei(indices_t, coord.zy);
|
||||
swapped = 1;
|
||||
|
||||
write_imagei(input_t, coord.xy, right);
|
||||
|
|
@ -317,11 +306,11 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
|
|||
write_imagei(indices_t, coord.zy, l_index);
|
||||
}
|
||||
|
||||
coord.xz = coord.xz + 2;
|
||||
coord.xz += 2 * LOCAL_SIZE_X;
|
||||
}
|
||||
|
||||
atomic_add(sorted, swapped);
|
||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (*sorted == 0)
|
||||
break;
|
||||
|
|
|
|||
|
|
@ -28,12 +28,6 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
|
|||
}
|
||||
|
||||
__local int sorted[1];
|
||||
int width_minus_one = width - 1;
|
||||
int num_pixels_per_thread = (width_minus_one + LOCAL_SIZE_X) / LOCAL_SIZE_X;
|
||||
num_pixels_per_thread = num_pixels_per_thread + (num_pixels_per_thread & 1);
|
||||
|
||||
int x_start = lid * num_pixels_per_thread;
|
||||
int x_end = min(lid * num_pixels_per_thread + num_pixels_per_thread, width_minus_one);
|
||||
|
||||
sorted[0] = 0;
|
||||
|
||||
|
|
@ -44,20 +38,21 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
|
|||
*sorted = 0;
|
||||
}
|
||||
int swapped = 0;
|
||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// odd-even
|
||||
coord.x = x_start;
|
||||
coord.z = x_start + 1;
|
||||
for (; coord.x < x_end; )
|
||||
coord.x = lid * 2;
|
||||
coord.z = lid * 2 + 1;
|
||||
for (; coord.z < width; )
|
||||
{
|
||||
float4 left = read_imagef(input_t, coord.xy);
|
||||
float4 right = read_imagef(input_t, coord.zy);
|
||||
int4 l_index = read_imagei(indices_t, coord.xy);
|
||||
int4 r_index = read_imagei(indices_t, coord.zy);
|
||||
|
||||
if (left.x < right.x)
|
||||
if ( (left.x < right.x) ||
|
||||
(left.x == right.x && l_index.x < r_index.x) )
|
||||
{
|
||||
int4 l_index = read_imagei(indices_t, coord.xy);
|
||||
int4 r_index = read_imagei(indices_t, coord.zy);
|
||||
swapped = 1;
|
||||
|
||||
write_imagef(input_t, coord.xy, right);
|
||||
|
|
@ -67,21 +62,23 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
|
|||
write_imagei(indices_t, coord.zy, l_index);
|
||||
}
|
||||
|
||||
coord.xz = coord.xz + 2;
|
||||
coord.xz += 2 * LOCAL_SIZE_X;
|
||||
}
|
||||
|
||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
// even-odd
|
||||
coord.x = x_start + 1;
|
||||
coord.z = x_start + 2;
|
||||
for (; coord.x < x_end; )
|
||||
coord.x = lid * 2 + 1;
|
||||
coord.z = lid * 2 + 2;
|
||||
for (; coord.z < width; )
|
||||
{
|
||||
float4 left = read_imagef(input_t, coord.xy);
|
||||
float4 right = read_imagef(input_t, coord.zy);
|
||||
int4 l_index = read_imagei(indices_t, coord.xy);
|
||||
int4 r_index = read_imagei(indices_t, coord.zy);
|
||||
|
||||
if (left.x < right.x)
|
||||
if ( (left.x < right.x) ||
|
||||
(left.x == right.x && l_index.x < r_index.x) )
|
||||
{
|
||||
int4 l_index = read_imagei(indices_t, coord.xy);
|
||||
int4 r_index = read_imagei(indices_t, coord.zy);
|
||||
swapped = 1;
|
||||
|
||||
write_imagef(input_t, coord.xy, right);
|
||||
|
|
@ -91,11 +88,11 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
|
|||
write_imagei(indices_t, coord.zy, l_index);
|
||||
}
|
||||
|
||||
coord.xz = coord.xz + 2;
|
||||
coord.xz += 2 * LOCAL_SIZE_X;
|
||||
}
|
||||
|
||||
atomic_add(sorted, swapped);
|
||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (*sorted == 0)
|
||||
break;
|
||||
|
|
@ -143,13 +140,6 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
|
|||
}
|
||||
|
||||
__local int sorted[1];
|
||||
int width_minus_one = width - 1;
|
||||
int num_pixels_per_thread = (width_minus_one + LOCAL_SIZE_X) / LOCAL_SIZE_X;
|
||||
num_pixels_per_thread = num_pixels_per_thread + (num_pixels_per_thread & 1);
|
||||
|
||||
int x_start = lid * num_pixels_per_thread;
|
||||
int x_end = min(lid * num_pixels_per_thread + num_pixels_per_thread, width_minus_one);
|
||||
|
||||
sorted[0] = 0;
|
||||
|
||||
while (1)
|
||||
|
|
@ -159,20 +149,21 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
|
|||
*sorted = 0;
|
||||
}
|
||||
int swapped = 0;
|
||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// odd-even
|
||||
coord.x = x_start;
|
||||
coord.z = x_start + 1;
|
||||
for (; coord.x < x_end; )
|
||||
coord.x = lid * 2;
|
||||
coord.z = lid * 2 + 1;
|
||||
for (; coord.z < width; )
|
||||
{
|
||||
float4 left = read_imagef(input_t, coord.xy);
|
||||
float4 right = read_imagef(input_t, coord.zy);
|
||||
int4 l_index = read_imagei(indices_t, coord.xy);
|
||||
int4 r_index = read_imagei(indices_t, coord.zy);
|
||||
|
||||
if (left.x < right.x)
|
||||
if ( (left.x < right.x) ||
|
||||
(left.x == right.x && l_index.x < r_index.x) )
|
||||
{
|
||||
int4 l_index = read_imagei(indices_t, coord.xy);
|
||||
int4 r_index = read_imagei(indices_t, coord.zy);
|
||||
swapped = 1;
|
||||
|
||||
write_imagef(input_t, coord.xy, right);
|
||||
|
|
@ -182,18 +173,22 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
|
|||
write_imagei(indices_t, coord.zy, l_index);
|
||||
}
|
||||
|
||||
coord.xz = coord.xz + 2;
|
||||
coord.xz += 2 * LOCAL_SIZE_X;
|
||||
}
|
||||
|
||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
// even-odd
|
||||
coord.x = x_start + 1;
|
||||
coord.z = x_start + 2;
|
||||
for (; coord.x < x_end; )
|
||||
coord.x = lid * 2 + 1;
|
||||
coord.z = lid * 2 + 2;
|
||||
for (; coord.z < width; )
|
||||
{
|
||||
float4 left = read_imagef(input_t, coord.xy);
|
||||
float4 right = read_imagef(input_t, coord.zy);
|
||||
int4 l_index = read_imagei(indices_t, coord.xy);
|
||||
int4 r_index = read_imagei(indices_t, coord.zy);
|
||||
|
||||
if (left.x < right.x)
|
||||
if ( (left.x < right.x) ||
|
||||
(left.x == right.x && l_index.x < r_index.x) )
|
||||
{
|
||||
int4 l_index = read_imagei(indices_t, coord.xy);
|
||||
int4 r_index = read_imagei(indices_t, coord.zy);
|
||||
|
|
@ -206,11 +201,11 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd
|
|||
write_imagei(indices_t, coord.zy, l_index);
|
||||
}
|
||||
|
||||
coord.xz = coord.xz + 2;
|
||||
coord.xz += 2 * LOCAL_SIZE_X;
|
||||
}
|
||||
|
||||
atomic_add(sorted, swapped);
|
||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (*sorted == 0)
|
||||
break;
@ -0,0 +1,171 @@
|
|||
#include "cl_viv_vx_ext.h"
|
||||
|
||||
_viv_uniform float2 half_input0_wh;
|
||||
_viv_uniform float2 add_float_value;
|
||||
_viv_uniform int depth;
|
||||
_viv_uniform VXC_512Bits uniConvBF16toF32_even_2x8;
|
||||
_viv_uniform VXC_512Bits uniConvBF16toF32_odd_2x8;
|
||||
|
||||
_viv_uniform VXC_512Bits uniBF16toFp32_part0_2x8;
|
||||
_viv_uniform VXC_512Bits uniBF16toFp32_part1_2x8;
|
||||
_viv_uniform float4 span_wh;
|
||||
_viv_uniform float4 min_val_wh;
|
||||
|
||||
#define GRID_SAMPLE_BF16_PROCESS() \
|
||||
fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \
|
||||
fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \
|
||||
fxy0 = fabs(fxy0 - min_val_wh); \
|
||||
fxy1 = fabs(fxy1 - min_val_wh); \
|
||||
float4 flips_xy0 = floor(fxy0 / span_wh); \
|
||||
float4 flips_xy1 = floor(fxy1 / span_wh); \
|
||||
float4 extra_xy0 = fxy0 - flips_xy0 * span_wh; \
|
||||
float4 extra_xy1 = fxy1 - flips_xy1 * span_wh; \
|
||||
int4 flips_int_xy0 = convert_int4(flips_xy0); \
|
||||
int4 flips_int_xy1 = convert_int4(flips_xy1); \
|
||||
fxy0 = ((flips_int_xy0 % 2) == 0) ? min_val_wh + extra_xy0 : min_val_wh + (span_wh - extra_xy0); \
|
||||
fxy1 = ((flips_int_xy1 % 2) == 0) ? min_val_wh + extra_xy1 : min_val_wh + (span_wh - extra_xy1); \
|
||||
float4 in_x = (float4)(fxy0.xz, fxy1.xz); \
|
||||
float4 x_f = floor(in_x); \
|
||||
float4 x_lerp = in_x - x_f; \
|
||||
int4 x_idx = convert_int4(x_f); \
|
||||
float4 in_y = (float4)(fxy0.yw, fxy1.yw); \
|
||||
float4 y_f = floor(in_y); \
|
||||
float4 y_lerp = in_y - y_f; \
|
||||
int4 y_idx = convert_int4(y_f); \
|
||||
int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \
|
||||
int8 input_desc; \
|
||||
_viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \
|
||||
int baseAddr = input_desc.s0; \
|
||||
_viv_asm(MOV, coord_in.w, baseAddr); \
|
||||
vxc_short8 top; \
|
||||
vxc_short8 bottom; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.y; \
|
||||
coord_in.y = y_idx.y; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.z; \
|
||||
coord_in.y = y_idx.z; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.w; \
|
||||
coord_in.y = y_idx.w; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
float4 left4; \
|
||||
float4 right4; \
|
||||
float4 top4; \
|
||||
float4 bottom4; \
|
||||
int8 output_desc; \
|
||||
_viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
|
||||
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
|
||||
_viv_asm(MOV, coord_out.w, baseAddr); \
|
||||
int loop = depth - 1; \
|
||||
vxc_ushort8 tmp, dst; \
|
||||
while (coord_in.z < loop) \
|
||||
{ \
|
||||
VXC_DP2x8(src, top, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_even_2x8); \
|
||||
_viv_asm(COPY, right4, src, 16); \
|
||||
VXC_DP2x8(src, top, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_odd_2x8); \
|
||||
_viv_asm(COPY, left4, src, 16); \
|
||||
right4 -= left4; \
|
||||
top4 = right4 * x_lerp + left4; \
|
||||
VXC_DP2x8(src, bottom, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_even_2x8); \
|
||||
_viv_asm(COPY, right4, src, 16); \
|
||||
VXC_DP2x8(src, bottom, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_odd_2x8); \
|
||||
_viv_asm(COPY, left4, src, 16); \
|
||||
right4 -= left4; \
|
||||
bottom4 = right4 * x_lerp + left4; \
|
||||
bottom4 -= top4; \
|
||||
float4 dst4 = bottom4 * y_lerp + top4; \
|
||||
_viv_asm(COPY, tmp, dst4, 16); \
|
||||
dst.s0123 = tmp.s1357; \
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \
|
||||
coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \
|
||||
coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \
|
||||
coord_in.x = x_idx.x; \
|
||||
coord_in.y = y_idx.x; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.y; \
|
||||
coord_in.y = y_idx.y; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.z; \
|
||||
coord_in.y = y_idx.z; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.w; \
|
||||
coord_in.y = y_idx.w; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
} \
|
||||
VXC_DP2x8(src, top, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_even_2x8); \
|
||||
_viv_asm(COPY, right4, src, 16); \
|
||||
VXC_DP2x8(src, top, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_odd_2x8); \
|
||||
_viv_asm(COPY, left4, src, 16); \
|
||||
right4 -= left4; \
|
||||
top4 = right4 * x_lerp + left4; \
|
||||
VXC_DP2x8(src, bottom, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_even_2x8); \
|
||||
_viv_asm(COPY, right4, src, 16); \
|
||||
VXC_DP2x8(src, bottom, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_odd_2x8); \
|
||||
_viv_asm(COPY, left4, src, 16); \
|
||||
right4 -= left4; \
|
||||
bottom4 = right4 * x_lerp + left4; \
|
||||
bottom4 -= top4; \
|
||||
float4 dst4 = bottom4 * y_lerp + top4; \
|
||||
_viv_asm(COPY, tmp, dst4, 16); \
|
||||
dst.s0123 = tmp.s1357; \
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, dst, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
|
||||
|
||||
|
||||
|
||||
|
||||
__kernel void bilinear_grid_sample_reflect_BF16_BF16toBF16(
|
||||
__read_only image2d_array_t input0,
|
||||
__read_only image2d_t input1,
|
||||
__write_only image2d_array_t output,
|
||||
int align_corners)
|
||||
{
|
||||
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
|
||||
int4 coord_in1 = coord_out.xyxy;
|
||||
|
||||
coord_in1.xz = coord_in1.xz * 2;
|
||||
|
||||
vxc_short8 read_val;
|
||||
vxc_short8 zero = (vxc_short8)(0, 0, 0, 0, 0, 0, 0, 0);
|
||||
VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
|
||||
float4 fxy0;
|
||||
float4 fxy1;
|
||||
|
||||
vxc_short8 src;
|
||||
VXC_DP2x8(src, read_val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniBF16toFp32_part0_2x8);
|
||||
_viv_asm(COPY, fxy0, src, 16);
|
||||
VXC_DP2x8(src, read_val, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniBF16toFp32_part1_2x8);
|
||||
_viv_asm(COPY, fxy1, src, 16);
|
||||
|
||||
|
||||
|
||||
GRID_SAMPLE_BF16_PROCESS();
|
||||
|
||||
}
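Note: the BF16 path does the arithmetic in float and writes back by keeping only the high 16 bits of each float (the dst.s0123 = tmp.s1357 shuffle), i.e. a truncating float32 to bfloat16 conversion. A scalar C sketch of that reinterpretation, assuming the little-endian lane layout the shuffle implies:

#include <stdint.h>
#include <string.h>

/* Scalar sketch: bfloat16 is the top 16 bits of an IEEE-754 float32. */
static uint16_t f32_to_bf16_trunc(float f)
{
    uint32_t bits;
    memcpy(&bits, &f, sizeof(bits));
    return (uint16_t)(bits >> 16);      /* drop the low 16 mantissa bits (truncation) */
}

static float bf16_to_f32(uint16_t h)
{
    uint32_t bits = (uint32_t)h << 16;  /* low mantissa bits become zero */
    float f;
    memcpy(&f, &bits, sizeof(f));
    return f;
}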
@ -0,0 +1,217 @@
|
|||
#include "cl_viv_vx_ext.h"
|
||||
|
||||
_viv_uniform float2 half_input0_wh;
|
||||
_viv_uniform float2 add_float_value;
|
||||
_viv_uniform int depth;
|
||||
_viv_uniform VXC_512Bits uniEvenBintoFp32_4x4;
|
||||
_viv_uniform VXC_512Bits uniOddSubEvenBin_4x4;
|
||||
_viv_uniform VXC_512Bits uniExtactHalf8_2x8;
|
||||
_viv_uniform float4 span_wh;
|
||||
_viv_uniform float4 min_val_wh;
|
||||
|
||||
#define GRID_SAMPLE_F16_PROCESS() \
|
||||
fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \
|
||||
fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \
|
||||
fxy0 = fabs(fxy0 - min_val_wh); \
|
||||
fxy1 = fabs(fxy1 - min_val_wh); \
|
||||
float4 flips_xy0 = floor(fxy0 / span_wh); \
|
||||
float4 flips_xy1 = floor(fxy1 / span_wh); \
|
||||
float4 extra_xy0 = fxy0 - flips_xy0 * span_wh; \
|
||||
float4 extra_xy1 = fxy1 - flips_xy1 * span_wh; \
|
||||
int4 flips_int_xy0 = convert_int4(flips_xy0); \
|
||||
int4 flips_int_xy1 = convert_int4(flips_xy1); \
|
||||
fxy0 = ((flips_int_xy0 % 2) == 0) ? min_val_wh + extra_xy0 : min_val_wh + (span_wh - extra_xy0); \
|
||||
fxy1 = ((flips_int_xy1 % 2) == 0) ? min_val_wh + extra_xy1 : min_val_wh + (span_wh - extra_xy1); \
|
||||
float4 in_x = (float4)(fxy0.xz, fxy1.xz); \
|
||||
float4 x_f = floor(in_x); \
|
||||
float4 x_lerp = in_x - x_f; \
|
||||
int4 x_idx = convert_int4(x_f); \
|
||||
float4 in_y = (float4)(fxy0.yw, fxy1.yw); \
|
||||
float4 y_f = floor(in_y); \
|
||||
float4 y_lerp = in_y - y_f; \
|
||||
int4 y_idx = convert_int4(y_f); \
|
||||
int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \
|
||||
int8 input_desc; \
|
||||
_viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \
|
||||
int baseAddr = input_desc.s0; \
|
||||
_viv_asm(MOV, coord_in.w, baseAddr); \
|
||||
vxc_short8 t0; \
|
||||
vxc_short8 b0; \
|
||||
vxc_half8 top; \
|
||||
vxc_half8 bottom; \
|
||||
VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.y; \
|
||||
coord_in.y = y_idx.y; \
|
||||
VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.z; \
|
||||
coord_in.y = y_idx.z; \
|
||||
VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.w; \
|
||||
coord_in.y = y_idx.w; \
|
||||
VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
_viv_asm(COPY, top, t0, 16); \
|
||||
_viv_asm(COPY, bottom, b0, 16); \
|
||||
float4 left4; \
|
||||
float4 right4; \
|
||||
float4 top4; \
|
||||
float4 bottom4; \
|
||||
int8 output_desc; \
|
||||
_viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
|
||||
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
|
||||
_viv_asm(MOV, coord_out.w, baseAddr); \
|
||||
int loop = depth - 1; \
|
||||
while (coord_in.z < loop) \
|
||||
{ \
|
||||
VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \
|
||||
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \
|
||||
top4 = right4 * x_lerp + left4; \
|
||||
VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \
|
||||
VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \
|
||||
bottom4 = right4 * x_lerp + left4; \
|
||||
bottom4 -= top4; \
|
||||
float4 dst4 = bottom4 * y_lerp + top4; \
|
||||
half4 tmp; \
|
||||
_viv_asm(CONV, tmp, dst4); \
|
||||
VXC_DP2x8(top, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8); \
|
||||
vxc_short4 result; \
|
||||
_viv_asm(COPY, result, top, 8); \
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \
|
||||
coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \
|
||||
coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \
|
||||
coord_in.x = x_idx.x; \
|
||||
coord_in.y = y_idx.x; \
|
||||
VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.y; \
|
||||
coord_in.y = y_idx.y; \
|
||||
VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.z; \
|
||||
coord_in.y = y_idx.z; \
|
||||
VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.w; \
|
||||
coord_in.y = y_idx.w; \
|
||||
VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
_viv_asm(COPY, top, t0, 16); \
|
||||
_viv_asm(COPY, bottom, b0, 16); \
|
||||
} \
|
||||
VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \
|
||||
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \
|
||||
top4 = right4 * x_lerp + left4; \
|
||||
VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \
|
||||
VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \
|
||||
bottom4 = right4 * x_lerp + left4; \
|
||||
bottom4 -= top4; \
|
||||
float4 dst4 = bottom4 * y_lerp + top4; \
|
||||
half4 tmp; \
|
||||
_viv_asm(CONV, tmp, dst4); \
|
||||
VXC_DP2x8(top, tmp, tmp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniExtactHalf8_2x8); \
|
||||
vxc_short4 result; \
|
||||
_viv_asm(COPY, result, top, 8); \
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
|
||||
|
||||
|
||||
|
||||
__kernel void bilinear_grid_sample_reflect_F16_F32toF16(
|
||||
__read_only image2d_array_t input0,
|
||||
__read_only image2d_t input1,
|
||||
__write_only image2d_array_t output,
|
||||
int align_corners)
|
||||
{
|
||||
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
|
||||
int4 coord_in1 = coord_out.xyxy;
|
||||
|
||||
coord_in1.xz = coord_in1.xz * 2;
|
||||
coord_in1.z = coord_in1.z + 4;
|
||||
|
||||
float4 fxy0 = read_imagef(input1, coord_in1.xy);
|
||||
float4 fxy1 = read_imagef(input1, coord_in1.zw);
|
||||
|
||||
GRID_SAMPLE_F16_PROCESS();
|
||||
|
||||
}
|
||||
|
||||
_viv_uniform int input1_ZP;
|
||||
_viv_uniform float input1Scale;
|
||||
_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part0_4x4;
|
||||
_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4;
|
||||
|
||||
__kernel void bilinear_grid_sample_reflect_F16_U8toF16(
|
||||
__read_only image2d_array_t input0,
|
||||
__read_only image2d_t input1,
|
||||
__write_only image2d_array_t output,
|
||||
int align_corners)
|
||||
{
|
||||
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
|
||||
int4 coord_in1 = coord_out.xyxy;
|
||||
coord_in1.xz = coord_in1.xz * 2;
|
||||
vxc_uchar16 read_coord;
|
||||
VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
float4 fxy0;
|
||||
float4 fxy1;
|
||||
unsigned char input1ZP;
|
||||
_viv_asm(COPY, input1ZP, input1_ZP, 4);
|
||||
VXC_DP4x4(fxy0, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part0_4x4);
|
||||
VXC_DP4x4(fxy1, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4);
|
||||
fxy0 = fxy0 * input1Scale;
|
||||
fxy1 = fxy1 * input1Scale;
|
||||
|
||||
GRID_SAMPLE_F16_PROCESS();
|
||||
|
||||
}
|
||||
|
||||
|
||||
_viv_uniform VXC_512Bits uniFp16toFp32_part0_4x4;
|
||||
_viv_uniform VXC_512Bits uniFp16toFp32_part1_4x4;
|
||||
|
||||
__kernel void bilinear_grid_sample_reflect_F16_F16toF16(
|
||||
__read_only image2d_array_t input0,
|
||||
__read_only image2d_t input1,
|
||||
__write_only image2d_array_t output,
|
||||
int align_corners)
|
||||
{
|
||||
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
|
||||
int4 coord_in1 = coord_out.xyxy;
|
||||
|
||||
coord_in1.xz = coord_in1.xz * 2;
|
||||
|
||||
vxc_short8 read_val;
|
||||
vxc_half8 read_coord;
|
||||
|
||||
VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
_viv_asm(COPY, read_coord, read_val, 16);
|
||||
|
||||
float4 fxy0;
|
||||
float4 fxy1;
|
||||
|
||||
VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4);
|
||||
VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4);
|
||||
|
||||
GRID_SAMPLE_F16_PROCESS();
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,224 @@
|
|||
#include "cl_viv_vx_ext.h"
|
||||
|
||||
_viv_uniform float2 half_input0_wh;
|
||||
_viv_uniform float2 add_float_value;
|
||||
_viv_uniform int depth;
|
||||
_viv_uniform VXC_512Bits uniEvenBintoFp32_4x4;
|
||||
_viv_uniform VXC_512Bits uniOddSubEvenBin_4x4;
|
||||
_viv_uniform VXC_512Bits uniExtact8Bit_2x8;
|
||||
_viv_uniform float uint8Scale;
|
||||
_viv_uniform float output_ZP;
|
||||
_viv_uniform float4 span_wh;
|
||||
_viv_uniform float4 min_val_wh;
|
||||
|
||||
#define GRID_SAMPLE_F16_to_U8_PROCESS() \
|
||||
fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \
|
||||
fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \
|
||||
fxy0 = fabs(fxy0 - min_val_wh); \
|
||||
fxy1 = fabs(fxy1 - min_val_wh); \
|
||||
float4 flips_xy0 = floor(fxy0 / span_wh); \
|
||||
float4 flips_xy1 = floor(fxy1 / span_wh); \
|
||||
float4 extra_xy0 = fxy0 - flips_xy0 * span_wh; \
|
||||
float4 extra_xy1 = fxy1 - flips_xy1 * span_wh; \
|
||||
int4 flips_int_xy0 = convert_int4(flips_xy0); \
|
||||
int4 flips_int_xy1 = convert_int4(flips_xy1); \
|
||||
fxy0 = ((flips_int_xy0 % 2) == 0) ? min_val_wh + extra_xy0 : min_val_wh + (span_wh - extra_xy0); \
|
||||
fxy1 = ((flips_int_xy1 % 2) == 0) ? min_val_wh + extra_xy1 : min_val_wh + (span_wh - extra_xy1); \
|
||||
float4 in_x = (float4)(fxy0.xz, fxy1.xz); \
|
||||
float4 x_f = floor(in_x); \
|
||||
float4 x_lerp = in_x - x_f; \
|
||||
int4 x_idx = convert_int4(x_f); \
|
||||
float4 in_y = (float4)(fxy0.yw, fxy1.yw); \
|
||||
float4 y_f = floor(in_y); \
|
||||
float4 y_lerp = in_y - y_f; \
|
||||
int4 y_idx = convert_int4(y_f); \
|
||||
int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \
|
||||
int8 input_desc; \
|
||||
_viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \
|
||||
int baseAddr = input_desc.s0; \
|
||||
_viv_asm(MOV, coord_in.w, baseAddr); \
|
||||
vxc_short8 t0; \
|
||||
vxc_short8 b0; \
|
||||
vxc_uchar16 result; \
|
||||
vxc_half8 top; \
|
||||
vxc_half8 bottom; \
|
||||
VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.y; \
|
||||
coord_in.y = y_idx.y; \
|
||||
VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.z; \
|
||||
coord_in.y = y_idx.z; \
|
||||
VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.w; \
|
||||
coord_in.y = y_idx.w; \
|
||||
VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
_viv_asm(COPY, top, t0, 16); \
|
||||
_viv_asm(COPY, bottom, b0, 16); \
|
||||
float4 left4; \
|
||||
float4 right4; \
|
||||
float4 top4; \
|
||||
float4 bottom4; \
|
||||
int8 output_desc; \
|
||||
_viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
|
||||
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
|
||||
_viv_asm(MOV, coord_out.w, baseAddr); \
|
||||
int loop = depth - 1; \
|
||||
while (coord_in.z < loop) \
|
||||
{ \
|
||||
VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \
|
||||
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \
|
||||
top4 = right4 * x_lerp + left4; \
|
||||
VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \
|
||||
VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \
|
||||
bottom4 = right4 * x_lerp + left4; \
|
||||
bottom4 -= top4; \
|
||||
float4 dst4 = bottom4 * y_lerp + top4; \
|
||||
dst4 = dst4 * uint8Scale + output_ZP; \
|
||||
int4 dst = convert_int4_rte(dst4); \
|
||||
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \
|
||||
result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \
|
||||
coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \
|
||||
coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \
|
||||
coord_in.x = x_idx.x; \
|
||||
coord_in.y = y_idx.x; \
|
||||
VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.y; \
|
||||
coord_in.y = y_idx.y; \
|
||||
VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.z; \
|
||||
coord_in.y = y_idx.z; \
|
||||
VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.w; \
|
||||
coord_in.y = y_idx.w; \
|
||||
VXC_OP4(img_load_3d, t0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, b0, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
_viv_asm(COPY, top, t0, 16); \
|
||||
_viv_asm(COPY, bottom, b0, 16); \
|
||||
} \
|
||||
VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \
|
||||
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \
|
||||
top4 = right4 * x_lerp + left4; \
|
||||
VXC_DP4x4(left4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniEvenBintoFp32_4x4); \
|
||||
VXC_DP4x4(right4, bottom, bottom, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniOddSubEvenBin_4x4); \
|
||||
bottom4 = right4 * x_lerp + left4; \
|
||||
bottom4 -= top4; \
|
||||
float4 dst4 = bottom4 * y_lerp + top4; \
|
||||
dst4 = dst4 * uint8Scale + output_ZP; \
|
||||
int4 dst = convert_int4_rte(dst4); \
|
||||
VXC_DP2x8(result, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, \
|
||||
result, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
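The span_wh / min_val_wh block at the top of this macro implements GridSample's reflection padding: the distance from the lower reflection bound is folded by the span, and an odd number of folds mirrors the remainder back toward the bound. A scalar sketch under the same assumptions (lo and span depend on align_corners; the helper name is illustrative):

#include <math.h>

/* Scalar sketch of the reflect mapping above: coord is the unnormalized grid
 * coordinate, lo the lower reflection bound, span the reflection period. */
static inline float reflect_coord(float coord, float lo, float span)
{
    float d     = fabsf(coord - lo);
    float flips = floorf(d / span);
    float extra = d - flips * span;
    /* an even number of flips keeps the direction, an odd number mirrors it */
    return ((int)flips % 2 == 0) ? lo + extra : lo + (span - extra);
}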
|
||||
|
||||
|
||||
__kernel void bilinear_grid_sample_reflect_F16_F32toU8(
|
||||
__read_only image2d_array_t input0,
|
||||
__read_only image2d_t input1,
|
||||
__write_only image2d_array_t output,
|
||||
int align_corners)
|
||||
{
|
||||
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
|
||||
int4 coord_in1 = coord_out.xyxy;
|
||||
|
||||
coord_in1.xz = coord_in1.xz * 2;
|
||||
coord_in1.z = coord_in1.z + 4;
|
||||
|
||||
float4 fxy0 = read_imagef(input1, coord_in1.xy);
|
||||
float4 fxy1 = read_imagef(input1, coord_in1.zw);
|
||||
GRID_SAMPLE_F16_to_U8_PROCESS();
|
||||
|
||||
}
|
||||
|
||||
_viv_uniform int input1_ZP;
|
||||
_viv_uniform float input1Scale;
|
||||
_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part0_4x4;
|
||||
_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4;
|
||||
|
||||
|
||||
__kernel void bilinear_grid_sample_reflect_F16_U8toU8(
|
||||
__read_only image2d_array_t input0,
|
||||
__read_only image2d_array_t input1,
|
||||
__write_only image2d_array_t output,
|
||||
int align_corners)
|
||||
{
|
||||
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
|
||||
int4 coord_in1 = coord_out.xyxy;
|
||||
|
||||
coord_in1.xz = coord_in1.xz * 2;
|
||||
|
||||
vxc_uchar16 read_coord;
|
||||
|
||||
VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
float4 fxy0;
|
||||
float4 fxy1;
|
||||
|
||||
unsigned char input1ZP;
|
||||
_viv_asm(COPY, input1ZP, input1_ZP, 4);
|
||||
|
||||
VXC_DP4x4(fxy0, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part0_4x4);
|
||||
VXC_DP4x4(fxy1, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4);
|
||||
|
||||
fxy0 = fxy0 * input1Scale;
|
||||
fxy1 = fxy1 * input1Scale;
|
||||
|
||||
GRID_SAMPLE_F16_to_U8_PROCESS();
|
||||
|
||||
}
|
||||
|
||||
_viv_uniform VXC_512Bits uniFp16toFp32_part0_4x4;
|
||||
_viv_uniform VXC_512Bits uniFp16toFp32_part1_4x4;
|
||||
|
||||
__kernel void bilinear_grid_sample_reflect_F16_F16toU8(
|
||||
__read_only image2d_array_t input0,
|
||||
__read_only image2d_t input1,
|
||||
__write_only image2d_array_t output,
|
||||
int align_corners)
|
||||
{
|
||||
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
|
||||
int4 coord_in1 = coord_out.xyxy;
|
||||
|
||||
coord_in1.xz = coord_in1.xz * 2;
|
||||
|
||||
vxc_short8 read_val;
|
||||
vxc_half8 read_coord;
|
||||
|
||||
VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
_viv_asm(COPY, read_coord, read_val, 16);
|
||||
|
||||
float4 fxy0;
|
||||
float4 fxy1;
|
||||
|
||||
VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4);
|
||||
VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4);
|
||||
|
||||
GRID_SAMPLE_F16_to_U8_PROCESS();
|
||||
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,160 @@
|
|||
#include "cl_viv_vx_ext.h"
|
||||
|
||||
_viv_uniform float2 half_input0_wh;
|
||||
_viv_uniform float2 add_float_value;
|
||||
_viv_uniform int depth;
|
||||
_viv_uniform VXC_512Bits uniDFPtoFp32_left_4x4;
|
||||
_viv_uniform VXC_512Bits uniRightSubLeft_4x4;
|
||||
_viv_uniform VXC_512Bits uniExtact8Bit_2x8;
|
||||
_viv_uniform VXC_512Bits uniDFPtoFp32_part0_4x4;
|
||||
_viv_uniform VXC_512Bits uniDFPtoFp32_part1_4x4;
|
||||
_viv_uniform float input1_scale;
|
||||
_viv_uniform float dfpScale;
|
||||
_viv_uniform float4 span_wh;
|
||||
_viv_uniform float4 min_val_wh;
|
||||
|
||||
#define GRID_SAMPLE_I16_PROCESS() \
|
||||
fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \
|
||||
fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \
|
||||
fxy0 = fabs(fxy0 - min_val_wh); \
|
||||
fxy1 = fabs(fxy1 - min_val_wh); \
|
||||
float4 flips_xy0 = floor(fxy0 / span_wh); \
|
||||
float4 flips_xy1 = floor(fxy1 / span_wh); \
|
||||
float4 extra_xy0 = fxy0 - flips_xy0 * span_wh; \
|
||||
float4 extra_xy1 = fxy1 - flips_xy1 * span_wh; \
|
||||
int4 flips_int_xy0 = convert_int4(flips_xy0); \
|
||||
int4 flips_int_xy1 = convert_int4(flips_xy1); \
|
||||
fxy0 = ((flips_int_xy0 % 2) == 0) ? min_val_wh + extra_xy0 : min_val_wh + (span_wh - extra_xy0); \
|
||||
fxy1 = ((flips_int_xy1 % 2) == 0) ? min_val_wh + extra_xy1 : min_val_wh + (span_wh - extra_xy1); \
|
||||
float4 in_x = (float4)(fxy0.xz, fxy1.xz); \
|
||||
float4 x_f = floor(in_x); \
|
||||
float4 x_lerp = in_x - x_f; \
|
||||
int4 x_idx = convert_int4(x_f); \
|
||||
float4 in_y = (float4)(fxy0.yw, fxy1.yw); \
|
||||
float4 y_f = floor(in_y); \
|
||||
float4 y_lerp = in_y - y_f; \
|
||||
int4 y_idx = convert_int4(y_f); \
|
||||
int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \
|
||||
int8 input_desc; \
|
||||
_viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \
|
||||
int baseAddr = input_desc.s0; \
|
||||
_viv_asm(MOV, coord_in.w, baseAddr); \
|
||||
vxc_short8 top; \
|
||||
vxc_short8 bottom; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.y; \
|
||||
coord_in.y = y_idx.y; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.z; \
|
||||
coord_in.y = y_idx.z; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.w; \
|
||||
coord_in.y = y_idx.w; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
float4 left4; \
|
||||
float4 right4; \
|
||||
float4 top4; \
|
||||
float4 bottom4; \
|
||||
int8 output_desc; \
|
||||
_viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
|
||||
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
|
||||
_viv_asm(MOV, coord_out.w, baseAddr); \
|
||||
int loop = depth - 1; \
|
||||
while (coord_in.z < loop) \
|
||||
{ \
|
||||
VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \
|
||||
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \
|
||||
top4 = right4 * x_lerp + left4; \
|
||||
VXC_DP4x4(left4, bottom, bottom, \
|
||||
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \
|
||||
VXC_DP4x4(right4, bottom, bottom, \
|
||||
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \
|
||||
bottom4 = right4 * x_lerp + left4; \
|
||||
bottom4 -= top4; \
|
||||
float4 dst4 = bottom4 * y_lerp + top4; \
|
||||
dst4 = dst4 * dfpScale; \
|
||||
int4 dst = convert_int4_rte(dst4); \
|
||||
VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \
|
||||
coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \
|
||||
coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \
|
||||
coord_in.x = x_idx.x; \
|
||||
coord_in.y = y_idx.x; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.y; \
|
||||
coord_in.y = y_idx.y; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.z; \
|
||||
coord_in.y = y_idx.z; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.w; \
|
||||
coord_in.y = y_idx.w; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
} \
|
||||
VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \
|
||||
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \
|
||||
top4 = right4 * x_lerp + left4; \
|
||||
VXC_DP4x4(left4, bottom, bottom, \
|
||||
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \
|
||||
VXC_DP4x4(right4, bottom, bottom, \
|
||||
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \
|
||||
bottom4 = right4 * x_lerp + left4; \
|
||||
bottom4 -= top4; \
|
||||
float4 dst4 = bottom4 * y_lerp + top4; \
|
||||
dst4 = dst4 * dfpScale; \
|
||||
int4 dst = convert_int4_rte(dst4); \
|
||||
VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
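Before the 8/16-bit pack, dst4 * dfpScale converts the FP32 interpolation result back into the dynamic-fixed-point output format. Assuming dfpScale is precomputed on the host as the combined input/output fixed-point rescale (the host-side setup is not shown in this hunk), the per-element effect is the usual DFP quantization, sketched here with a hypothetical helper:

#include <math.h>

/* Illustrative only: quantize an FP32 value to a dynamic-fixed-point integer
 * with 'fl' fractional bits; dfpScale above would fold 2^fl and any rescale. */
static inline int dfp_quantize(float value, int fl)
{
    return (int)lrintf(value * (float)(1 << fl)); /* round to nearest */
}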
|
||||
|
||||
|
||||
|
||||
__kernel void bilinear_grid_sample_reflect_I16_I16toI16(
|
||||
__read_only image2d_array_t input0,
|
||||
__read_only image2d_t input1,
|
||||
__write_only image2d_array_t output,
|
||||
int align_corners)
|
||||
{
|
||||
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
|
||||
int4 coord_in1 = coord_out.xyxy;
|
||||
|
||||
coord_in1.xz = coord_in1.xz * 2;
|
||||
vxc_short8 read_coord;
|
||||
VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
float4 fxy0;
|
||||
float4 fxy1;
|
||||
|
||||
VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part0_4x4);
|
||||
VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part1_4x4);
|
||||
|
||||
fxy0 = fxy0 * input1_scale;
|
||||
fxy1 = fxy1 * input1_scale;
|
||||
|
||||
GRID_SAMPLE_I16_PROCESS();
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,160 @@
|
|||
#include "cl_viv_vx_ext.h"
|
||||
|
||||
_viv_uniform float2 half_input0_wh;
|
||||
_viv_uniform float2 add_float_value;
|
||||
_viv_uniform int depth;
|
||||
_viv_uniform VXC_512Bits uniDFPtoFp32_left_4x4;
|
||||
_viv_uniform VXC_512Bits uniRightSubLeft_4x4;
|
||||
_viv_uniform VXC_512Bits uniExtact8Bit_2x8;
|
||||
_viv_uniform VXC_512Bits uniDFPtoFp32_part0_4x4;
|
||||
_viv_uniform VXC_512Bits uniDFPtoFp32_part1_4x4;
|
||||
_viv_uniform float input1_scale;
|
||||
_viv_uniform float dfpScale;
|
||||
_viv_uniform float4 span_wh;
|
||||
_viv_uniform float4 min_val_wh;
|
||||
|
||||
#define GRID_SAMPLE_I8_PROCESS() \
|
||||
fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \
|
||||
fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \
|
||||
fxy0 = fabs(fxy0 - min_val_wh); \
|
||||
fxy1 = fabs(fxy1 - min_val_wh); \
|
||||
float4 flips_xy0 = floor(fxy0 / span_wh); \
|
||||
float4 flips_xy1 = floor(fxy1 / span_wh); \
|
||||
float4 extra_xy0 = fxy0 - flips_xy0 * span_wh; \
|
||||
float4 extra_xy1 = fxy1 - flips_xy1 * span_wh; \
|
||||
int4 flips_int_xy0 = convert_int4(flips_xy0); \
|
||||
int4 flips_int_xy1 = convert_int4(flips_xy1); \
|
||||
fxy0 = ((flips_int_xy0 % 2) == 0) ? min_val_wh + extra_xy0 : min_val_wh + (span_wh - extra_xy0); \
|
||||
fxy1 = ((flips_int_xy1 % 2) == 0) ? min_val_wh + extra_xy1 : min_val_wh + (span_wh - extra_xy1); \
|
||||
float4 in_x = (float4)(fxy0.xz, fxy1.xz); \
|
||||
float4 x_f = floor(in_x); \
|
||||
float4 x_lerp = in_x - x_f; \
|
||||
int4 x_idx = convert_int4(x_f); \
|
||||
float4 in_y = (float4)(fxy0.yw, fxy1.yw); \
|
||||
float4 y_f = floor(in_y); \
|
||||
float4 y_lerp = in_y - y_f; \
|
||||
int4 y_idx = convert_int4(y_f); \
|
||||
int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \
|
||||
int8 input_desc; \
|
||||
_viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \
|
||||
int baseAddr = input_desc.s0; \
|
||||
_viv_asm(MOV, coord_in.w, baseAddr); \
|
||||
vxc_char16 top; \
|
||||
vxc_char16 bottom; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.y; \
|
||||
coord_in.y = y_idx.y; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.z; \
|
||||
coord_in.y = y_idx.z; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.w; \
|
||||
coord_in.y = y_idx.w; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
float4 left4; \
|
||||
float4 right4; \
|
||||
float4 top4; \
|
||||
float4 bottom4; \
|
||||
int8 output_desc; \
|
||||
_viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
|
||||
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
|
||||
_viv_asm(MOV, coord_out.w, baseAddr); \
|
||||
int loop = depth - 1; \
|
||||
while (coord_in.z < loop) \
|
||||
{ \
|
||||
VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \
|
||||
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \
|
||||
top4 = right4 * x_lerp + left4; \
|
||||
VXC_DP4x4(left4, bottom, bottom, \
|
||||
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \
|
||||
VXC_DP4x4(right4, bottom, bottom, \
|
||||
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \
|
||||
bottom4 = right4 * x_lerp + left4; \
|
||||
bottom4 -= top4; \
|
||||
float4 dst4 = bottom4 * y_lerp + top4; \
|
||||
dst4 = dst4 * dfpScale; \
|
||||
int4 dst = convert_int4_rte(dst4); \
|
||||
VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \
|
||||
coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \
|
||||
coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \
|
||||
coord_in.x = x_idx.x; \
|
||||
coord_in.y = y_idx.x; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.y; \
|
||||
coord_in.y = y_idx.y; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.z; \
|
||||
coord_in.y = y_idx.z; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.w; \
|
||||
coord_in.y = y_idx.w; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
} \
|
||||
VXC_DP4x4(left4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \
|
||||
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \
|
||||
top4 = right4 * x_lerp + left4; \
|
||||
VXC_DP4x4(left4, bottom, bottom, \
|
||||
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_left_4x4); \
|
||||
VXC_DP4x4(right4, bottom, bottom, \
|
||||
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniRightSubLeft_4x4); \
|
||||
bottom4 = right4 * x_lerp + left4; \
|
||||
bottom4 -= top4; \
|
||||
float4 dst4 = bottom4 * y_lerp + top4; \
|
||||
dst4 = dst4 * dfpScale; \
|
||||
int4 dst = convert_int4_rte(dst4); \
|
||||
VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
|
||||
|
||||
|
||||
|
||||
__kernel void bilinear_grid_sample_reflect_I8_I8toI8(
|
||||
__read_only image2d_array_t input0,
|
||||
__read_only image2d_t input1,
|
||||
__write_only image2d_array_t output,
|
||||
int align_corners)
|
||||
{
|
||||
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
|
||||
int4 coord_in1 = coord_out.xyxy;
|
||||
|
||||
coord_in1.xz = coord_in1.xz * 2;
|
||||
vxc_char16 read_coord;
|
||||
VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
float4 fxy0;
|
||||
float4 fxy1;
|
||||
|
||||
VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part0_4x4);
|
||||
VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniDFPtoFp32_part1_4x4);
|
||||
|
||||
fxy0 = fxy0 * input1_scale;
|
||||
fxy1 = fxy1 * input1_scale;
|
||||
|
||||
GRID_SAMPLE_I8_PROCESS();
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,224 @@
|
|||
#include "cl_viv_vx_ext.h"
|
||||
|
||||
_viv_uniform float2 half_input0_wh;
|
||||
_viv_uniform float2 add_float_value;
|
||||
_viv_uniform int depth;
|
||||
|
||||
_viv_uniform VXC_512Bits uniU8SubZPtoFp32_left_4x4;
|
||||
_viv_uniform VXC_512Bits uniU8RightSubLeft_4x4;
|
||||
_viv_uniform VXC_512Bits uniExtact8Bit_2x8;
|
||||
_viv_uniform int input_ZP;
|
||||
_viv_uniform float uint8Scale;
|
||||
_viv_uniform float output_ZP;
|
||||
_viv_uniform int input1_ZP;
|
||||
_viv_uniform float input1Scale;
|
||||
_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part0_4x4;
|
||||
_viv_uniform VXC_512Bits uniU8SubZPtoFp32_part1_4x4;
|
||||
_viv_uniform float4 span_wh;
|
||||
_viv_uniform float4 min_val_wh;
|
||||
|
||||
#define GRID_SAMPLE_U8_PROCESS() \
|
||||
fxy0 = fxy0 * half_input0_wh.xyxy + add_float_value.xyxy; \
|
||||
fxy1 = fxy1 * half_input0_wh.xyxy + add_float_value.xyxy; \
|
||||
fxy0 = fabs(fxy0 - min_val_wh); \
|
||||
fxy1 = fabs(fxy1 - min_val_wh); \
|
||||
float4 flips_xy0 = floor(fxy0 / span_wh); \
|
||||
float4 flips_xy1 = floor(fxy1 / span_wh); \
|
||||
float4 extra_xy0 = fxy0 - flips_xy0 * span_wh; \
|
||||
float4 extra_xy1 = fxy1 - flips_xy1 * span_wh; \
|
||||
int4 flips_int_xy0 = convert_int4(flips_xy0); \
|
||||
int4 flips_int_xy1 = convert_int4(flips_xy1); \
|
||||
fxy0 = ((flips_int_xy0 % 2) == 0) ? min_val_wh + extra_xy0 : min_val_wh + (span_wh - extra_xy0); \
|
||||
fxy1 = ((flips_int_xy1 % 2) == 0) ? min_val_wh + extra_xy1 : min_val_wh + (span_wh - extra_xy1); \
|
||||
float4 in_x = (float4)(fxy0.xz, fxy1.xz); \
|
||||
float4 x_f = floor(in_x); \
|
||||
float4 x_lerp = in_x - x_f; \
|
||||
int4 x_idx = convert_int4(x_f); \
|
||||
float4 in_y = (float4)(fxy0.yw, fxy1.yw); \
|
||||
float4 y_f = floor(in_y); \
|
||||
float4 y_lerp = in_y - y_f; \
|
||||
int4 y_idx = convert_int4(y_f); \
|
||||
int4 coord_in = (int4)(x_idx.x, y_idx.x, 0, 0); \
|
||||
int8 input_desc; \
|
||||
_viv_asm(COPY, input_desc, input0, sizeof(input_desc)); \
|
||||
int baseAddr = input_desc.s0; \
|
||||
_viv_asm(MOV, coord_in.w, baseAddr); \
|
||||
vxc_uchar16 top; \
|
||||
vxc_uchar16 bottom; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.y; \
|
||||
coord_in.y = y_idx.y; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.z; \
|
||||
coord_in.y = y_idx.z; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.w; \
|
||||
coord_in.y = y_idx.w; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
float4 left4; \
|
||||
float4 right4; \
|
||||
float4 top4; \
|
||||
float4 bottom4; \
|
||||
int8 output_desc; \
|
||||
_viv_asm(COPY, output_desc, output, sizeof(output_desc)); \
|
||||
baseAddr = (int)coord_out.z * output_desc.s4 + output_desc.s0; \
|
||||
_viv_asm(MOV, coord_out.w, baseAddr); \
|
||||
int loop = depth - 1; \
|
||||
while (coord_in.z < loop) \
|
||||
{ \
|
||||
unsigned char inputZP; \
|
||||
_viv_asm(COPY, inputZP, input_ZP, 4); \
|
||||
VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); \
|
||||
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); \
|
||||
top4 = right4 * x_lerp + left4; \
|
||||
VXC_DP4x4(left4, bottom, inputZP, \
|
||||
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); \
|
||||
VXC_DP4x4(right4, bottom, bottom, \
|
||||
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); \
|
||||
bottom4 = right4 * x_lerp + left4; \
|
||||
bottom4 -= top4; \
|
||||
float4 dst4 = bottom4 * y_lerp + top4; \
|
||||
dst4 = dst4 * uint8Scale + output_ZP; \
|
||||
int4 dst = convert_int4_rte(dst4); \
|
||||
VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0)); \
|
||||
coord_out.zw = coord_out.zw + (int2)(1, output_desc.s4); \
|
||||
coord_in.zw = coord_in.zw + (int2)(1, input_desc.s4); \
|
||||
coord_in.x = x_idx.x; \
|
||||
coord_in.y = y_idx.x; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.y; \
|
||||
coord_in.y = y_idx.y; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(2, 3, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.z; \
|
||||
coord_in.y = y_idx.z; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(4, 5, 0, VXC_RM_TowardZero, 0)); \
|
||||
coord_in.x = x_idx.w; \
|
||||
coord_in.y = y_idx.w; \
|
||||
VXC_OP4(img_load_3d, top, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 0), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
VXC_OP4(img_load_3d, bottom, input0, coord_in.xywz, \
|
||||
VXC_5BITOFFSET_XY(0, 1), VXC_MODIFIER(6, 7, 0, VXC_RM_TowardZero, 0)); \
|
||||
} \
|
||||
unsigned char inputZP; \
|
||||
_viv_asm(COPY, inputZP, input_ZP, 4); \
|
||||
VXC_DP4x4(left4, top, inputZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); \
|
||||
VXC_DP4x4(right4, top, top, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); \
|
||||
top4 = right4 * x_lerp + left4; \
|
||||
VXC_DP4x4(left4, bottom, inputZP, \
|
||||
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_left_4x4); \
|
||||
VXC_DP4x4(right4, bottom, bottom, \
|
||||
VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8RightSubLeft_4x4); \
|
||||
bottom4 = right4 * x_lerp + left4; \
|
||||
bottom4 -= top4; \
|
||||
float4 dst4 = bottom4 * y_lerp + top4; \
|
||||
dst4 = dst4 * uint8Scale + output_ZP; \
|
||||
int4 dst = convert_int4_rte(dst4); \
|
||||
VXC_DP2x8(top, dst, dst, VXC_MODIFIER(0, 3, 0, VXC_RM_ToNearestEven, 1), uniExtact8Bit_2x8); \
|
||||
VXC_OP4_NoDest(img_store_3d, output, coord_out.xywz, top, VXC_MODIFIER(0, 3, 0,VXC_RM_TowardZero, 0));
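The U8 path works in affine-quantized space at both ends: the uniU8SubZPtoFp32 products subtract the input zero point before interpolating, and the pack step re-applies uint8Scale and output_ZP. A scalar sketch of the (de)quantization this relies on, with hypothetical helper names:

/* Illustrative affine U8 (de)quantization used by the macro above:
 * real = scale * (q - zero_point), q = real * inv_scale + zero_point. */
static inline float u8_dequant(unsigned char q, float scale, int zero_point)
{
    return scale * (float)((int)q - zero_point);
}

static inline unsigned char u8_requant(float real, float inv_scale, float zero_point)
{
    float q = real * inv_scale + zero_point; /* uint8Scale / output_ZP above */
    if (q < 0.0f)   q = 0.0f;
    if (q > 255.0f) q = 255.0f;
    return (unsigned char)(q + 0.5f);
}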
|
||||
|
||||
|
||||
__kernel void bilinear_grid_sample_reflect_U8_F32toU8(
|
||||
__read_only image2d_array_t input0,
|
||||
__read_only image2d_t input1,
|
||||
__write_only image2d_array_t output,
|
||||
int align_corners)
|
||||
{
|
||||
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
|
||||
int4 coord_in1 = coord_out.xyxy;
|
||||
|
||||
coord_in1.xz = coord_in1.xz * 2;
|
||||
coord_in1.z = coord_in1.z + 4;
|
||||
|
||||
float4 fxy0 = read_imagef(input1, coord_in1.xy);
|
||||
float4 fxy1 = read_imagef(input1, coord_in1.zw);
|
||||
GRID_SAMPLE_U8_PROCESS();
|
||||
|
||||
}
|
||||
|
||||
|
||||
__kernel void bilinear_grid_sample_reflect_U8_U8toU8(
|
||||
__read_only image2d_array_t input0,
|
||||
__read_only image2d_array_t input1,
|
||||
__write_only image2d_array_t output,
|
||||
int align_corners)
|
||||
{
|
||||
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
|
||||
int4 coord_in1 = coord_out.xyxy;
|
||||
|
||||
coord_in1.xz = coord_in1.xz * 2;
|
||||
|
||||
vxc_uchar16 read_coord;
|
||||
|
||||
VXC_ReadImage(read_coord, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
float4 fxy0;
|
||||
float4 fxy1;
|
||||
|
||||
unsigned char input1ZP;
|
||||
_viv_asm(COPY, input1ZP, input1_ZP, 4);
|
||||
|
||||
VXC_DP4x4(fxy0, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part0_4x4);
|
||||
VXC_DP4x4(fxy1, read_coord, input1ZP, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniU8SubZPtoFp32_part1_4x4);
|
||||
|
||||
fxy0 = fxy0 * input1Scale;
|
||||
fxy1 = fxy1 * input1Scale;
|
||||
|
||||
GRID_SAMPLE_U8_PROCESS();
|
||||
|
||||
}
|
||||
|
||||
_viv_uniform VXC_512Bits uniFp16toFp32_part0_4x4;
|
||||
_viv_uniform VXC_512Bits uniFp16toFp32_part1_4x4;
|
||||
|
||||
__kernel void bilinear_grid_sample_reflect_U8_F16toU8(
|
||||
__read_only image2d_array_t input0,
|
||||
__read_only image2d_t input1,
|
||||
__write_only image2d_array_t output,
|
||||
int align_corners)
|
||||
{
|
||||
int4 coord_out = (int4)(get_global_id(0), get_global_id(1), 0, 0);
|
||||
int4 coord_in1 = coord_out.xyxy;
|
||||
|
||||
coord_in1.xz = coord_in1.xz * 2;
|
||||
|
||||
vxc_short8 read_val;
|
||||
vxc_half8 read_coord;
|
||||
|
||||
VXC_ReadImage(read_val, input1, coord_in1.xy, VXC_5BITOFFSET_XY(0, 0), \
|
||||
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));
|
||||
|
||||
_viv_asm(COPY, read_coord, read_val, 16);
|
||||
|
||||
float4 fxy0;
|
||||
float4 fxy1;
|
||||
|
||||
VXC_DP4x4(fxy0, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part0_4x4);
|
||||
VXC_DP4x4(fxy1, read_coord, read_coord, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniFp16toFp32_part1_4x4);
|
||||
|
||||
GRID_SAMPLE_U8_PROCESS();
|
||||
|
||||
}
|
||||
|
||||

|
|
@ -39,6 +39,7 @@
|
|||
#include "vsi_nn_internal_node.h"
|
||||
#include "utils/vsi_nn_constraint_check.h"
|
||||
#include "utils/vsi_nn_dtype_util.h"
|
||||
#include "utils/vsi_nn_dtype_util_prv.h"
|
||||
#include "vsi_nn_error.h"
|
||||
|
||||
#define _INPUT_NUM (1)
|
||||
|
|
@ -199,14 +200,11 @@ static vsi_bool op_setup
|
|||
vsi_nn_internal_node_t* curr = NULL;
|
||||
float min = self->nn_param.clip.min;
|
||||
float max = self->nn_param.clip.max;
|
||||
uint32_t infinity = VSI_NN_FLOAT32_INF;
|
||||
float neg_infinity = -*(float*)&infinity;
|
||||
int32_t max_float = *(int32_t*)&max;
|
||||
|
||||
if ( (min == -1.0f && max == 1.0f)
|
||||
|| (min == 0.0f && max == 6.0f)
|
||||
|| (min == 0.0f && max_float == VSI_NN_FLOAT32_INF)
|
||||
|| (min == neg_infinity && max_float == VSI_NN_FLOAT32_INF))
|
||||
|| (min == 0.0f && fp32_is_inf(max))
|
||||
|| (fp32_is_inf(-min) && fp32_is_inf(max)))
|
||||
{
|
||||
vsi_nn_internal_init_node_wksp(self);
|
||||
if (min == -1.0f && max == 1.0f)
|
||||
|
|
@ -217,7 +215,7 @@ static vsi_bool op_setup
|
|||
{
|
||||
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RELU6, 0, 0);
|
||||
}
|
||||
else if (min == 0.0f && max_float == VSI_NN_FLOAT32_INF)
|
||||
else if (min == 0.0f && fp32_is_inf(max))
|
||||
{
|
||||
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RELU, 0, 0);
|
||||
}
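The clip setup now relies on fp32_is_inf() instead of comparing the raw bit pattern against VSI_NN_FLOAT32_INF. For illustration only (this is not necessarily how the ovxlib helper is implemented), an IEEE-754 single-precision infinity test on the bit pattern looks like this:

#include <stdint.h>
#include <string.h>

/* Illustrative +/- infinity check: exponent all ones, mantissa zero. */
static inline int fp32_is_inf_sketch(float value)
{
    uint32_t bits;
    memcpy(&bits, &value, sizeof(bits));
    return (bits & 0x7FFFFFFFu) == 0x7F800000u;
}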
|
||||
|
|
|
|||
|
|
@ -430,10 +430,13 @@ static vsi_bool op_setup
|
|||
// create activation output/hstate_output/cstate_output
|
||||
vsi_nn_internal_init_tensor_attr(&attr, &outputs[CONV2D_LSTM_CELL_OUT_OUTPUT]->attr.dtype, TRUE);
|
||||
act_out = vsi_nn_internal_new_tensor(self, &attr, 0.0f);
|
||||
CHECK_PTR_FAIL_GOTO(act_out, "Create internal tensor failed", final);
|
||||
vsi_nn_internal_init_tensor_attr(&attr, &outputs[CONV2D_LSTM_CELL_OUT_H_STATE]->attr.dtype, TRUE);
|
||||
act_h_out = vsi_nn_internal_new_tensor(self, &attr, 0.0f);
|
||||
CHECK_PTR_FAIL_GOTO(act_h_out, "Create internal tensor failed", final);
|
||||
vsi_nn_internal_init_tensor_attr(&attr, &outputs[CONV2D_LSTM_CELL_OUT_C_STATE]->attr.dtype, TRUE);
|
||||
act_c_out = vsi_nn_internal_new_tensor(self, &attr, 0.0f);
|
||||
CHECK_PTR_FAIL_GOTO(act_c_out, "Create internal tensor failed", final);
|
||||
curr->outputs[LSTMUNIT_ACT_OUTPUT] = act_out->t;
|
||||
curr->outputs[LSTMUNIT_ACT_HSTATE_OUT] = act_h_out->t;
|
||||
curr->outputs[LSTMUNIT_ACT_CSTATE_OUT] = act_c_out->t;
|
||||
|
|
|
|||
|
|
@ -89,7 +89,7 @@ static vsi_status op_grouped_compute
|
|||
if (NULL == LOCAL()->weight_tensor_group)
|
||||
{
|
||||
VSILOGE("Malloc fail, (GROUPED_DECONV2D) at [%s : %d]\n", __FILE__, __LINE__);
|
||||
return VSI_FAILURE;
|
||||
goto final;
|
||||
}
|
||||
memset(LOCAL()->weight_tensor_group, 0, group * sizeof(vsi_nn_tensor_t *));
|
||||
res = vsi_nn_CreateTensorGroup(self->graph, inputs[1], 2,
|
||||
|
|
@ -325,8 +325,8 @@ static vsi_status op_compute
|
|||
#endif
|
||||
// param.a_x = self->nn_param.deconv.dilation;
|
||||
// param.a_y = self->nn_param.deconv.dilation;
|
||||
param.ext.khr.a_x = 1;
|
||||
param.ext.khr.a_y = 1;
|
||||
param.ext.khr.a_x = (size_t)self->nn_param.deconv.output_padding[0];
|
||||
param.ext.khr.a_y = (size_t)self->nn_param.deconv.output_padding[1];
|
||||
param.ext.khr.padding_x = (size_t)self->nn_param.deconv.pad[0];
|
||||
param.ext.khr.padding_y = (size_t)self->nn_param.deconv.pad[2];
|
||||
param.ext.khr.overflow_policy = self->vx_param.overflow_policy;
|
||||
|
|
@ -336,6 +336,7 @@ static vsi_status op_compute
|
|||
param.ext.channel_group = self->nn_param.deconv.group;
|
||||
param.stride_x = self->nn_param.deconv.stride[0];
|
||||
param.stride_y = self->nn_param.deconv.stride[1];
|
||||
param.down_scale_size_rounding = self->vx_param.down_scale_size_rounding;
|
||||
//param.border_mode;
|
||||
//param.border_const;
|
||||
|
||||
|
|
|
|||
|
|
@ -66,7 +66,7 @@ static vsi_status op_compute
|
|||
input_tensor[1] = tmp_tensor;
|
||||
|
||||
self->n = (vx_node)vsi_nn_kernel_selector( self->graph,
|
||||
"signal_frame",
|
||||
"extra_ending",
|
||||
input_tensor, 2,
|
||||
outputs, 1, NULL );
|
||||
|
||||
|
|
|
|||
|
|
@ -65,9 +65,15 @@ static vsi_status op_compute
|
|||
}
|
||||
else
|
||||
{
|
||||
#define _TENSOR_LEN 64
|
||||
vsi_nn_tensor_attr_t attr;
|
||||
vsi_nn_tensor_t* temp_tensors = NULL;
|
||||
|
||||
char gather_tensor_name[_TENSOR_LEN];
|
||||
char copy_tensor_name[_TENSOR_LEN];
|
||||
memset(gather_tensor_name, 0, sizeof(gather_tensor_name));
|
||||
memset(copy_tensor_name, 0, sizeof(copy_tensor_name));
|
||||
|
||||
VSILOGW("gather is no_range_change operation! \
|
||||
Insert DataConvert Operation when the quantization parameters of input and output are inconsistent!");
|
||||
|
||||
|
|
@ -78,7 +84,20 @@ static vsi_status op_compute
|
|||
temp_tensors = vsi_nn_CreateTensor( self->graph, &attr );
|
||||
|
||||
vsi_nn_kernel_selector( self->graph, "gather", inputs, 2, &temp_tensors, 1, param );
|
||||
snprintf(gather_tensor_name, sizeof(gather_tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, 0);
|
||||
if(vxSetReferenceName((vx_reference)temp_tensors->t, gather_tensor_name) == VSI_FAILURE)
|
||||
{
|
||||
VSILOGW("Set uid %u gather node output name fail", self->uid);
|
||||
return VSI_FAILURE;
|
||||
}
|
||||
|
||||
n = vxTensorCopyNode( self->graph->g, temp_tensors->t, outputs[0]->t);
|
||||
snprintf(copy_tensor_name, sizeof(copy_tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, 1);
|
||||
if(vxSetReferenceName((vx_reference)outputs[0]->t, copy_tensor_name) == VSI_FAILURE)
|
||||
{
|
||||
VSILOGW("Set uid %u copy node output name fail", self->uid);
|
||||
return VSI_FAILURE;
|
||||
}
|
||||
|
||||
vsi_safe_release_tensor(temp_tensors);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -38,6 +38,7 @@
|
|||
#include "kernel/vsi_nn_kernel.h"
|
||||
#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h"
|
||||
#include "utils/vsi_nn_constraint_check.h"
|
||||
#include "vsi_nn_error.h"
|
||||
|
||||
typedef struct _gather_elements_local_data_t {
|
||||
int32_t placeholder;
|
||||
|
|
@ -101,6 +102,7 @@ static vsi_status op_compute
|
|||
attr.is_const = FALSE;
|
||||
attr.vtl = TRUE;
|
||||
temp_tensors = vsi_nn_CreateTensor( self->graph, &attr );
|
||||
CHECK_PTR_FAIL_GOTO( temp_tensors, "Create tensor fail.", final );
|
||||
}
|
||||
else
|
||||
{
|
||||
|
|
@ -148,6 +150,7 @@ static vsi_status op_compute
|
|||
vsi_safe_release_tensor(temp_tensors);
|
||||
}
|
||||
|
||||
final:
|
||||
vsi_nn_kernel_param_release( ¶m );
|
||||
|
||||
if ( self->n )
|
||||
|
|
|
|||
|
|
@ -53,11 +53,13 @@ static vsi_status op_compute
|
|||
|
||||
vsi_nn_kernel_param_t* param = NULL;
|
||||
int32_t align_corners = self->nn_param.gridsample.align_corners;
|
||||
int32_t pad_mode = (int32_t)self->nn_param.gridsample.padding_mode;
|
||||
vsi_nn_kernel_node_t n;
|
||||
char kernel_name[128];
|
||||
|
||||
param = vsi_nn_kernel_param_create();
|
||||
vsi_nn_kernel_param_add_int32(param, "align_corners", align_corners);
|
||||
vsi_nn_kernel_param_add_int32(param, "padding_mode", pad_mode);
|
||||
|
||||
switch (self->nn_param.gridsample.mode) {
|
||||
case VSI_NN_INTERPOLATION_BILINEAR:
|
||||
|
|
@ -103,13 +105,20 @@ static vsi_bool op_check
|
|||
return FALSE;
|
||||
}
|
||||
|
||||
if (!((VSI_NN_PAD_MODE_CONSTANT ==
|
||||
if ((VSI_NN_PAD_MODE_CONSTANT ==
|
||||
self->nn_param.gridsample.padding_mode) &&
|
||||
(0 == self->nn_param.gridsample.const_val))) {
|
||||
(0 != self->nn_param.gridsample.const_val)) {
|
||||
VSILOGE("Only support padding const 0 now!");
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
|
||||
if (VSI_NN_PAD_MODE_SYMMETRIC ==
|
||||
self->nn_param.gridsample.padding_mode) {
|
||||
VSILOGE("Can't support VSI_NN_PAD_MODE_SYMMETRIC now!");
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
return TRUE;
|
||||
} /* op_check() */
|
||||
|
||||
|
|
@ -124,6 +133,11 @@ static vsi_bool op_setup
|
|||
return FALSE;
|
||||
}
|
||||
|
||||
if (2 != inputs[1]->attr.size[0])
|
||||
{
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
if (VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num) {
|
||||
outputs[0]->attr.dim_num = inputs[0]->attr.dim_num;
|
||||
outputs[0]->attr.size[0] = inputs[1]->attr.size[1];
|
||||
|
|
@ -133,6 +147,16 @@ static vsi_bool op_setup
|
|||
outputs[0]->attr.size[3] = inputs[0]->attr.size[3];
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if ((outputs[0]->attr.dim_num != inputs[0]->attr.dim_num)
|
||||
|| (outputs[0]->attr.size[0] != inputs[1]->attr.size[1])
|
||||
|| (outputs[0]->attr.size[1] != inputs[1]->attr.size[2]))
|
||||
{
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return TRUE;
|
||||
} /* op_setup() */
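The added checks pin down the expected shapes: the grid tensor's innermost dimension must be 2 (one (x, y) pair per output location), and the output's spatial dimensions must match the grid's. Since ovxlib stores sizes innermost-first, the constraint can be sketched as below (illustrative helper and shape comments, not taken from the diff):

#include <stdbool.h>
#include <stddef.h>

/* Mirrors the shape checks above: grid = {2, W_out, H_out, ...},
 * output = {W_out, H_out, ...}. */
static bool gridsample_shapes_ok(const size_t *grid_size, const size_t *out_size)
{
    return grid_size[0] == 2 &&
           out_size[0]  == grid_size[1] &&
           out_size[1]  == grid_size[2];
}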
|
||||
|
|
|
|||
|
|
@ -121,6 +121,7 @@ static vsi_status op_compute
|
|||
vsi_nn_tensor_t ** outputs
|
||||
)
|
||||
{
|
||||
#define _TENSOR_LEN 64
|
||||
vsi_status status = VSI_FAILURE;
|
||||
vsi_nn_kernel_param_t * param = NULL;
|
||||
vsi_nn_kernel_node_t n = NULL;
|
||||
|
|
@ -129,15 +130,36 @@ static vsi_status op_compute
|
|||
vsi_size_t new_rank = 0;
|
||||
vsi_nn_tensor_t * tmp_tensors[4] = {NULL};
|
||||
|
||||
char reshape0_tensor_name[_TENSOR_LEN];
|
||||
char reshape1_tensor_name[_TENSOR_LEN];
|
||||
char instance_norm_tensor_name[_TENSOR_LEN];
|
||||
|
||||
memset(reshape0_tensor_name, 0, sizeof(reshape0_tensor_name));
|
||||
memset(reshape1_tensor_name, 0, sizeof(reshape1_tensor_name));
|
||||
memset(instance_norm_tensor_name, 0, sizeof(instance_norm_tensor_name));
|
||||
|
||||
vsi_nn_optimize_instance_norm_shape(inputs[0]->attr.size, inputs[0]->attr.dim_num, shape, &new_rank);
|
||||
|
||||
tmp_tensors[0] = vsi_nn_kernel_insert_reshape_node( self->graph,
|
||||
inputs[0], shape, (uint32_t)new_rank, VSI_NN_OPTIMIZE_BACKWARD );
|
||||
snprintf(reshape0_tensor_name, sizeof(reshape0_tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, 0);
|
||||
if(vxSetReferenceName((vx_reference)tmp_tensors[0]->t, reshape0_tensor_name) == VSI_FAILURE)
|
||||
{
|
||||
VSILOGW("Set uid %u reshape 0 node output name fail", self->uid);
|
||||
return VSI_FAILURE;
|
||||
}
|
||||
tmp_tensors[1] = inputs[1];
|
||||
tmp_tensors[2] = inputs[2];
|
||||
tmp_tensors[3] = vsi_nn_kernel_insert_reshape_node( self->graph,
|
||||
outputs[0], shape, (uint32_t)new_rank, VSI_NN_OPTIMIZE_FORWARD );
|
||||
|
||||
snprintf(reshape1_tensor_name, sizeof(reshape1_tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, 1);
|
||||
if(vxSetReferenceName((vx_reference)outputs[0]->t, reshape1_tensor_name) == VSI_FAILURE)
|
||||
{
|
||||
VSILOGW("Set uid %u reshap 1 node output name fail", self->uid);
|
||||
return VSI_FAILURE;
|
||||
}
|
||||
|
||||
status = _try_set_high_presision_tensor(tmp_tensors);
|
||||
if (status != VSI_SUCCESS)
|
||||
{
|
||||
|
|
@ -155,6 +177,12 @@ static vsi_status op_compute
|
|||
self->n = (vx_node)n;
|
||||
status = VSI_SUCCESS;
|
||||
}
|
||||
snprintf(instance_norm_tensor_name, sizeof(instance_norm_tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, 2);
|
||||
if(vxSetReferenceName((vx_reference)tmp_tensors[3]->t, instance_norm_tensor_name) == VSI_FAILURE)
|
||||
{
|
||||
VSILOGW("Set uid %u instance_norm node output name fail", self->uid);
|
||||
return VSI_FAILURE;
|
||||
}
|
||||
|
||||
if (param != NULL)
|
||||
{
|
||||
|
|
|
|||
|
|
@ -259,9 +259,9 @@ static vsi_status op_deinit
|
|||
vsi_nn_node_t * self
|
||||
)
|
||||
{
|
||||
vsi_nn_safe_free(self->nn_param.layernorm.local);
|
||||
|
||||
#if (!VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT)
|
||||
vsi_nn_safe_free(self->nn_param.layernorm.local);
|
||||
vsi_nn_internal_deinit_node_wksp( self );
|
||||
#endif
|
||||
|
||||
|
|
|
|||
|
|
@ -161,14 +161,6 @@ static vsi_bool _check_is_sp_supported_type
|
|||
{
|
||||
int32_t * axes = self->nn_param.reduce.local2->axes;
|
||||
int32_t axes_num = self->nn_param.reduce.local2->axes_num;
|
||||
vsi_size_t shapes[4][VSI_NN_MAX_DIM_NUM] = { {0} };
|
||||
int32_t axis_in[VSI_NN_MAX_DIM_NUM] = {0};
|
||||
int32_t new_axis[VSI_NN_MAX_DIM_NUM] = {0};
|
||||
int32_t i = 0;
|
||||
uint32_t axis_size = 0;
|
||||
uint32_t rank_in = 0;
|
||||
uint32_t rank_out = 0;
|
||||
vsi_bool ret = FALSE;
|
||||
|
||||
if ( !self->graph->ctx->config.support_stream_processor ||
|
||||
(type != VSI_NN_REDUCE_SUM && type != VSI_NN_REDUCE_MEAN && type != VSI_NN_REDUCE_MAX) )
|
||||
|
|
@ -191,22 +183,15 @@ static vsi_bool _check_is_sp_supported_type
|
|||
return FALSE;
|
||||
}
|
||||
|
||||
for (i = 0; i < axes_num; i++)
|
||||
if ( (axes_num == 1 && (axes[0] == 0 || axes[0] == 2)) ||
|
||||
(axes_num == 2 && ((axes[0] < 2 && axes[1] < 2) || (axes[0] == 1 && axes[1] == 2))) )
|
||||
{
|
||||
shapes[0][i] = input->attr.size[axes[i]];
|
||||
shapes[1][i] = 1;
|
||||
axis_in[i] = i;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
ret = vsi_nn_kernel_optimize_reduce_shape(
|
||||
shapes[0], axes_num,
|
||||
axis_in, axes_num,
|
||||
shapes[1], axes_num,
|
||||
shapes[2], &rank_in, shapes[3], &rank_out,
|
||||
new_axis, &axis_size);
|
||||
|
||||
return ret && axis_size < 3;
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
static vsi_status op_compute
|
||||
(
|
||||
vsi_nn_node_t * self,
|
||||
|
|
@ -839,82 +824,28 @@ static vsi_bool op_set_sp_reduce_internal
|
|||
vsi_enum type_name
|
||||
)
|
||||
{
|
||||
vsi_nn_tensor_attr_t attr;
|
||||
vsi_nn_internal_tensor_t* tensor1 = NULL;
|
||||
vsi_nn_tensor_t* new_output = NULL;
|
||||
uint32_t* permute_in_perm = NULL;
|
||||
int32_t * new_axis = NULL;
|
||||
vsi_size_t shapes[VSI_NN_MAX_DIM_NUM] = {1};
|
||||
int32_t use_virtual_tensor = TRUE;
|
||||
vsi_nn_internal_node_t* tmp_inode = NULL;
|
||||
int32_t * axes = self->nn_param.reduce.local2->axes;
|
||||
int32_t axes_num = self->nn_param.reduce.local2->axes_num;
|
||||
int32_t i = 0, j = 0, index = 0;
|
||||
vsi_size_t reduce_size = 1;
|
||||
vsi_bool ret = FALSE;
|
||||
int32_t i = 0;
|
||||
|
||||
vsi_nn_internal_init_node_wksp( self );
|
||||
|
||||
memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
|
||||
memcpy(&attr.dtype, &inputs[0]->attr.dtype, sizeof(vsi_nn_dtype_t));
|
||||
attr.dim_num = VSI_NN_DIM_AUTO;
|
||||
attr.vtl = use_virtual_tensor;
|
||||
attr.is_const = FALSE;
|
||||
tensor1 = vsi_nn_internal_new_tensor(self, &attr, 0.0f);
|
||||
CHECK_PTR_FAIL_GOTO(tensor1, "Create internal tensor failed", final);
|
||||
|
||||
tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_PERMUTE, 0, 0 );
|
||||
tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_REDUCE_MEAN_INTERNAL, 0, 0 );
|
||||
CHECK_PTR_FAIL_GOTO(tmp_inode, "Create internal node failed", final);
|
||||
permute_in_perm = (uint32_t *)vsi_nn_internal_new_node_param(tmp_inode,
|
||||
inputs[0]->attr.dim_num * sizeof(uint32_t));
|
||||
CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(permute_in_perm, tmp_inode, "Create buffer failed", final);
|
||||
tmp_inode->inputs[0] = inputs[0];
|
||||
tmp_inode->outputs[0] = outputs[0];
|
||||
tmp_inode->node->nn_param.reduce_mean_internal.axis = axes;
|
||||
tmp_inode->node->nn_param.reduce_mean_internal.axis_num = axes_num;
|
||||
tmp_inode->node->nn_param.reduce_mean_internal.type = type_name;
|
||||
|
||||
for ( i = 0; i < axes_num; i++)
|
||||
for (i = 0; i < axes_num; i++)
|
||||
{
|
||||
shapes[index] = outputs[0]->attr.size[axes[i]];
|
||||
permute_in_perm[index ++] = axes[i];
|
||||
reduce_size *= inputs[0]->attr.size[axes[i]];
|
||||
}
|
||||
|
||||
for ( j = 0; j < (int32_t)inputs[0]->attr.dim_num; j++)
|
||||
{
|
||||
for (i = 0; i < axes_num; i++)
|
||||
{
|
||||
if (j == axes[i])
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (i == axes_num)
|
||||
{
|
||||
shapes[index] = outputs[0]->attr.size[j];
|
||||
permute_in_perm[index ++] = j;
|
||||
}
|
||||
}
|
||||
tmp_inode->node->nn_param.permute.perm = permute_in_perm;
|
||||
tmp_inode->node->nn_param.permute.dim_num = inputs[0]->attr.dim_num;
|
||||
tmp_inode->inputs[0] = inputs[0];
|
||||
tmp_inode->outputs[0] = tensor1->t;
|
||||
vsi_nn_internal_setup_node(self, tmp_inode);
|
||||
|
||||
new_output = vsi_nn_reshape_tensor(self->graph, outputs[0], shapes, outputs[0]->attr.dim_num);
|
||||
CHECK_PTR_FAIL_GOTO(new_output, "Create tensor failed", final);
|
||||
self->nn_param.reduce.local2->reshaped_output = new_output;
|
||||
|
||||
tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_REDUCE_MEAN_INTERNAL, 0, 0 );
|
||||
CHECK_PTR_FAIL_GOTO(tmp_inode, "Create internal node failed", final);
|
||||
new_axis = (int32_t *)vsi_nn_internal_new_node_param(tmp_inode,
|
||||
axes_num * sizeof(int32_t));
|
||||
CHECK_PTR_FAIL_GOTO_RLS_INTERNAL_NODE(new_axis, tmp_inode, "Create buffer failed", final);
|
||||
for (i = 0; i < axes_num; i++)
|
||||
{
|
||||
new_axis[i] = i;
|
||||
}
|
||||
tmp_inode->inputs[0] = tensor1->t;
|
||||
tmp_inode->outputs[0] = new_output;
|
||||
tmp_inode->node->nn_param.reduce_mean_internal.axis = new_axis;
|
||||
tmp_inode->node->nn_param.reduce_mean_internal.axis_num = axes_num;
|
||||
tmp_inode->node->nn_param.reduce_mean_internal.type = type_name;
|
||||
if (type_name == VSI_NN_REDUCE_SUM)
|
||||
{
|
||||
tmp_inode->node->nn_param.reduce_mean_internal.scale = 1.0f;
|
||||
|
|
@ -1147,6 +1078,7 @@ static vsi_bool op_set_reduce_internal
|
|||
re_sizes[axes[2]] = 1;
|
||||
new_output = vsi_nn_reshape_tensor(self->graph, outputs[0], re_sizes, dim_num);
|
||||
}
|
||||
self->nn_param.reduce.local2->reshaped_output = new_output;
|
||||
|
||||
curr = vsi_nn_internal_new_node( self, type_name, 0, 0 );
|
||||
CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
|
||||
|
|
@ -1161,7 +1093,6 @@ static vsi_bool op_set_reduce_internal
|
|||
curr->inputs[0] = tmp_output_tensor[1]->t;
|
||||
}
|
||||
curr->outputs[0] = new_output;
|
||||
self->nn_param.reduce.local2->reshaped_output = new_output;
|
||||
vsi_nn_internal_setup_node(self, curr);
|
||||
}
|
||||
else
|
||||
|
|
|
|||
|
|
@ -136,7 +136,7 @@ static vsi_bool op_setup
|
|||
attr.dtype.vx_type = VSI_NN_TYPE_FLOAT32;
|
||||
|
||||
reducesum_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
|
||||
CHECK_PTR_FAIL_GOTO(square_tensor, "Create internal tensor failed", final);
|
||||
CHECK_PTR_FAIL_GOTO(reducesum_tensor, "Create internal tensor failed", final);
|
||||
reducesum_node = vsi_nn_internal_new_node( self, VSI_NN_OP_REDUCE, 0, 0);
|
||||
CHECK_PTR_FAIL_GOTO(reducesum_node, "Create internal node failed", final);
|
||||
|
||||
|
|
|
|||
|
|
@ -37,6 +37,7 @@
|
|||
#include "vsi_nn_error.h"
|
||||
#include "vsi_nn_internal_node.h"
|
||||
#include "utils/vsi_nn_util.h"
|
||||
#include "utils/vsi_nn_dtype_util_prv.h"
|
||||
|
||||
|
||||
static vsi_status op_compute
|
||||
|
|
@ -88,7 +89,6 @@ static vsi_bool op_setup
|
|||
float alpha = 0;
|
||||
float max_value = 0;
|
||||
float threshold = 0;
|
||||
uint32_t max_raw = 0;
|
||||
vsi_bool ret = FALSE;
|
||||
|
||||
if ( NULL == self )
|
||||
|
|
@ -101,11 +101,9 @@ static vsi_bool op_setup
|
|||
max_value = p->max_value;
|
||||
threshold = p->threshold;
|
||||
|
||||
max_raw = *(uint32_t*)&max_value;
|
||||
|
||||
vsi_nn_internal_init_node_wksp(self);
|
||||
|
||||
if (alpha == 0 && max_raw == VSI_NN_FLOAT32_INF && threshold == 0)
|
||||
if (alpha == 0.0f && fp32_is_inf(max_value) && threshold == 0.0f)
|
||||
{
|
||||
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RELU, 0, 0);
|
||||
CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
|
||||
|
|
@ -119,14 +117,14 @@ static vsi_bool op_setup
|
|||
curr->inputs[0] = inputs[0];
|
||||
curr->outputs[0] = outputs[0];
|
||||
}
|
||||
else if (alpha == 0 && max_value == 6.0f && threshold == 0)
|
||||
else if (alpha == 0.0f && max_value == 6.0f && threshold == 0.0f)
|
||||
{
|
||||
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RELU6, 0, 0);
|
||||
CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
|
||||
curr->inputs[0] = inputs[0];
|
||||
curr->outputs[0] = outputs[0];
|
||||
}
|
||||
else if (alpha == 0.1 && max_value == VSI_NN_FLOAT32_INF && threshold == 0)
|
||||
else if (alpha == 0.1f && max_value == VSI_NN_FLOAT32_INF && threshold == 0.0f)
|
||||
{
|
||||
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_LEAKY_RELU, 0, 0);
|
||||
CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
|
||||
|
|
|
|||
|
|
@ -70,7 +70,11 @@ static vsi_status op_compute
self->graph,
(uint8_t *)self->nn_param.reshape.size,
&attr);

if (NULL == dims_tensor)
{
VSILOGE( "Create tensor fail." );
return VSI_FAILURE;
}
reshape_param.dims = REQUIRED_IO(dims_tensor);

self->n = vxTensorReshapeNode(self->graph->g,

@ -75,6 +75,11 @@ static vsi_status op_compute
self->graph,
(uint8_t *)dims_data,
&attr);
if (NULL == dims_tensor)
{
VSILOGE( "Create tensor fail." );
return VSI_FAILURE;
}

reshape_param.dims = REQUIRED_IO(dims_tensor);

@ -52,6 +52,7 @@ static vsi_status op_compute
vsi_status status = VSI_FAILURE;
int32_t align_corners = self->nn_param.resize_internal.align_corners;
int32_t half_pixel_centers = self->nn_param.resize_internal.half_pixel_centers;
int32_t type = self->nn_param.resize_internal.type;
vsi_enum layout = self->nn_param.resize_internal.layout;
vsi_nn_kernel_param_t * param = NULL;

@ -59,6 +60,7 @@ static vsi_status op_compute

vsi_nn_kernel_param_add_int32( param, "align_corners", align_corners );
vsi_nn_kernel_param_add_int32( param, "half_pixel_centers", half_pixel_centers );
vsi_nn_kernel_param_add_int32( param, "type", type );

if (layout == VSI_NN_RESIZE_LAYOUT_NCHW)
{

@ -186,6 +188,7 @@ static vsi_status op_init
vsi_status status = VSI_SUCCESS;

self->nn_param.resize_internal.layout = VSI_NN_RESIZE_LAYOUT_NCHW;
self->nn_param.resize_internal.type = VSI_NN_INTERPOLATION_BILINEAR;

return status;
} /* op_init() */
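With the new type field, resize_internal forwards the interpolation mode to the kernel alongside align_corners and half_pixel_centers, and op_init defaults it to bilinear. A hedged sketch of how a caller might select a different mode when building a graph; the op enum name, the VSI_NN_INTERPOLATION_* value and the tensor ids in_id/out_id are assumptions for illustration, not confirmed by this diff:

    /* Hypothetical graph-construction snippet (graph, in_id, out_id assumed to exist). */
    vsi_nn_node_t* node = vsi_nn_AddNode(graph, VSI_NN_OP_RESIZE_INTERNAL, 0, 0, NULL);
    if (node != NULL)
    {
        node->nn_param.resize_internal.align_corners      = FALSE;
        node->nn_param.resize_internal.half_pixel_centers = TRUE;
        node->nn_param.resize_internal.factor             = 2.0f;
        node->nn_param.resize_internal.layout             = VSI_NN_RESIZE_LAYOUT_NCHW;
        /* Assumed enum name; op_init defaults to VSI_NN_INTERPOLATION_BILINEAR. */
        node->nn_param.resize_internal.type = VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR;
        node->input.tensors[0]  = in_id;
        node->output.tensors[0] = out_id;
    }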
@ -0,0 +1,202 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/

#include <string.h>
#include <stdlib.h>

#include "vsi_nn_types.h"
#include "vsi_nn_platform.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_node.h"
#include "utils/vsi_nn_math.h"
#include "vsi_nn_ops.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_tensor_util.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_log.h"
#include "vsi_nn_tensor_util_prv.h"
#include "kernel/vsi_nn_kernel.h"
#include "utils/vsi_nn_util.h"
#include "utils/vsi_nn_constraint_check.h"
#include "vsi_nn_error.h"

/*
Declare number of input and output.
*/
#define _INPUT_NUM (2)
#define _OUTPUT_NUM (1)

static vsi_status op_compute
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_status status = VSI_FAILURE;
vsi_nn_kernel_param_t* param = NULL;
vsi_nn_kernel_node_t n = NULL;
float eps = self->nn_param.rmsnorm.eps;
int32_t axis = self->nn_param.rmsnorm.axis;

param = vsi_nn_kernel_param_create();

vsi_nn_kernel_param_add_float32(param, "eps", eps);
vsi_nn_kernel_param_add_int32(param, "axis", axis);
n = vsi_nn_kernel_selector(self->graph, "rms_norm",
inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param);
if (n != NULL)
{
self->n = (vx_node)n;
status = VSI_SUCCESS;
}

if (param != NULL)
{
vsi_nn_kernel_param_release(&param);
}

return status;
} /* op_compute() */

static vsi_bool op_check
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_bool ret = vsi_nn_is_stream_process_supported_types(self->graph, inputs, self->input.num);

if (!ret)
{
BEGIN_IO_TYPE_DECL(RMS_NORM, 2, 1)
IO_TYPE(D_F32, D_F32, D_F32)
IO_TYPE(D_F16, D_F32, D_F16)
IO_TYPE(D_F16, D_F32, D_F16)
IO_TYPE(D_F16, D_F32, D_U8 | Q_ASYM)
IO_TYPE(D_F16, D_F32, D_U8 | Q_ASYM)
IO_TYPE(D_F16, D_F32, D_I8 | Q_DFP)
IO_TYPE(D_F16, D_F32, D_I8 | Q_DFP)
IO_TYPE(D_F16, D_F32, D_I8 | Q_ASYM)
IO_TYPE(D_F16, D_F32, D_I8 | Q_ASYM)
IO_TYPE(D_F16, D_F32, D_I8 | Q_SYM)
IO_TYPE(D_F16, D_F32, D_I8 | Q_SYM)
IO_TYPE(D_F16, D_F32, D_I16 | Q_DFP)
IO_TYPE(D_F16, D_F32, D_I16 | Q_DFP)
IO_TYPE(D_F16, D_F32, D_I16 | Q_ASYM)
IO_TYPE(D_F16, D_F32, D_I16 | Q_ASYM)
IO_TYPE(D_F16, D_F32, D_I16 | Q_SYM)
IO_TYPE(D_F16, D_F32, D_I16 | Q_SYM)
IO_TYPE(D_BF16, D_F32, D_BF16)
IO_TYPE(D_U8 | Q_ASYM, D_F32, D_F16)
IO_TYPE(D_U8 | Q_ASYM, D_F32, D_U8 | Q_ASYM)
IO_TYPE(D_I16 | Q_DFP, D_F32, D_I16 | Q_DFP)
IO_TYPE(D_I16 | Q_ASYM, D_F32, D_I16 | Q_ASYM)
IO_TYPE(D_I16 | Q_SYM, D_F32, D_I16 | Q_SYM)
IO_TYPE(D_I16 | Q_DFP, D_F32, D_F16)
IO_TYPE(D_I16 | Q_ASYM, D_F32, D_F16)
IO_TYPE(D_I16 | Q_SYM, D_F32, D_F16)
IO_TYPE(D_I8 | Q_DFP, D_F32, D_I8 | Q_DFP)
IO_TYPE(D_I8 | Q_ASYM, D_F32, D_I8 | Q_ASYM)
IO_TYPE(D_I8 | Q_SYM, D_F32, D_I8 | Q_SYM)
IO_TYPE(D_I8 | Q_DFP, D_F32, D_F16)
IO_TYPE(D_I8 | Q_ASYM, D_F32, D_F16)
IO_TYPE(D_I8 | Q_SYM, D_F32, D_F16)
IO_TYPE(D_U8 | Q_ASYM, D_F32, D_U8 | Q_ASYM)
IO_TYPE(D_U8 | Q_ASYM, D_F32, D_F16)
IO_TYPE(D_I16 | Q_DFP, D_F32, D_I16 | Q_DFP)
IO_TYPE(D_I16 | Q_ASYM, D_F32, D_I16 | Q_ASYM)
IO_TYPE(D_I16 | Q_SYM, D_F32, D_I16 | Q_SYM)
IO_TYPE(D_I16 | Q_DFP, D_F32, D_F16)
IO_TYPE(D_I16 | Q_ASYM, D_F32, D_F16)
IO_TYPE(D_I16 | Q_SYM, D_F32, D_F16)
IO_TYPE(D_I8 | Q_DFP, D_F32, D_I8 | Q_DFP)
IO_TYPE(D_I8 | Q_ASYM, D_F32, D_I8 | Q_ASYM)
IO_TYPE(D_I8 | Q_SYM, D_F32, D_I8 | Q_SYM)
IO_TYPE(D_I8 | Q_DFP, D_F32, D_F16)
IO_TYPE(D_I8 | Q_ASYM, D_F32, D_F16)
IO_TYPE(D_I8 | Q_SYM, D_F32, D_F16)
END_IO_TYPE_DECL(RMS_NORM)
if (!VALIDATE_OP_IO_TYPES(RMS_NORM, self, inputs, self->input.num, outputs, self->output.num))
{
char* desc = generate_op_io_types_desc(inputs,
self->input.num, outputs, self->output.num);
VSILOGE("Inputs/Outputs data type not support: %s", desc);
destroy_op_io_types_desc(desc);
return FALSE;
}
}

return TRUE;
} /* op_check() */

static vsi_bool op_setup
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_bool ret = TRUE;

if (NULL == self)
{
return FALSE;
}

ret = vsi_nn_op_common_setup(self, inputs, outputs);

return ret;
} /* op_setup() */

static vsi_status op_init
(
vsi_nn_node_t* self
)
{
self->nn_param.rmsnorm.axis = 0;
self->nn_param.rmsnorm.eps = 1e-8f;
return VSI_SUCCESS;
} /* op_init() */

__BEGIN_DECLS

/* Registrar */
DEF_OP_REG
(
/* op_name */ RMSNORM,
/* init */ op_init,
/* compute */ op_compute,
/* deinit */ vsi_nn_op_common_deinit,
/* check */ op_check,
/* setup */ op_setup,
/* optimize */ NULL,
/* input_num */ _INPUT_NUM,
/* output_num */ _OUTPUT_NUM
);

__END_DECLS
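The new RMSNORM op takes two inputs (the data tensor and a float32 scale tensor, per the IO table in op_check) and dispatches to the "rms_norm" kernel with eps and axis parameters. A minimal sketch of wiring it into a graph; the tensor ids in_id, gamma_id and out_id are hypothetical and assumed to have been created elsewhere with matching shapes:

    vsi_nn_node_t* node = vsi_nn_AddNode(graph, VSI_NN_OP_RMSNORM, 0, 0, NULL);
    if (node != NULL)
    {
        node->nn_param.rmsnorm.eps  = 1e-6f; /* op_init defaults to 1e-8f */
        node->nn_param.rmsnorm.axis = 0;     /* normalize along the innermost dimension */
        node->input.tensors[0]  = in_id;     /* data */
        node->input.tensors[1]  = gamma_id;  /* per-channel scale, float32 */
        node->output.tensors[0] = out_id;
    }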
@ -0,0 +1,196 @@
/****************************************************************************
*
* Copyright (c) 2020 Vivante Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
*****************************************************************************/

#include <string.h>
#include <stdlib.h>

#include "vsi_nn_types.h"
#include "vsi_nn_platform.h"
#include "vsi_nn_prv.h"
#include "vsi_nn_log.h"
#include "vsi_nn_graph.h"
#include "vsi_nn_node.h"
#include "vsi_nn_ops.h"
#include "vsi_nn_tensor.h"
#include "vsi_nn_tensor_util.h"
#include "vsi_nn_test.h"
#include "utils/vsi_nn_util.h"
#include "utils/vsi_nn_dtype_util.h"

typedef struct _shape_local_data_t {
vsi_nn_tensor_t *shape_tensor;
} shape_local_data_t;

/*
Declare number of input and output.
*/
#define _INPUT_NUM (1)
#define _OUTPUT_NUM (1)

static vsi_status op_compute
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
VSI_UNREFERENCED(inputs);
VSI_UNREFERENCED(outputs);
return vsi_nn_internal_compute_node( self );
} /* op_compute() */

static vsi_bool op_check
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
VSI_UNREFERENCED(self);
VSI_UNREFERENCED(inputs);

if (outputs[0]->attr.dtype.vx_type != VSI_NN_TYPE_INT32)
{
VSILOGD("Outputs data type not support");

return FALSE;
}

return TRUE;
} /* op_check() */

static vsi_status op_optimize
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs,
vsi_nn_opt_direction_e direction
)
{
VSI_UNREFERENCED(inputs);
VSI_UNREFERENCED(outputs);
return vsi_nn_internal_optimize_node( self, direction );
} /* op_optimize() */

static vsi_bool op_setup
(
vsi_nn_node_t * self,
vsi_nn_tensor_t ** inputs,
vsi_nn_tensor_t ** outputs
)
{
vsi_bool ret = TRUE;
vsi_nn_internal_node_t* curr = NULL;
vsi_nn_tensor_attr_t attr;
vx_int32 shapes[VSI_NN_MAX_DIM_NUM] = {0};
uint32_t i = 0;

for ( i = 0; i < inputs[0]->attr.dim_num; i++ )
{
shapes[i] = (int32_t)inputs[0]->attr.size[i];
}

memset(&attr, 0, sizeof(attr));
attr.size[0] = inputs[0]->attr.dim_num;
attr.dim_num = 1;
attr.is_const = TRUE;
attr.dtype.vx_type = VSI_NN_TYPE_INT32;
attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE;
self->nn_param.shape.local->shape_tensor = vsi_nn_CreateTensorFromData(
self->graph,
(uint8_t *)shapes,
&attr);
if ( NULL == self->nn_param.shape.local->shape_tensor )
{
VSILOGE("Create shape_tensor fail.(shape)");
goto final;
}

vsi_nn_internal_init_node_wksp(self);
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 1, 1);
if (NULL == curr)
{
return ret;
}
curr->inputs[0] = self->nn_param.shape.local->shape_tensor;
curr->outputs[0] = outputs[0];

ret = vsi_nn_internal_setup_node(self, curr);

final:

return ret;
} /* op_setup() */

static vsi_status op_init
(
vsi_nn_node_t* self
)
{
self->nn_param.shape.local =
(shape_local_data_t *)malloc(sizeof(shape_local_data_t));
if (NULL == self->nn_param.shape.local)
{
return VSI_FAILURE;
}
memset( self->nn_param.shape.local, 0, sizeof(shape_local_data_t) );

return VSI_SUCCESS;
} /* op_init() */

static vsi_status op_deinit
(
vsi_nn_node_t * self
)
{
if (self->nn_param.shape.local)
{
vsi_safe_release_tensor(self->nn_param.shape.local->shape_tensor);
}
vsi_nn_safe_free(self->nn_param.shape.local);
vsi_nn_internal_deinit_node_wksp(self);
vsi_nn_op_common_deinit(self);

return VSI_SUCCESS;
} /* op_deinit() */

__BEGIN_DECLS

/* Registrar */
DEF_OP_REG
(
/* op_name */ SHAPE,
/* init */ op_init,
/* compute */ op_compute,
/* deinit */ op_deinit,
/* check */ op_check,
/* setup */ op_setup,
/* optimize */ op_optimize,
/* input_num */ _INPUT_NUM,
/* output_num */ _OUTPUT_NUM
);

__END_DECLS
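op_setup for the new SHAPE op snapshots the dimensions of inputs[0] into a 1-D const INT32 tensor and emits an internal DATACONVERT node that copies it to the output, so the result is the rank-length shape vector. A minimal usage sketch; in_id and shape_id are hypothetical tensor ids, and the output tensor must be INT32 (op_check rejects anything else):

    vsi_nn_node_t* node = vsi_nn_AddNode(graph, VSI_NN_OP_SHAPE, 0, 0, NULL);
    if (node != NULL)
    {
        node->input.tensors[0]  = in_id;
        node->output.tensors[0] = shape_id; /* 1-D, length == rank of in_id, VSI_NN_TYPE_INT32 */
    }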
@ -735,6 +735,15 @@ static vsi_bool op_setup

outputs[0]->attr.dim_num++;
}

/*output dim_num is 0, the tensor should be scalar!*/
if (outputs[0]->attr.dim_num == 0)
{
outputs[0]->attr.dim_num = 1;
outputs[0]->attr.size[0] = 1;

vsi_nn_SetTensorIsScalar(outputs[0], TRUE);
}
}

_get_stride_slice_start_stop_stride(self, inputs, outputs);

@ -124,9 +124,6 @@ static vsi_status op_compute
outputs[0]->attr.size, outputs[0]->attr.dim_num, axis,
shapes[1], &rank_out, &new_axis1);

param = vsi_nn_kernel_param_create();
vsi_nn_kernel_param_add_int32( param, "top_k", top_k );

if (ret)
{
uint32_t perm_in[VSI_NN_MAX_DIM_NUM] = {0};

@ -195,10 +192,14 @@ static vsi_status op_compute
outputs_tensor[1] = reshape_tensors[2];
}

param = vsi_nn_kernel_param_create();
vsi_nn_kernel_param_add_int32( param, "top_k", top_k );

self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "topk",
&input_tensor, _INPUT_NUM,
outputs_tensor, _OUTPUT_NUM, param );

vsi_nn_kernel_param_release( &param );
if (axis != 0)
{
_create_permute_node(self, outputs_tensor[0], reshape_tensors[1], perm_out, rank_in, TRUE);
@ -25,6 +25,7 @@

#include <string.h>
#include <stdlib.h>
#include <float.h>

#include "vsi_nn_types.h"
#include "vsi_nn_log.h"

@ -47,8 +48,6 @@ typedef struct _upsamplescale_local_data_t {
#define _INPUT_NUM (1)
#define _OUTPUT_NUM (1)

#define _EPSILON 1e-8

static vsi_status op_compute
(
vsi_nn_node_t * self,

@ -69,7 +68,7 @@ static vsi_status op_compute
stride = self->nn_param.upsamplescale.stride;
scale = self->nn_param.upsamplescale.scale;

if (stride == 1 || vsi_nn_abs(scale - 1.0f) == _EPSILON)
if (stride == 1 || vsi_nn_abs(scale - 1.0f) < FLT_EPSILON)
{
return vsi_nn_internal_compute_node( self );
}

@ -148,7 +147,7 @@ static vsi_status op_optimize
VSI_UNREFERENCED(inputs);
VSI_UNREFERENCED(outputs);

if (stride == 1 && vsi_nn_abs(scale - 1.0f) == _EPSILON)
if (stride == 1 && vsi_nn_abs(scale - 1.0f) < FLT_EPSILON)
{
return vsi_nn_internal_optimize_node( self, direction );
}

@ -174,7 +173,7 @@ static vsi_bool op_setup

vsi_nn_internal_init_node_wksp(self);

if (stride == 1 && vsi_nn_abs(scale - 1.0f) == _EPSILON)
if (stride == 1 && vsi_nn_abs(scale - 1.0f) < FLT_EPSILON)
{
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_DATACONVERT, 0, 0);
CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);

@ -194,7 +193,7 @@ static vsi_bool op_setup

ret = vsi_nn_internal_setup_node(self, curr);
}
else if (vsi_nn_abs(scale - 1.0f) == _EPSILON)
else if (vsi_nn_abs(scale - 1.0f) < FLT_EPSILON)
{
curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RESIZE, 0, 0);
CHECK_PTR_FAIL_GOTO(curr, "Create internal node failed", final);
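The upsamplescale hunks replace vsi_nn_abs(scale - 1.0f) == _EPSILON with a < FLT_EPSILON comparison: equality against the old 1e-8 constant is essentially never true (for scale == 1.0f the difference is exactly 0.0f, not 1e-8), so the pass-through branches could not trigger. A small sketch of the corrected test, illustrative only:

    #include <float.h>
    #include <math.h>

    /* True when scale is within one float epsilon of 1.0f, i.e. upsamplescale
     * degenerates to a plain data conversion (stride == 1) or a resize. */
    static int scale_is_identity(float scale)
    {
        return fabsf(scale - 1.0f) < FLT_EPSILON;
    }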
@ -469,6 +469,8 @@ static _op_param_gen_t s_op_gen[] =
/* REDUCEL2 */ NULL,
/* CROP_AND_RESIZE */ NULL,
/* TAN */ NULL,
/* RMSNORM */ NULL,
/* SHAPE */ NULL,
};
_compiler_assert( _cnt_of_array(s_op_gen) == VSI_NN_OP_NUM, vsi_nn_code_generator_c );

@ -46,6 +46,11 @@
#include "utils/vsi_nn_math.h"
#include "utils/vsi_nn_util.h"
#include "utils/vsi_nn_dtype_util.h"
#include "utils/vsi_nn_dtype_util_prv.h"

#if (defined(__ANDROID__)) && (__ANDROID_API__ > 21)
#include <sys/system_properties.h>
#endif

typedef struct _vx_status_desc_t
{

@ -387,10 +392,11 @@ float vsi_nn_DataAsFloat32
)
{
float val;
uint32_t *p = (uint32_t*)(&val);
fp32_bit_cast_t fp32_bit_cast;
int16_t fp16;

*p = 0xFFFFFFFF;
fp32_bit_cast.data = 0xFFFFFFFF;
val = fp32_bit_cast.val;
switch( type )
{
case VSI_NN_TYPE_BOOL8:

@ -1462,11 +1468,15 @@ void vsi_nn_get_tensor_clamp_min_max
}
else
{
uint32_t f32_min = 0xff800000;
uint32_t f32_max = 0x7f800000;
fp32_bit_cast_t fp32_bit_cast;
float pos_infinity;
float neg_infinity;
fp32_bit_cast.data = VSI_NN_FLOAT32_INF;
pos_infinity = fp32_bit_cast.val;
neg_infinity = -pos_infinity;

*clampMin = *(float*)&f32_min;
*clampMax = *(float*)&f32_max;
*clampMin = neg_infinity;
*clampMax = pos_infinity;
}
}
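The vsi_nn_DataAsFloat32 and vsi_nn_get_tensor_clamp_min_max hunks route float/bit conversions through fp32_bit_cast_t instead of dereferencing casted pointers, avoiding strict-aliasing undefined behaviour. A sketch of the presumed shape of that helper and of how the clamp code builds plus/minus infinity from VSI_NN_FLOAT32_INF (0x7F800000); the union definition here is illustrative, not the library's actual one:

    #include <stdint.h>

    typedef union
    {
        uint32_t data; /* raw IEEE-754 bit pattern */
        float    val;  /* the float it encodes */
    } example_fp32_bit_cast_t;

    static float example_float32_inf(void)
    {
        example_fp32_bit_cast_t cast;
        cast.data = 0x7F800000u; /* same value as VSI_NN_FLOAT32_INF */
        return cast.val;         /* +inf; negate it for the lower clamp bound */
    }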
@ -93,7 +93,7 @@ final:
return status;
}

#if (defined(__ANDROID__)) && (ANDROID_SDK_VERSION >= 30)
#if (defined(__ANDROID__)) && ((ANDROID_SDK_VERSION >= 30) || (__ANDROID_API__ >= 30))
static const char* ENV_ENABLE_SHADER = "vendor.VIV_VX_ENABLE_SHADER";
static const char* ENV_ENABLE_OPCHECK = "vendor.VSI_NN_ENABLE_OPCHECK";
static const char* ENV_ENABLE_CONCAT_OPTIMIZE = "vendor.VSI_NN_ENABLE_CONCAT_OPTIMIZE";

@ -799,6 +799,7 @@ static vsi_status batchInference_graph
original_inputs_attr = (vsi_nn_tensor_attr_t*)malloc(sizeof(vsi_nn_tensor_attr_t) * graph->max_node_io);
original_outputs_attr = (vsi_nn_tensor_attr_t*)malloc(sizeof(vsi_nn_tensor_attr_t) * graph->max_node_io);
approximateConstTensor = (vsi_nn_tensor_id_t*)malloc(sizeof(vsi_nn_tensor_id_t) * graph->tensor_num);
CHECK_PTR_FAIL_GOTO(approximateConstTensor, "Malloc fail.", final);
memset(approximateConstTensor, -1, sizeof(vsi_nn_tensor_id_t) * graph->tensor_num);

if (NULL == inputs || NULL == outputs || NULL == original_inputs_attr || NULL == original_outputs_attr)

@ -878,6 +879,7 @@ static vsi_status batchInference_graph
vsi_size_t iterator_list_index = 0;
vsi_size_t list_index = 0;
vsi_size_t* iterator_list = (vsi_size_t*)malloc(sizeof(vsi_size_t) * (batchNum + 1));
CHECK_PTR_FAIL_GOTO(iterator_list, "Malloc fail.", final);
memset(iterator_list, 0, sizeof(uint32_t) * (batchNum + 1));

if (((vsi_nn_node_prv_t*)node)->split_num > 0)

@ -885,6 +887,7 @@ static vsi_status batchInference_graph
iterator_list[iterator_list_index++] = ((vsi_nn_node_prv_t*)node)->split_num;
if (((vsi_nn_node_prv_t*)node)->split_num == 1)
{/*if user set split_num = 1, there is no need to batch split.*/
vsi_nn_safe_free(iterator_list);
continue;
}
}

@ -1015,6 +1018,7 @@ static vsi_status batchInference_graph
}
}

vsi_nn_safe_free(iterator_list);
/*restore node input batch number*/
num_of_node_inputs = node->input.num;
for (k = 0; k < num_of_node_inputs; k++)

@ -1053,7 +1057,7 @@ static vsi_status batchInference_graph
}
}

final:
final:
for (i = 0; i < graph->node_num; i++)
{
node_id = nodes_list[i];

@ -1067,7 +1071,7 @@ static vsi_status batchInference_graph
node->input.num, inputs);
vsi_nn_GetTensors(graph, node->output.tensors,
node->output.num, outputs);
for (j = 0; j < node->output.num; j++)
for (j = 0; outputs && j < node->output.num; j++)
{
if (outputs[j] == NULL)
{

@ -716,6 +716,12 @@ vsi_status vsi_nn_internal_optimize_node
for ( i = n - 1; i >= 0; i-- )
{
curr = (vsi_nn_internal_node_t *)vsi_nn_LinkListGetIndexNode((vsi_nn_link_list_t *)WKSP(node), i);
if ( NULL == curr )
{
VSILOGE("get point fail");
status = VSI_FAILURE;
break;
}
VSILOGD("Optimize backward for node uid[%u] sub_uid[%u] op[%s]",
node->uid, curr->node->uid, vsi_nn_OpGetName(curr->node->op));

@ -29,7 +29,7 @@
#include "vsi_nn_log.h"
#include "vsi_nn_types.h"

#if (defined(__ANDROID__)) && (ANDROID_SDK_VERSION >= 30)
#if (defined(__ANDROID__)) && ((ANDROID_SDK_VERSION >= 30) || (__ANDROID_API__ >= 30))
static const char* ENV_LOG_LEVEL = "vendor.VSI_NN_LOG_LEVEL";
#else
static const char* ENV_LOG_LEVEL = "VSI_NN_LOG_LEVEL";

@ -1013,6 +1013,7 @@ vsi_status vsi_nn_AddBinaryGraphInputsWithCropParamForCropOnly
sizeof(numParams));
if (VSI_SUCCESS != status)
{
vsi_nn_safe_free(nodes);
goto final;
}
for (p = 0; p < numParams; p++)

@ -3258,6 +3258,7 @@ static vsi_bool _init_dummy_tensor
#endif
// This is a hack that driver doesn't support const scales
scales = (float*)malloc(sizeof(float) * tensor->attr.dtype.scale_dim);
CHECK_PTR_FAIL_GOTO( scales, "Create buffer fail.", final );
memcpy(scales,
tensor->attr.dtype.scales,
tensor->attr.dtype.scale_dim * sizeof(float));