diff --git a/src/tim/vx/internal/include/interface/ops.def b/src/tim/vx/internal/include/interface/ops.def index f511b00..0590aad 100644 --- a/src/tim/vx/internal/include/interface/ops.def +++ b/src/tim/vx/internal/include/interface/ops.def @@ -157,4 +157,9 @@ DEF_OP(ONE_HOT) DEF_OP(NMS) DEF_OP(GROUPED_CONV1D) DEF_OP(SCATTER_ND_UPDATE) -DEF_OP(GELU) \ No newline at end of file +DEF_OP(GELU) +DEF_OP(CONV2D_LSTM) +DEF_OP(CONV2D_LSTM_CELL) +DEF_OP(GRU) +DEF_OP(GRUCELL) +DEF_OP(GRUCELL_ACTIVATION) diff --git a/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h b/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h index 9d89a4a..73cfcd7 100644 --- a/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h +++ b/src/tim/vx/internal/include/kernel/vsi_nn_kernel.h @@ -145,7 +145,7 @@ typedef struct typedef struct { vsi_nn_kernel_dtype_e dtype; - vsi_int_array_t * shape; + vsi_size_array_t * shape; vsi_nn_kernel_quant_type_e quant; union { @@ -395,8 +395,8 @@ void vsi_nn_kernel_tensor_release vsi_nn_kernel_tensor_t vsi_nn_kernel_tensor_reshape ( vsi_nn_kernel_tensor_t tensor, - int32_t * shape, - uint32_t rank + vsi_size_t * shape, + vsi_size_t rank ); vsi_status vsi_nn_kernel_node_pass_param @@ -670,7 +670,7 @@ vsi_status vsi_nn_kernel_register ); vsi_bool vsi_nn_kernel_gpu_check_shape - ( const int32_t * shape, size_t rank ); + ( const vsi_size_t * shape, vsi_size_t rank ); vsi_status vsi_nn_kernel_gpu_add_param ( @@ -738,38 +738,38 @@ vsi_status vsi_nn_kernel_tensor_write size_t size ); -static inline size_t vsi_nn_kernel_tensor_attr_get_size +static inline vsi_size_t vsi_nn_kernel_tensor_attr_get_size ( const vsi_nn_kernel_tensor_attr_t * attr ) { if( !attr ) { return 0; } - return vsi_nn_shape_get_size( attr->shape->data, attr->shape->size ); + return vsi_nn_shape_get_size( attr->shape->data, (vsi_size_t)attr->shape->size ); } /* vsi_nn_kernel_tensor_attr_get_size() */ -static inline size_t vsi_nn_kernel_tensor_attr_get_bytes +static inline vsi_size_t vsi_nn_kernel_tensor_attr_get_bytes ( const vsi_nn_kernel_tensor_attr_t * attr ) { - size_t size; - size_t type_bytes; + vsi_size_t size; + vsi_size_t type_bytes; if( !attr ) { return 0; } size = vsi_nn_kernel_tensor_attr_get_size( attr ); - type_bytes = vsi_nn_kernel_dtype_get_bytes( attr->dtype ); + type_bytes = (vsi_size_t)vsi_nn_kernel_dtype_get_bytes( attr->dtype ); return size * type_bytes; } /* vsi_nn_kernel_tensor_attr_get_bytes() */ static inline void vsi_nn_kernel_tensor_attr_get_stride - ( const vsi_nn_kernel_tensor_attr_t * attr, size_t * out_stride) + ( const vsi_nn_kernel_tensor_attr_t * attr, vsi_size_t * out_stride) { if( !attr || !out_stride ) { return; } - vsi_nn_shape_get_stride( attr->shape->data, attr->shape->size, out_stride ); + vsi_nn_shape_get_stride( attr->shape->data, (vsi_size_t)attr->shape->size, out_stride ); } /* vsi_nn_kernel_tensor_attr_get_size() */ static inline vsi_bool vsi_nn_kernel_tensor_attr_is_quantized @@ -819,7 +819,7 @@ vsi_bool vsi_nn_dtype_convert_float_to_quantize_symm_perchannel ( const float * buffer, size_t size, vsi_nn_kernel_dtype_e dtype, - const int32_t * shape, size_t rank, + const vsi_size_t * shape, size_t rank, const float * scale, size_t scale_size, const int32_t * zero_point, size_t zero_point_size, int32_t channel_dim, @@ -862,7 +862,7 @@ vsi_bool vsi_nn_dtype_convert_quantize_symm_perchannel_to_float ( const void * buffer, size_t size, vsi_nn_kernel_dtype_e dtype, - const int32_t * shape, size_t rank, + const vsi_size_t * shape, size_t rank, const float * scale, size_t scale_size, const 
int32_t * zero_point, size_t zero_point_size, int32_t channel_dim, @@ -873,9 +873,9 @@ vsi_nn_tensor_t* vsi_nn_pad_tensor ( vsi_nn_graph_t * graph, vsi_nn_tensor_t * input, - int32_t * pad_front, - int32_t * pad_end, - size_t pad_size, + vsi_size_t * pad_front, + vsi_size_t * pad_end, + vsi_size_t pad_size, vsi_nn_pad_mode_e mode, float pad_value ); diff --git a/src/tim/vx/internal/include/kernel/vsi_nn_kernel_eltwise.h b/src/tim/vx/internal/include/kernel/vsi_nn_kernel_eltwise.h index fee8075..892502d 100644 --- a/src/tim/vx/internal/include/kernel/vsi_nn_kernel_eltwise.h +++ b/src/tim/vx/internal/include/kernel/vsi_nn_kernel_eltwise.h @@ -30,20 +30,20 @@ vsi_bool vsi_nn_kernel_optimize_eltwise_shape ( - const int32_t* shape_x, const size_t rank_x, - const int32_t* shape_y, const size_t rank_y, - const int32_t* shape_output, const size_t rank_output, - int32_t* out_shape_x, int32_t* out_shape_y, - int32_t* out_shape_output, uint32_t* out_rank_output + const vsi_size_t* shape_x, const vsi_size_t rank_x, + const vsi_size_t* shape_y, const vsi_size_t rank_y, + const vsi_size_t* shape_output, const vsi_size_t rank_output, + vsi_size_t* out_shape_x, vsi_size_t* out_shape_y, + vsi_size_t* out_shape_output, vsi_size_t* out_rank_output ); vsi_bool vsi_nn_kernel_optimize_broadcast_shape ( - const int32_t** shape_in, const size_t* rank_in, + const vsi_size_t** shape_in, const vsi_size_t* rank_in, const int32_t input_num, - const int32_t* shape_output, const size_t rank_output, - int32_t** out_shape_in, - int32_t* out_shape_output, uint32_t* out_rank_output + const vsi_size_t* shape_output, const vsi_size_t rank_output, + vsi_size_t** out_shape_in, + vsi_size_t* out_shape_output, uint32_t* out_rank_output ); #endif diff --git a/src/tim/vx/internal/include/kernel/vsi_nn_kernel_gpu_shape_optimize.h b/src/tim/vx/internal/include/kernel/vsi_nn_kernel_gpu_shape_optimize.h index 0b65afc..1f4c947 100644 --- a/src/tim/vx/internal/include/kernel/vsi_nn_kernel_gpu_shape_optimize.h +++ b/src/tim/vx/internal/include/kernel/vsi_nn_kernel_gpu_shape_optimize.h @@ -30,53 +30,53 @@ vsi_bool vsi_nn_kernel_optimize_reduce_shape ( - const int32_t* shape_x, const size_t rank_x, - const int32_t *axis, const size_t axis_size, - const int32_t* shape_output, const size_t rank_output, - int32_t* out_shape_x, uint32_t* out_rank_x, - int32_t* out_shape_output, uint32_t* out_rank_output, + const vsi_size_t* shape_x, const vsi_size_t rank_x, + const int32_t *axis, const vsi_size_t axis_size, + const vsi_size_t* shape_output, const vsi_size_t rank_output, + vsi_size_t* out_shape_x, uint32_t* out_rank_x, + vsi_size_t* out_shape_output, uint32_t* out_rank_output, int32_t* out_axis, uint32_t* out_axis_size ); vsi_bool vsi_nn_kernel_optimize_tensor_shape ( - const int32_t* shape_x, const size_t rank_x, - const int32_t *axis, const size_t axis_size, - int32_t* out_shape_x, uint32_t* out_rank_x, + const vsi_size_t* shape_x, const vsi_size_t rank_x, + const int32_t *axis, const vsi_size_t axis_size, + vsi_size_t* out_shape_x, uint32_t* out_rank_x, int32_t* out_axis, uint32_t* out_axis_size ); vsi_bool vsi_nn_kernel_optimize_element_shape ( - const int32_t* shape_x, const size_t rank_x, - int32_t* out_shape_x, int32_t* out_rank_x + const vsi_size_t* shape_x, const vsi_size_t rank_x, + vsi_size_t* out_shape_x, vsi_size_t* out_rank_x ); vsi_bool vsi_nn_kernel_optimize_softmax_shape ( - const int32_t* shape_x, const size_t rank_x, const int32_t axis, - int32_t* out_shape_x, uint32_t* out_rank_x,int32_t* out_axis + const vsi_size_t* shape_x, 
const vsi_size_t rank_x, const int32_t axis, + vsi_size_t* out_shape_x, uint32_t* out_rank_x,int32_t* out_axis ); vsi_bool vsi_nn_kernel_optimize_tile_shape ( - const int32_t* shape_x, const size_t rank_x, - const int32_t* multiples, const size_t rank, - const int32_t* shape_output, const size_t rank_output, - int32_t* out_shape_x, int32_t* out_shape_y, - int32_t* out_shape_output, uint32_t* out_rank_output + const vsi_size_t* shape_x, const vsi_size_t rank_x, + const vsi_size_t* multiples, const vsi_size_t rank, + const vsi_size_t* shape_output, const vsi_size_t rank_output, + vsi_size_t* out_shape_x, vsi_size_t* out_shape_y, + vsi_size_t* out_shape_output, vsi_size_t* out_rank_output ); vsi_bool vsi_nn_kernel_optimize_1d_tensor_shape ( - const int32_t* shape, const uint32_t rank, - int32_t* out_shape, uint32_t* out_rank + const vsi_size_t* shape, const uint32_t rank, + vsi_size_t* out_shape, uint32_t* out_rank ); vsi_bool vsi_nn_kernel_optimize_nchw2xhw_shape ( - const int32_t* shape, const uint32_t rank, - int32_t* out_shape, uint32_t* out_rank + const vsi_size_t* shape, const uint32_t rank, + vsi_size_t* out_shape, uint32_t* out_rank ); #endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_conv2d_lstm.h b/src/tim/vx/internal/include/ops/vsi_nn_op_conv2d_lstm.h new file mode 100644 index 0000000..e0eac95 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_conv2d_lstm.h @@ -0,0 +1,76 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_CONV2D_LSTM_H +#define _VSI_NN_OP_CONV2D_LSTM_H + +#include "vsi_nn_types.h" + +enum +{ + CONV2D_LSTM_IN_INPUT = 0, + CONV2D_LSTM_IN_H_STATE = 1, + CONV2D_LSTM_IN_C_STATE = 2, + + CONV2D_LSTM_IN_KERNEL_I2I = 3, + CONV2D_LSTM_IN_KERNEL_I2F = 4, + CONV2D_LSTM_IN_KERNEL_I2C = 5, + CONV2D_LSTM_IN_KERNEL_I2O = 6, + + CONV2D_LSTM_IN_KERNEL_R2I = 7, + CONV2D_LSTM_IN_KERNEL_R2F = 8, + CONV2D_LSTM_IN_KERNEL_R2C = 9, + CONV2D_LSTM_IN_KERNEL_R2O = 10, + + CONV2D_LSTM_IN_BIAS_I = 11, + CONV2D_LSTM_IN_BIAS_F = 12, + CONV2D_LSTM_IN_BIAS_C = 13, + CONV2D_LSTM_IN_BIAS_O = 14, + + CONV2D_LSTM_IN_CNT, + + CONV2D_LSTM_OUT_OUTPUT = 0, + CONV2D_LSTM_OUT_H_STATE = 1, + CONV2D_LSTM_OUT_C_STATE = 2, + + CONV2D_LSTM_OUT_CNT +}; + +typedef struct _vsi_nn_conv2d_lstm_local +{ + void * ptr; +} vsi_nn_conv2d_lstm_local; + +typedef struct _vsi_nn_conv2d_lstm_param +{ + vsi_nn_conv2d_lstm_local * local; + + vsi_nn_activation_e activation; + vsi_nn_activation_e recurrent_activation; + vsi_nn_con2d_lstm_dataformat data_format; + vsi_bool return_sequences; + uint32_t filters; + vsi_nn_conv2d_param conv2d; +} vsi_nn_conv2d_lstm_param; + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_conv2d_lstm_cell.h b/src/tim/vx/internal/include/ops/vsi_nn_op_conv2d_lstm_cell.h new file mode 100644 index 0000000..bd306ad --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_conv2d_lstm_cell.h @@ -0,0 +1,76 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_CONV2D_LSTM_CELL_H +#define _VSI_NN_OP_CONV2D_LSTM_CELL_H + +#include "vsi_nn_types.h" + +#define CONV2D_LSTM_CELL_GATE_NUM 4 // i,f,c,o + +enum +{ + CONV2D_LSTM_CELL_IN_INPUT = 0, + CONV2D_LSTM_CELL_IN_H_STATE = 1, + CONV2D_LSTM_CELL_IN_C_STATE = 2, + + CONV2D_LSTM_CELL_IN_KERNEL_I2I = 3, + CONV2D_LSTM_CELL_IN_KERNEL_I2F = 4, + CONV2D_LSTM_CELL_IN_KERNEL_I2C = 5, + CONV2D_LSTM_CELL_IN_KERNEL_I2O = 6, + + CONV2D_LSTM_CELL_IN_KERNEL_R2I = 7, + CONV2D_LSTM_CELL_IN_KERNEL_R2F = 8, + CONV2D_LSTM_CELL_IN_KERNEL_R2C = 9, + CONV2D_LSTM_CELL_IN_KERNEL_R2O = 10, + + CONV2D_LSTM_CELL_IN_BIAS_I = 11, + CONV2D_LSTM_CELL_IN_BIAS_F = 12, + CONV2D_LSTM_CELL_IN_BIAS_C = 13, + CONV2D_LSTM_CELL_IN_BIAS_O = 14, + + CONV2D_LSTM_CELL_IN_CNT, + + CONV2D_LSTM_CELL_OUT_OUTPUT = 0, + CONV2D_LSTM_CELL_OUT_H_STATE = 1, + CONV2D_LSTM_CELL_OUT_C_STATE = 2, + + CONV2D_LSTM_CELL_OUT_CNT +}; + +typedef struct _vsi_nn_conv2d_lstm_cell_local +{ + void * ptr; +} vsi_nn_conv2d_lstm_cell_local; + +typedef struct _vsi_nn_conv2d_lstm_cell_param +{ + vsi_nn_conv2d_lstm_cell_local * local; + + vsi_nn_activation_e activation; + vsi_nn_activation_e recurrent_activation; + uint32_t filters; + vsi_nn_conv2d_param conv2d; +} vsi_nn_conv2d_lstm_cell_param; + +#endif diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_gru.h b/src/tim/vx/internal/include/ops/vsi_nn_op_gru.h new file mode 100644 index 0000000..fc2c24d --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_gru.h @@ -0,0 +1,78 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_GRU_H +#define _VSI_NN_OP_GRU_H + +#include "vsi_nn_types.h" + +/* Define the inputs and outputs for GRU Layer */ +enum +{ + GRU_IN_INPUT = 0, + GRU_IN_H_STATE = 1, + + /* input kernel */ + GRU_IN_KERNEL_I2Z = 2, + GRU_IN_KERNEL_I2R = 3, + GRU_IN_KERNEL_I2H = 4, + + /* recurrent kernel */ + GRU_IN_KERNEL_R2Z = 5, + GRU_IN_KERNEL_R2R = 6, + GRU_IN_KERNEL_R2H = 7, + + /* input bias */ + GRU_IN_BIAS_I2Z = 8, + GRU_IN_BIAS_I2R = 9, + GRU_IN_BIAS_I2H = 10, + + /* recurrent bias */ + GRU_IN_BIAS_R2Z = 11, + GRU_IN_BIAS_R2R = 12, + GRU_IN_BIAS_R2H = 13, + + GRU_IN_CNT, + + GRU_OUT_OUTPUT = 0, + GRU_OUT_H_STATE = 1, + + GRU_OUT_CNT +}; + +typedef struct _vsi_nn_gru_param +{ + struct _vsi_nn_gru_local * local; + + uint32_t num_units; + vsi_nn_activation_e activation; + vsi_nn_activation_e recurrent_activation; + vsi_bool reset_after; + vsi_bool return_sequences; + vsi_bool time_major; +} vsi_nn_gru_param; +_compiler_assert(offsetof(vsi_nn_gru_param, local) == 0, \ + vsi_nn_gru_h ); + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_grucell.h b/src/tim/vx/internal/include/ops/vsi_nn_op_grucell.h new file mode 100644 index 0000000..8407bda --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_grucell.h @@ -0,0 +1,84 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_GRUCELL_H +#define _VSI_NN_OP_GRUCELL_H + +#include "vsi_nn_types.h" + +enum +{ + GRUCELL_GATES_Z = 0, + GRUCELL_GATES_R = 1, + GRUCELL_GATES_H = 2, + + GRUCELL_GATE_CNT +}; + +/* Define the inputs and outputs for GRUCell */ +enum +{ + GRUCELL_IN_INPUT = 0, + GRUCELL_IN_H_STATE = 1, + + /* input kernel */ + GRUCELL_IN_KERNEL_I2Z = 2, + GRUCELL_IN_KERNEL_I2R = 3, + GRUCELL_IN_KERNEL_I2H = 4, + + /* recurrent kernel */ + GRUCELL_IN_KERNEL_R2Z = 5, + GRUCELL_IN_KERNEL_R2R = 6, + GRUCELL_IN_KERNEL_R2H = 7, + + /* input bias */ + GRUCELL_IN_BIAS_I2Z = 8, + GRUCELL_IN_BIAS_I2R = 9, + GRUCELL_IN_BIAS_I2H = 10, + + /* recurrent bias */ + GRUCELL_IN_BIAS_R2Z = 11, + GRUCELL_IN_BIAS_R2R = 12, + GRUCELL_IN_BIAS_R2H = 13, + + GRUCELL_IN_CNT, + + GRUCELL_OUT_OUTPUT = 0, + GRUCELL_OUT_H_STATE = 1, + + GRUCELL_OUT_CNT +}; + +typedef struct _vsi_nn_grucell_param +{ + struct _vsi_nn_grucell_local * local; + + uint32_t num_units; + vsi_nn_activation_e activation; + vsi_nn_activation_e recurrent_activation; + vsi_bool reset_after; +} vsi_nn_grucell_param; +_compiler_assert(offsetof(vsi_nn_grucell_param, local) == 0, \ + vsi_nn_conv1d_h ); + +#endif \ No newline at end of file diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_activation.h b/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_activation.h new file mode 100644 index 0000000..67a25e5 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_grucell_activation.h @@ -0,0 +1,52 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#ifndef _VSI_NN_OP_GRUCELL_ACTIVATION_H +#define _VSI_NN_OP_GRUCELL_ACTIVATION_H + +#include "vsi_nn_types.h" + +enum { + GRUCELL_ACT_IN_H_STATE = 0, + GRUCELL_ACT_IN_INPUT_FC_H = 1, + GRUCELL_ACT_IN_H_T = 2, + GRUCELL_ACT_IN_Z_T = 3, + + GRUCELL_ACT_IN_CNT, + + GRUCELL_ACT_OUT_OUTPUT = 0, + GRUCELL_ACT_OUT_H_STATE = 1, + + GRUCELL_ACT_OUT_CNT +}; + +typedef struct _vsi_nn_grucell_activation_param +{ + struct _vsi_nn_grucell_activation_local * local; + + vsi_nn_activation_e activation; +} vsi_nn_grucell_activation_param; +_compiler_assert(offsetof(vsi_nn_grucell_activation_param, local) == 0, \ + vsi_nn_grucell_activation_h ); + +#endif \ No newline at end of file diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process.h index c06bf7d..035320a 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process.h @@ -61,7 +61,7 @@ typedef struct _vsi_nn_pre_process_param struct { - uint32_t *size; + vsi_size_t *size; uint32_t dim_num; } output_attr; diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_bgra.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_bgra.h index ee246b3..6b7add6 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_bgra.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_bgra.h @@ -47,7 +47,7 @@ typedef struct _vsi_nn_pre_process_bgra_param struct { - uint32_t *size; + vsi_size_t *size; uint32_t dim_num; } output_attr; diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_gray.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_gray.h index 57abf78..459e25d 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_gray.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_gray.h @@ -56,7 +56,7 @@ typedef struct _vsi_nn_pre_process_gray_param struct { - uint32_t *size; + vsi_size_t *size; uint32_t dim_num; } output_attr; diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_nv12.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_nv12.h index a62bddb..dddee8d 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_nv12.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_nv12.h @@ -53,7 +53,7 @@ typedef struct _vsi_nn_pre_process_nv12_param struct { - uint32_t *size; + vsi_size_t *size; uint32_t dim_num; } output_attr; diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_rgb.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_rgb.h index f62bfe6..63e9335 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_rgb.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_rgb.h @@ -59,7 +59,7 @@ typedef struct _vsi_nn_pre_process_rgb_param struct { - uint32_t *size; + vsi_size_t *size; uint32_t dim_num; } output_attr; diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv420.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv420.h index 149520a..998de5e 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv420.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv420.h @@ -53,7 +53,7 @@ typedef struct _vsi_nn_pre_process_yuv420_param struct { - uint32_t *size; + vsi_size_t *size; uint32_t dim_num; } output_attr; diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv444.h b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv444.h index 
fec700f..c439177 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv444.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_pre_process_yuv444.h @@ -53,7 +53,7 @@ typedef struct _vsi_nn_pre_process_yuv444_param struct { - uint32_t *size; + vsi_size_t *size; uint32_t dim_num; } output_attr; diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_reshape.h b/src/tim/vx/internal/include/ops/vsi_nn_op_reshape.h index 1b5ca0b..a41377a 100644 --- a/src/tim/vx/internal/include/ops/vsi_nn_op_reshape.h +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_reshape.h @@ -37,7 +37,7 @@ typedef struct _vsi_nn_reshape_lcl_data typedef struct _vsi_nn_reshape_param { - const uint32_t * size; + const vsi_size_t * size; uint32_t dim_num; /* reshape layer local data structure */ diff --git a/src/tim/vx/internal/include/utils/vsi_nn_dtype_util.h b/src/tim/vx/internal/include/utils/vsi_nn_dtype_util.h index 62b8e8b..973f2ac 100644 --- a/src/tim/vx/internal/include/utils/vsi_nn_dtype_util.h +++ b/src/tim/vx/internal/include/utils/vsi_nn_dtype_util.h @@ -174,31 +174,31 @@ OVXLIB_API vsi_status vsi_nn_Float32ToDtype const vsi_nn_dtype_t * dst_dtype ); -OVXLIB_API int32_t vsi_nn_DtypeConvertRawData +OVXLIB_API vsi_size_t vsi_nn_DtypeConvertRawData ( uint8_t * src, - int32_t src_bytes, + vsi_size_t src_bytes, const vsi_nn_dtype_t * src_dtype, uint8_t * dst, - int32_t dst_bytes, + vsi_size_t dst_bytes, const vsi_nn_dtype_t * dst_dtype ); -OVXLIB_API int32_t vsi_nn_DtypeConvertRawDataToFloat32 +OVXLIB_API vsi_size_t vsi_nn_DtypeConvertRawDataToFloat32 ( uint8_t * src, - int32_t src_bytes, + vsi_size_t src_bytes, const vsi_nn_dtype_t * src_dtype, float * dst, - int32_t dst_size + vsi_size_t dst_size ); -OVXLIB_API int32_t vsi_nn_DtypeConvertFloat32ToRawData +OVXLIB_API vsi_size_t vsi_nn_DtypeConvertFloat32ToRawData ( float * src, - int32_t src_size, + vsi_size_t src_size, uint8_t * dst, - int32_t dst_bytes, + vsi_size_t dst_bytes, const vsi_nn_dtype_t * dst_dtype ); @@ -221,7 +221,7 @@ OVXLIB_API vsi_status vsi_nn_vxConvertTensorToFloat32Data vx_tensor tensor, vsi_nn_tensor_attr_t *attr, float *f32_data, - uint32_t f32_data_sz + vsi_size_t f32_data_sz ); OVXLIB_API vsi_status vsi_nn_vxConvertFloat32DataToTensor @@ -230,7 +230,7 @@ OVXLIB_API vsi_status vsi_nn_vxConvertFloat32DataToTensor vx_tensor tensor, vsi_nn_tensor_attr_t *attr, float *f32_data, - uint32_t f32_data_sz + vsi_size_t f32_data_sz ); #ifdef __cplusplus diff --git a/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h b/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h index 334c7a0..f017fe3 100644 --- a/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h +++ b/src/tim/vx/internal/include/utils/vsi_nn_dtype_util_prv.h @@ -507,7 +507,7 @@ vsi_bool vsi_nn_dtype_convert_float_to_quantize_asymm8 vsi_bool vsi_nn_dtype_convert_float_to_quantize_symm8_perchannel ( const float * buffer, size_t size, - const int32_t * shape, size_t rank, + const vsi_size_t * shape, size_t rank, const float * scale, size_t scale_size, const int32_t * zero_point, size_t zero_point_size, int32_t channel_dim, @@ -552,7 +552,7 @@ vsi_bool vsi_nn_dtype_convert_quantize_asymm8_to_float vsi_bool vsi_nn_dtype_convert_quantize_symm8_perchannel_to_float ( const int8_t * buffer, size_t size, - const int32_t * shape, size_t rank, + const vsi_size_t * shape, size_t rank, const float * scale, size_t scale_size, const int32_t * zero_point, size_t zero_point_size, int32_t channel_dim, diff --git a/src/tim/vx/internal/include/utils/vsi_nn_math.h 
b/src/tim/vx/internal/include/utils/vsi_nn_math.h index e53066b..14fc5b5 100644 --- a/src/tim/vx/internal/include/utils/vsi_nn_math.h +++ b/src/tim/vx/internal/include/utils/vsi_nn_math.h @@ -68,6 +68,7 @@ extern "C" { } DEFINE_ARRAY_TYPE( int, int32_t ) DEFINE_ARRAY_TYPE( float, float ) +DEFINE_ARRAY_TYPE( size, vsi_size_t ) #undef DEFINE_ARRAY_TYPE @@ -75,9 +76,9 @@ OVXLIB_API void vsi_nn_Transpose ( uint8_t * dst, uint8_t * data, - uint32_t * shape, - uint32_t dim_num, - uint32_t * perm, + vsi_size_t * shape, + vsi_size_t dim_num, + vsi_size_t * perm, vsi_nn_type_e type ); @@ -85,38 +86,38 @@ OVXLIB_API void vsi_nn_Permute ( uint8_t * dst, uint8_t * data, - uint32_t * shape, - uint32_t dim_num, - uint32_t * perm, + vsi_size_t * shape, + vsi_size_t dim_num, + vsi_size_t * perm, vsi_nn_type_e type ); OVXLIB_API void vsi_nn_SqueezeShape ( - uint32_t * shape, - uint32_t * dim_num + vsi_size_t * shape, + vsi_size_t * dim_num ); -OVXLIB_API uint32_t vsi_nn_ShapeProduct +OVXLIB_API vsi_size_t vsi_nn_ShapeProduct ( - uint32_t * shape, - uint32_t dim_num + vsi_size_t * shape, + vsi_size_t dim_num ); //shape: row first <--> column first OVXLIB_API void vsi_nn_InvertShape ( - uint32_t * in, - uint32_t dim_num, - uint32_t * out + vsi_size_t * in, + vsi_size_t dim_num, + vsi_size_t * out ); //Permute shape: row first <--> column first OVXLIB_API void vsi_nn_InvertPermuteShape ( - uint32_t * in, - uint32_t dim_num, - uint32_t * out + vsi_size_t * in, + vsi_size_t dim_num, + vsi_size_t * out ); OVXLIB_API double vsi_nn_Rint @@ -131,7 +132,7 @@ OVXLIB_API double vsi_nn_Rint * @param[in] the low uint32_t of the seed. * @param[in] the high uint32_t of the seed. */ -OVXLIB_API void vsi_nn_random_init_for_philox_4x32_10 +void vsi_nn_random_init_for_philox_4x32_10 ( uint32_t low, uint32_t high @@ -144,7 +145,7 @@ OVXLIB_API void vsi_nn_random_init_for_philox_4x32_10 * @param[out] the buffer for RNG output. * @param[in] the number of generated random numbers. */ -OVXLIB_API void vsi_nn_random_generate_by_philox_4x32_10 +void vsi_nn_random_generate_by_philox_4x32_10 ( uint32_t *random_buf, uint32_t len @@ -158,7 +159,7 @@ OVXLIB_API void vsi_nn_random_generate_by_philox_4x32_10 * @param[out] the buffer for uniform float in [0, 1). * @param[in] the number of random numbers. 
*/ -OVXLIB_API void vsi_nn_random_uniform_transform +void vsi_nn_random_uniform_transform ( uint32_t *random_buf, float *uniform_buf, diff --git a/src/tim/vx/internal/include/utils/vsi_nn_shape_util.h b/src/tim/vx/internal/include/utils/vsi_nn_shape_util.h index 51c0c27..f7eaae0 100644 --- a/src/tim/vx/internal/include/utils/vsi_nn_shape_util.h +++ b/src/tim/vx/internal/include/utils/vsi_nn_shape_util.h @@ -26,18 +26,19 @@ #include #include +#include "vsi_nn_types.h" void vsi_nn_shape_get_stride ( - const int32_t * shape, - size_t rank, - size_t * out_stride + const vsi_size_t * shape, + vsi_size_t rank, + vsi_size_t * out_stride ); -size_t vsi_nn_shape_get_size +vsi_size_t vsi_nn_shape_get_size ( - const int32_t * shape, - size_t rank + const vsi_size_t * shape, + vsi_size_t rank ); #endif diff --git a/src/tim/vx/internal/include/utils/vsi_nn_util.h b/src/tim/vx/internal/include/utils/vsi_nn_util.h index de9d470..1c8e36d 100644 --- a/src/tim/vx/internal/include/utils/vsi_nn_util.h +++ b/src/tim/vx/internal/include/utils/vsi_nn_util.h @@ -72,27 +72,27 @@ extern "C" { OVXLIB_API uint8_t * vsi_nn_LoadBinaryData ( const char * filename, - uint32_t * sz + vsi_size_t * sz ); -OVXLIB_API uint32_t vsi_nn_GetStrideSize +OVXLIB_API vsi_size_t vsi_nn_GetStrideSize ( vsi_nn_tensor_attr_t * attr, - uint32_t * stride + vsi_size_t * stride ); -OVXLIB_API uint32_t vsi_nn_GetStrideSizeBySize +OVXLIB_API vsi_size_t vsi_nn_GetStrideSizeBySize ( - uint32_t * size, - uint32_t dim_num, + vsi_size_t * size, + vsi_size_t dim_num, vsi_nn_type_e type, - uint32_t * stride + vsi_size_t * stride ); -OVXLIB_API uint32_t vsi_nn_GetTotalBytesBySize +OVXLIB_API vsi_size_t vsi_nn_GetTotalBytesBySize ( - uint32_t * size, - uint32_t dim_num, + vsi_size_t * size, + vsi_size_t dim_num, vsi_nn_type_e type ); @@ -116,10 +116,10 @@ OVXLIB_API void vsi_nn_UpdateTensorDims vsi_nn_tensor_attr_t * attr ); -OVXLIB_API uint32_t vsi_nn_ComputeFilterSize +OVXLIB_API vsi_size_t vsi_nn_ComputeFilterSize ( - uint32_t i_size, - uint32_t ksize, + vsi_size_t i_size, + vsi_size_t ksize, uint32_t * pad, uint32_t stride, uint32_t dilation, @@ -134,24 +134,24 @@ OVXLIB_API void vsi_nn_InitTensorsId OVXLIB_API void vsi_nn_ComputePadWithPadType ( - uint32_t * in_shape, + vsi_size_t * in_shape, uint32_t in_dim_num, - uint32_t * ksize, + vsi_size_t * ksize, uint32_t * stride, vsi_nn_pad_e pad_type, vsi_nn_round_type_e rounding, - uint32_t * out_pad + vsi_size_t * out_pad ); OVXLIB_API void vsi_nn_ComputePadWithPadTypeForConv1D ( - uint32_t * in_shape, + vsi_size_t * in_shape, uint32_t in_dim_num, - uint32_t * ksize, + vsi_size_t * ksize, uint32_t * stride, vsi_nn_pad_e pad_type, vsi_nn_round_type_e rounding, - uint32_t * out_pad + vsi_size_t * out_pad ); OVXLIB_API void vsi_nn_GetPadForOvx @@ -171,8 +171,8 @@ OVXLIB_API vsi_bool vsi_nn_CreateTensorGroup OVXLIB_API uint32_t vsi_nn_ShapeToString ( - uint32_t * shape, - uint32_t dim_num, + vsi_size_t * shape, + vsi_size_t dim_num, char * buf, uint32_t buf_sz, vsi_bool for_print @@ -207,9 +207,9 @@ OVXLIB_API vsi_bool vsi_nn_CheckFilePath */ OVXLIB_API uint8_t * vsi_nn_MallocAlignedBuffer ( - uint32_t mem_size, - uint32_t align_start_size, - uint32_t align_block_size + vsi_size_t mem_size, + vsi_size_t align_start_size, + vsi_size_t align_block_size ); /** @@ -227,14 +227,14 @@ OVXLIB_API void vsi_nn_FreeAlignedBuffer OVXLIB_API vsi_bool vsi_nn_IsBufferAligned ( uint8_t * buf, - uint32_t align_start_size + vsi_size_t align_start_size ); OVXLIB_API void vsi_nn_FormatToString ( vsi_nn_tensor_t *tensor, 
char *buf, - uint32_t buf_sz + vsi_size_t buf_sz ); OVXLIB_API const char* vsi_nn_DescribeStatus @@ -242,51 +242,33 @@ OVXLIB_API const char* vsi_nn_DescribeStatus vsi_status status ); -uint32_t vsi_nn_compute_filter_shape +vsi_size_t vsi_nn_compute_filter_shape ( vsi_nn_pad_e padding_type, - uint32_t image_size, - uint32_t ksize, + vsi_size_t image_size, + vsi_size_t ksize, uint32_t stride, uint32_t dilation_rate ); void vsi_nn_compute_padding ( - uint32_t * in_shape, - uint32_t * ksize, + vsi_size_t * in_shape, + vsi_size_t * ksize, uint32_t * stride, uint32_t * dilation, vsi_nn_pad_e pad_type, - uint32_t * out_pad + vsi_size_t * out_pad ); void vsi_nn_compute_padding_conv1d ( - uint32_t * in_shape, - uint32_t * ksize, + vsi_size_t * in_shape, + vsi_size_t * ksize, uint32_t * stride, uint32_t * dilation, vsi_nn_pad_e pad_type, - uint32_t * out_pad - ); - -void vsi_nn_OptimizedEltOPShape - ( - vsi_nn_tensor_t * input, - uint32_t sizes[VSI_NN_MAX_DIM_NUM], - uint32_t * num_of_dims - ); - -vsi_bool vsi_nn_OptimizedEltWiseOPShape - ( - vsi_nn_tensor_t * input0, - vsi_nn_tensor_t * input1, - vsi_nn_tensor_t * output, - uint32_t sizes0[VSI_NN_MAX_DIM_NUM], - uint32_t sizes1[VSI_NN_MAX_DIM_NUM], - uint32_t sizes2[VSI_NN_MAX_DIM_NUM], - uint32_t * dim_num + vsi_size_t * out_pad ); vsi_bool vsi_nn_IsEVISFeatureAvaiable @@ -317,7 +299,7 @@ typedef uint32_t(*comp_func)(void* data, int32_t left, int32_t right); * @param[in] recursively execute vsi_nn_partition. * @param[out] the sorted index of data. */ -OVXLIB_API int32_t vsi_nn_partition +int32_t vsi_nn_partition ( void* data, int32_t left, @@ -350,7 +332,7 @@ static inline void vsi_nn_reorder_tensor } } -void vsi_nn_print_int_array( int32_t* array, size_t size ); +void vsi_nn_print_size_array( vsi_size_t* array, size_t size ); float vsi_nn_activation ( diff --git a/src/tim/vx/internal/include/vsi_nn_internal_node.h b/src/tim/vx/internal/include/vsi_nn_internal_node.h index e314f7d..c1124b5 100644 --- a/src/tim/vx/internal/include/vsi_nn_internal_node.h +++ b/src/tim/vx/internal/include/vsi_nn_internal_node.h @@ -88,6 +88,7 @@ vsi_nn_internal_tensor_t* vsi_nn_internal_create_zero_bias_tensor vsi_nn_node_t* node, vsi_nn_tensor_attr_t* input_attr, vsi_nn_tensor_attr_t* weight_attr, + vsi_nn_op_t op, vsi_bool use_virtual_tensor ); @@ -132,8 +133,8 @@ vsi_nn_internal_node_t* vsi_nn_internal_new_node ( vsi_nn_node_t* node, vsi_nn_op_t op, - uint32_t input_num, - uint32_t output_num + vsi_size_t input_num, + vsi_size_t output_num ); void* vsi_nn_internal_new_node_param diff --git a/src/tim/vx/internal/include/vsi_nn_node.h b/src/tim/vx/internal/include/vsi_nn_node.h index 9b3e302..b922204 100644 --- a/src/tim/vx/internal/include/vsi_nn_node.h +++ b/src/tim/vx/internal/include/vsi_nn_node.h @@ -117,8 +117,8 @@ OVXLIB_API vsi_nn_node_t * vsi_nn_NewNode ( vsi_nn_graph_t * graph, vsi_nn_op_t op, - uint32_t input_num, - uint32_t output_num + vsi_size_t input_num, + vsi_size_t output_num ); /** diff --git a/src/tim/vx/internal/include/vsi_nn_node_type.h b/src/tim/vx/internal/include/vsi_nn_node_type.h index b490601..a6830f6 100644 --- a/src/tim/vx/internal/include/vsi_nn_node_type.h +++ b/src/tim/vx/internal/include/vsi_nn_node_type.h @@ -172,6 +172,11 @@ #include "ops/vsi_nn_op_grouped_conv1d.h" #include "ops/vsi_nn_op_scatter_nd_update.h" #include "ops/vsi_nn_op_gelu.h" +#include "ops/vsi_nn_op_conv2d_lstm.h" +#include "ops/vsi_nn_op_conv2d_lstm_cell.h" +#include "ops/vsi_nn_op_gru.h" +#include "ops/vsi_nn_op_grucell.h" +#include 
"ops/vsi_nn_op_grucell_activation.h" /* custom node head define define */ #include "custom/vsi_nn_custom_node_type.h" @@ -330,6 +335,11 @@ typedef union _vsi_nn_nn_param vsi_nn_grouped_conv1d_param grouped_conv1d; vsi_nn_scatter_nd_update_param scatter_nd_update; vsi_nn_gelu_param gelu; + vsi_nn_conv2d_lstm_param conv2d_lstm; + vsi_nn_conv2d_lstm_cell_param conv2d_lstm_cell; + vsi_nn_gru_param gru; + vsi_nn_grucell_param grucell; + vsi_nn_grucell_activation_param grucell_activation; uint8_t client_param[128]; /* custom node data struct define */ diff --git a/src/tim/vx/internal/include/vsi_nn_ops.h b/src/tim/vx/internal/include/vsi_nn_ops.h index 23a750d..4c79499 100644 --- a/src/tim/vx/internal/include/vsi_nn_ops.h +++ b/src/tim/vx/internal/include/vsi_nn_ops.h @@ -280,8 +280,8 @@ void vsi_nn_OpGetIoNum ( vsi_nn_op_t op, vsi_nn_node_t * node, - uint32_t * input_num, - uint32_t * output_num + vsi_size_t * input_num, + vsi_size_t * output_num ); OVXLIB_API vsi_bool vsi_nn_OpGenerateTensor diff --git a/src/tim/vx/internal/include/vsi_nn_pre_post_process.h b/src/tim/vx/internal/include/vsi_nn_pre_post_process.h index 74938f7..386123c 100644 --- a/src/tim/vx/internal/include/vsi_nn_pre_post_process.h +++ b/src/tim/vx/internal/include/vsi_nn_pre_post_process.h @@ -181,6 +181,7 @@ vsi_status vsi_nn_add_single_preproc_node ( vsi_nn_graph_t* graph, uint32_t input_idx, + vsi_nn_tensor_id_t input, vsi_nn_node_t** first_node, uint32_t nodes_count, vsi_nn_preprocess_base_t* preprocess, @@ -234,7 +235,6 @@ OVXLIB_API vsi_status vsi_nn_AddGraphPostProcess uint32_t count ); - #ifdef __cplusplus } #endif diff --git a/src/tim/vx/internal/include/vsi_nn_rnn_helper.h b/src/tim/vx/internal/include/vsi_nn_rnn_helper.h index e9191ca..4bef7b9 100644 --- a/src/tim/vx/internal/include/vsi_nn_rnn_helper.h +++ b/src/tim/vx/internal/include/vsi_nn_rnn_helper.h @@ -223,8 +223,8 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_reshape vsi_nn_node_t* self, vsi_nn_tensor_t* input_tensor, vsi_nn_tensor_t* output_tensor, - uint32_t* size, - uint32_t dim_num, + vsi_size_t* size, + vsi_size_t dim_num, vsi_bool use_virtual_tensor ); @@ -233,8 +233,8 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_permute vsi_nn_node_t* self, vsi_nn_tensor_t* input_tensor, vsi_nn_tensor_t* output_tensor, - uint32_t* perm, - uint32_t dim_num, + vsi_size_t* perm, + vsi_size_t dim_num, vsi_bool use_virtual_tensor ); diff --git a/src/tim/vx/internal/include/vsi_nn_tensor.h b/src/tim/vx/internal/include/vsi_nn_tensor.h index 4dcde2c..846054f 100644 --- a/src/tim/vx/internal/include/vsi_nn_tensor.h +++ b/src/tim/vx/internal/include/vsi_nn_tensor.h @@ -132,7 +132,7 @@ typedef struct vsi_nn_dtype typedef struct vsi_nn_tensor_attr { /** Tensor shape */ - uint32_t size[VSI_NN_MAX_DIM_NUM]; + vsi_size_t size[VSI_NN_MAX_DIM_NUM]; /** Dimension number */ uint32_t dim_num; /** If it's virtual tensor*/ diff --git a/src/tim/vx/internal/include/vsi_nn_tensor_util.h b/src/tim/vx/internal/include/vsi_nn_tensor_util.h index bf39d49..a88864d 100644 --- a/src/tim/vx/internal/include/vsi_nn_tensor_util.h +++ b/src/tim/vx/internal/include/vsi_nn_tensor_util.h @@ -183,10 +183,10 @@ OVXLIB_API uint8_t * vsi_nn_ConvertRawTensorToData ( vx_context context, vx_tensor tensor, - uint32_t * dim, + vsi_size_t * dim, vx_enum * data_format, - uint32_t * size, - uint32_t * stride_size, + vsi_size_t * size, + vsi_size_t * stride_size, vx_tensor_addressing * addr, vx_enum accessor ); @@ -211,7 +211,7 @@ OVXLIB_API uint8_t * vsi_nn_ConvertRawTensorToData2 vx_context context, vx_tensor 
tensor, vsi_nn_tensor_attr_t * attr, - uint32_t * stride_size, + vsi_size_t * stride_size, vx_tensor_addressing * addr, vx_enum accessor ); @@ -266,7 +266,7 @@ OVXLIB_API void vsi_nn_SaveDataToText ( const char * filename, uint8_t * data, - uint32_t data_size, + vsi_size_t data_size, vsi_nn_type_e data_format, char * seperator ); @@ -356,7 +356,7 @@ OVXLIB_API vsi_status vsi_nn_CopyRawDataToTensor vsi_nn_tensor_t* tensor ); -OVXLIB_API uint32_t vsi_nn_CopyTensorToBuffer +OVXLIB_API vsi_size_t vsi_nn_CopyTensorToBuffer ( vsi_nn_graph_t * graph, vsi_nn_tensor_t * tensor, @@ -394,25 +394,25 @@ OVXLIB_API void vsi_nn_TransposeTensor ( vsi_nn_graph_t * graph, vsi_nn_tensor_t * tensor, - uint32_t * perm, - uint32_t dim_num, - uint32_t * as_shape + vsi_size_t * perm, + vsi_size_t dim_num, + vsi_size_t * as_shape ); OVXLIB_API void vsi_nn_PermuteTensor ( vsi_nn_graph_t * graph, vsi_nn_tensor_t * tensor, - uint32_t * perm, - uint32_t dim_num + vsi_size_t * perm, + vsi_size_t dim_num ); OVXLIB_API vsi_bool vsi_nn_CalcReshapeTensor ( vsi_nn_tensor_t * input, vsi_nn_tensor_t * output, - uint32_t * shape, - uint32_t dim_num + vsi_size_t * shape, + vsi_size_t dim_num ); OVXLIB_API vsi_bool vsi_nn_ReshapeTensor @@ -420,8 +420,8 @@ OVXLIB_API vsi_bool vsi_nn_ReshapeTensor vsi_nn_graph_t * graph, vsi_nn_tensor_t * input, vsi_nn_tensor_t * output, - const uint32_t * shape, - uint32_t dim_num + const vsi_size_t * shape, + vsi_size_t dim_num ); /** @@ -430,7 +430,7 @@ OVXLIB_API vsi_bool vsi_nn_ReshapeTensor * @param[in] tensor Tensor handle. * @return Element number of the tensor. */ -OVXLIB_API uint32_t vsi_nn_GetElementNum +OVXLIB_API vsi_size_t vsi_nn_GetElementNum ( const vsi_nn_tensor_t * tensor ); @@ -446,10 +446,10 @@ OVXLIB_API uint32_t vsi_nn_GetElementNum * * @return Size of the tensor. 
*/ -OVXLIB_API uint32_t vsi_nn_GetTensorSize +OVXLIB_API vsi_size_t vsi_nn_GetTensorSize ( - const uint32_t * shape, - uint32_t dim_num, + const vsi_size_t * shape, + vsi_size_t dim_num, vsi_nn_type_e dtype ); @@ -507,8 +507,8 @@ OVXLIB_API void vsi_nn_Free OVXLIB_API vx_tensor vsi_nn_CreateViewTensor ( vsi_nn_graph_t *graph, - uint32_t *start, - uint32_t *end, + vsi_size_t *start, + vsi_size_t *end, vsi_nn_tensor_t *tensor ); @@ -536,7 +536,7 @@ OVXLIB_API vsi_status vsi_nn_SwapTensorHandle vsi_nn_tensor_t * tensor1 ); -OVXLIB_API uint32_t vsi_nn_vxGetTensorElementNum +OVXLIB_API vsi_size_t vsi_nn_vxGetTensorElementNum ( vsi_nn_tensor_attr_t *attr ); @@ -571,7 +571,7 @@ OVXLIB_API vsi_status vsi_nn_vxCopyDataToTensor * * @return the offset from the beginning of the tensor(offset unit: element) */ -OVXLIB_API uint32_t vsi_nn_GetOffsetByCoords +OVXLIB_API vsi_size_t vsi_nn_GetOffsetByCoords ( vsi_nn_tensor_attr_t *attr, uint32_t *coords @@ -621,8 +621,8 @@ vsi_nn_tensor_t *vsi_nn_reshape_tensor ( vsi_nn_graph_t * graph, vsi_nn_tensor_t * input, - uint32_t * shape, - uint32_t dim_num + vsi_size_t * shape, + vsi_size_t dim_num ); /** @@ -646,9 +646,9 @@ vsi_status vsi_nn_copy_tensor_veiw_patch vx_tensor tensor, vsi_nn_tensor_attr_t *attr, void *user_ptr, - uint32_t *start, - uint32_t *end, - uint32_t *stride, + vsi_size_t *start, + vsi_size_t *end, + vsi_size_t *stride, vsi_enum usage, vsi_enum user_memory_type ); diff --git a/src/tim/vx/internal/include/vsi_nn_types.h b/src/tim/vx/internal/include/vsi_nn_types.h index cb92928..6e082f8 100644 --- a/src/tim/vx/internal/include/vsi_nn_types.h +++ b/src/tim/vx/internal/include/vsi_nn_types.h @@ -37,6 +37,39 @@ extern "C"{ #define inline __inline #endif +#if VX_VA40_EXT_SUPPORT +#define VSI_40BIT_VA_SUPPORT +#endif + +#if (defined(_MSC_VER) || defined(__MINGW32)) + #define SIZE_T_SPECIFIER "Iu" + #define SSIZE_T_SPECIFIER "Id" + #ifdef VSI_40BIT_VA_SUPPORT + #define VSI_SIZE_T_SPECIFIER "Iu" + #define VSI_SSIZE_T_SPECIFIER "Id" + #else + #define VSI_SIZE_T_SPECIFIER "u" + #define VSI_SSIZE_T_SPECIFIER "d" + #endif +#else + #define SIZE_T_SPECIFIER "zu" + #define SSIZE_T_SPECIFIER "zd" + #ifdef VSI_40BIT_VA_SUPPORT + #define VSI_SIZE_T_SPECIFIER "zu" + #define VSI_SSIZE_T_SPECIFIER "zd" + #else + #define VSI_SIZE_T_SPECIFIER "u" + #define VSI_SSIZE_T_SPECIFIER "d" + #endif +#endif + +#if defined(_MSC_VER) +#include +typedef SSIZE_T ssize_t; +#else +#include +#endif + /** Enumuration type */ typedef int32_t vsi_enum; /** Status type */ @@ -47,6 +80,16 @@ typedef int32_t vsi_bool; typedef uint16_t vsi_float16; /** Truncate float16 */ typedef uint16_t vsi_bfloat16; +/** Tensor size */ +#ifdef VSI_40BIT_VA_SUPPORT +typedef size_t vsi_size_t; +typedef ssize_t vsi_ssize_t; +#else +typedef uint32_t vsi_size_t; +typedef int32_t vsi_ssize_t; +#endif + +#define VSI_SIZE_T #ifndef TRUE #define TRUE 1 @@ -180,6 +223,12 @@ typedef enum _vsi_nn_node_attr_preload_type_e VSI_NN_NODE_PRELOAD_AXISRAM } vsi_nn_node_attr_preload_type_e; +typedef enum _vsi_nn_con2d_lstm_dataformat +{ + CONV2D_LSTM_CHANNELS_LAST, + CONV2D_LSTM_CHANNELS_FIRST +} vsi_nn_con2d_lstm_dataformat; + /** Deprecated */ typedef uint32_t vsi_nn_size_t; diff --git a/src/tim/vx/internal/include/vsi_nn_version.h b/src/tim/vx/internal/include/vsi_nn_version.h index 1e7123d..db3ba86 100644 --- a/src/tim/vx/internal/include/vsi_nn_version.h +++ b/src/tim/vx/internal/include/vsi_nn_version.h @@ -33,7 +33,7 @@ extern "C"{ #define VSI_NN_VERSION_MAJOR 1 #define VSI_NN_VERSION_MINOR 1 -#define 
VSI_NN_VERSION_PATCH 33 +#define VSI_NN_VERSION_PATCH 34 #define VSI_NN_VERSION \ (VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH) diff --git a/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_softmax_evis.c b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_softmax_evis.c index f60ae11..0ec7145 100644 --- a/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_softmax_evis.c +++ b/src/tim/vx/internal/src/custom/ops/kernel/evis/custom_softmax_evis.c @@ -80,7 +80,7 @@ DEF_KERNEL_INITIALIZER(_softmax_initializer) return status; } - sf_size = attr->shape->data[0]; + sf_size = (int)attr->shape->data[0]; gpu_param.global_offset[0] = 0; gpu_param.global_offset[1] = 0; diff --git a/src/tim/vx/internal/src/custom/ops/op_custom_softmax.c b/src/tim/vx/internal/src/custom/ops/op_custom_softmax.c index 3aa9835..3a37247 100644 --- a/src/tim/vx/internal/src/custom/ops/op_custom_softmax.c +++ b/src/tim/vx/internal/src/custom/ops/op_custom_softmax.c @@ -76,7 +76,7 @@ static vsi_bool op_setup { outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; memmove(outputs[0]->attr.size, inputs[0]->attr.size, - inputs[0]->attr.dim_num * sizeof(uint32_t)); + inputs[0]->attr.dim_num * sizeof(vsi_size_t)); } return TRUE; } /* op_setup() */ diff --git a/src/tim/vx/internal/src/kernel/cl/add_mean_std_norm_cl.c b/src/tim/vx/internal/src/kernel/cl/add_mean_std_norm_cl.c index 0e94576..8036d0e 100644 --- a/src/tim/vx/internal/src/kernel/cl/add_mean_std_norm_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/add_mean_std_norm_cl.c @@ -111,7 +111,7 @@ DEF_KERNEL_INITIALIZER(_add_mean_std_norm_initializer) }; vx_tensor input0 = (vx_tensor)param[0]; vsi_nn_kernel_tensor_attr_t *input0_attr = NULL; - vsi_int_array_t *input_shape = NULL; + vsi_size_array_t *input_shape = NULL; input0_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input0); CHECK_PTR_FAIL_GOTO( input0_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); diff --git a/src/tim/vx/internal/src/kernel/cl/argmax_cl.c b/src/tim/vx/internal/src/kernel/cl/argmax_cl.c index 5855db8..bf5b07c 100644 --- a/src/tim/vx/internal/src/kernel/cl/argmax_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/argmax_cl.c @@ -141,7 +141,7 @@ DEF_KERNEL_INITIALIZER(_argmax_initializer) vsi_status status = VSI_FAILURE; vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; - vsi_int_array_t * out_shape = NULL; + vsi_size_array_t * out_shape = NULL; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); @@ -238,13 +238,13 @@ static vsi_nn_kernel_node_t _setup vsi_bool image_2d = FALSE; vsi_nn_kernel_node_t node = NULL; int32_t axis = 0; - int32_t axis_size = 0; + vsi_size_t axis_size = 0; axis = vsi_nn_kernel_param_get_int32(params, "axis"); - if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num ) - || !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + || !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) || axis > 2) { diff --git a/src/tim/vx/internal/src/kernel/cl/argmin_cl.c b/src/tim/vx/internal/src/kernel/cl/argmin_cl.c index 399e496..2911a84 100644 --- a/src/tim/vx/internal/src/kernel/cl/argmin_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/argmin_cl.c @@ -141,7 +141,7 @@ DEF_KERNEL_INITIALIZER(_argmin_initializer) vsi_status status = VSI_FAILURE; vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; - 
vsi_int_array_t * out_shape = NULL; + vsi_size_array_t * out_shape = NULL; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); @@ -233,13 +233,13 @@ static vsi_nn_kernel_node_t _setup vsi_bool image_2d = FALSE; vsi_nn_kernel_node_t node = NULL; int32_t axis = 0; - int32_t axis_size = 0; + size_t axis_size = 0; axis = vsi_nn_kernel_param_get_int32(params, "axis"); - if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num ) - || !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + || !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) || axis > 2) { diff --git a/src/tim/vx/internal/src/kernel/cl/batchnorm_single_cl.c b/src/tim/vx/internal/src/kernel/cl/batchnorm_single_cl.c index 3278730..2223eb9 100644 --- a/src/tim/vx/internal/src/kernel/cl/batchnorm_single_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/batchnorm_single_cl.c @@ -55,11 +55,6 @@ __BEGIN_DECLS HASH_BATCH_NORM_SH_KERNEL_NAME( SRC_TYPE, OUT_TYPE), \ VSI_NN_GEN_BATCH_NORM_KERNEL_SOURCE_NAME }, -#define TENSOR_BATCH_NORM_FLOAT( SRC_TYPE, OUT_TYPE) \ - { HASH_BATCH_NORM_KEY( SRC_TYPE, OUT_TYPE, 0), \ - HASH_BATCH_NORM_SH_KERNEL_NAME( F32, F32), \ - VSI_NN_GEN_BATCH_NORM_KERNEL_SOURCE_NAME }, - #define HASH_BATCH_NORM_SH_KERNEL_2D_NAME( SRC_TYPE, DST_TYPE) \ CVIVANTE_NAMESPACE("batch_norm_"#SRC_TYPE"to"#DST_TYPE"_2D") @@ -68,40 +63,29 @@ __BEGIN_DECLS HASH_BATCH_NORM_SH_KERNEL_2D_NAME( SRC_TYPE, OUT_TYPE), \ VSI_NN_GEN_BATCH_NORM_KERNEL_SOURCE_NAME }, -#define TENSOR_BATCH_NORM_FLOAT_2D( SRC_TYPE, OUT_TYPE) \ - { HASH_BATCH_NORM_KEY( SRC_TYPE, OUT_TYPE, 1), \ - HASH_BATCH_NORM_SH_KERNEL_2D_NAME( F32, F32), \ - VSI_NN_GEN_BATCH_NORM_KERNEL_SOURCE_NAME }, - static const struct { uint32_t key; char* function_name; const char* source_name; } kernel_map[] = { - TENSOR_BATCH_NORM_FLOAT(F32, F32) - TENSOR_BATCH_NORM_FLOAT(F32, F32) - TENSOR_BATCH_NORM_FLOAT(F32, F32) - TENSOR_BATCH_NORM_FLOAT(F16, F16) - TENSOR_BATCH_NORM_FLOAT(F16, F16) - TENSOR_BATCH_NORM_FLOAT(F16, F16) + TENSOR_BATCH_NORM_KERNELS(F32, F32) + TENSOR_BATCH_NORM_KERNELS(F32, U8) + TENSOR_BATCH_NORM_KERNELS(F32, I32) - TENSOR_BATCH_NORM_FLOAT_2D(F32, F32) - TENSOR_BATCH_NORM_FLOAT_2D(F32, F32) - TENSOR_BATCH_NORM_FLOAT_2D(F16, F16) - TENSOR_BATCH_NORM_FLOAT_2D(F16, F16) + TENSOR_BATCH_NORM_KERNELS_2D(F32, F32) + TENSOR_BATCH_NORM_KERNELS_2D(F32, U8) + TENSOR_BATCH_NORM_KERNELS_2D(F32, I32) - TENSOR_BATCH_NORM_KERNELS(U8, U8) - TENSOR_BATCH_NORM_KERNELS(U8, U8) - TENSOR_BATCH_NORM_KERNELS(U8, U8) - TENSOR_BATCH_NORM_KERNELS(U8, U8) - TENSOR_BATCH_NORM_KERNELS(U8, U8) - TENSOR_BATCH_NORM_KERNELS(U8, U8) + TENSOR_BATCH_NORM_KERNELS(U8, U8) + TENSOR_BATCH_NORM_KERNELS(U8, F32) + TENSOR_BATCH_NORM_KERNELS(I32, I32) + TENSOR_BATCH_NORM_KERNELS(I32, F32) - TENSOR_BATCH_NORM_KERNELS_2D(U8, U8) - TENSOR_BATCH_NORM_KERNELS_2D(U8, U8) - TENSOR_BATCH_NORM_KERNELS_2D(U8, U8) - TENSOR_BATCH_NORM_KERNELS_2D(U8, U8) + TENSOR_BATCH_NORM_KERNELS_2D(U8, U8) + TENSOR_BATCH_NORM_KERNELS_2D(U8, F32) + TENSOR_BATCH_NORM_KERNELS_2D(I32, I32) + TENSOR_BATCH_NORM_KERNELS_2D(I32, F32) }; /* @@ -149,7 +133,7 @@ DEF_KERNEL_INITIALIZER(_log_softmax_initializer) vsi_status status = VSI_FAILURE; vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; - vsi_int_array_t * in_shape = NULL; + vsi_size_array_t * in_shape = NULL; attr[0] = vsi_nn_kernel_tensor_attr_create( 
(vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); @@ -190,6 +174,24 @@ static vsi_status _query_kernel input_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + if (input_dtype == I8 || input_dtype == I16) + { + input_dtype = I32; + } + else if (input_dtype == F16) + { + input_dtype = F32; + } + + if (output_dtype == I8 || output_dtype == I16) + { + output_dtype = I32; + } + else if (output_dtype == F16) + { + output_dtype = F32; + } + key = HASH_BATCH_NORM_KEY( input_dtype, output_dtype, image_2d ); for( i = 0; i < _cnt_of_array(kernel_map); i ++ ) @@ -239,14 +241,36 @@ static vsi_nn_kernel_node_t _setup if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC ) { input_scale = inputs[0]->attr.dtype.scale; - input_tail = 0 - (float)inputs[0]->attr.dtype.zero_point * inputs[0]->attr.dtype.scale; + input_tail = (float)inputs[0]->attr.dtype.zero_point * inputs[0]->attr.dtype.scale; + } + else if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP ) + { + if (inputs[0]->attr.dtype.fl > 0) + { + input_scale = (1.0f / ((float) ((int64_t)1 << inputs[0]->attr.dtype.fl))); + } + else + { + input_scale = ((float) ((int64_t)1 << -inputs[0]->attr.dtype.fl)); + } } if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC ) { - input_scale = 1.0f / outputs[0]->attr.dtype.scale; + output_scale = 1.0f / outputs[0]->attr.dtype.scale; output_zp = (float)outputs[0]->attr.dtype.zero_point + 0.5f; } + else if (outputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_DFP ) + { + if (outputs[0]->attr.dtype.fl > 0) + { + output_scale = (float) ((int64_t)1 << outputs[0]->attr.dtype.fl); + } + else + { + output_scale = ((float) 1.0f / ((int64_t)1 << -outputs[0]->attr.dtype.fl)); + } + } if ( (inputs[1]->attr.is_const && inputs[2]->attr.is_const) || ( inputs[1]->attr.dtype.vx_type != VSI_NN_TYPE_FLOAT16 @@ -262,7 +286,7 @@ static vsi_nn_kernel_node_t _setup return NULL; } - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num ) ) { diff --git a/src/tim/vx/internal/src/kernel/cl/cast_cl.c b/src/tim/vx/internal/src/kernel/cl/cast_cl.c index 6b578af..33291a7 100644 --- a/src/tim/vx/internal/src/kernel/cl/cast_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/cast_cl.c @@ -126,7 +126,7 @@ DEF_KERNEL_INITIALIZER(_cast_initializer) {0, 0, 0} }; vsi_nn_kernel_tensor_attr_t * output_attr = NULL; - vsi_int_array_t * out_shape = NULL; + vsi_size_array_t * out_shape = NULL; output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); @@ -251,7 +251,7 @@ static vsi_nn_kernel_node_t _setup vsi_bool image_2d = FALSE; vsi_nn_kernel_node_t node = NULL; - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num ) ) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/cl/clip_cl.c b/src/tim/vx/internal/src/kernel/cl/clip_cl.c index f8ec75d..c611991 100644 --- a/src/tim/vx/internal/src/kernel/cl/clip_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/clip_cl.c @@ -120,7 +120,7 @@ DEF_KERNEL_INITIALIZER(_clip_initializer) {0, 0, 0} }; vsi_nn_kernel_tensor_attr_t * output_attr = NULL; - vsi_int_array_t * out_shape = NULL; + vsi_size_array_t * out_shape = NULL; output_attr = 
vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); @@ -251,7 +251,7 @@ static vsi_nn_kernel_node_t _setup outputScale = 1.0f / outputScale; inputTail = -(inputTail * inputScale); - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num ) ) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/cl/comparisons_cl.c b/src/tim/vx/internal/src/kernel/cl/comparisons_cl.c index 62bb0f4..80f2f95 100644 --- a/src/tim/vx/internal/src/kernel/cl/comparisons_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/comparisons_cl.c @@ -226,7 +226,7 @@ DEF_KERNEL_INITIALIZER(_comparisons_initializer) vsi_status status = VSI_FAILURE; vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; - vsi_int_array_t * out_shape = NULL; + vsi_size_array_t * out_shape = NULL; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); @@ -342,7 +342,7 @@ static vsi_nn_kernel_node_t _setup float input1Scale = inputs[1]->attr.dtype.scale; float input1Tail = (float)inputs[1]->attr.dtype.zero_point * input1Scale; - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/cl/detect_post_box_cl.c b/src/tim/vx/internal/src/kernel/cl/detect_post_box_cl.c index 697c34e..fdf9b40 100644 --- a/src/tim/vx/internal/src/kernel/cl/detect_post_box_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/detect_post_box_cl.c @@ -124,7 +124,7 @@ DEF_KERNEL_INITIALIZER(_detect_post_box_initializer) {0, 0, 0} }; vsi_nn_kernel_tensor_attr_t * input_attr = NULL; - vsi_int_array_t * in_shape = NULL; + vsi_size_array_t * in_shape = NULL; input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); diff --git a/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c b/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c index 50416af..7dde9f8 100644 --- a/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/eltwise_unary_cl.c @@ -225,7 +225,7 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer) vsi_status status = VSI_FAILURE; vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; - vsi_int_array_t * out_shape = NULL; + vsi_size_array_t * out_shape = NULL; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); @@ -314,8 +314,8 @@ static vsi_nn_kernel_node_t _setup vsi_bool image_2d = FALSE; vsi_nn_kernel_node_t node = NULL; vsi_nn_tensor_t* rs_tensors[2] = { NULL }; - int32_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; - int32_t new_rank = 0; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t new_rank = 0; vsi_bool ret; float inputScale = inputs[0]->attr.dtype.scale; @@ -325,17 +325,17 @@ static vsi_nn_kernel_node_t _setup float alpha = vsi_nn_kernel_param_get_float32( params, "alpha" ); ret = vsi_nn_kernel_optimize_element_shape( - (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, + inputs[0]->attr.size, inputs[0]->attr.dim_num, shape, &new_rank ); if( ret ) { rs_tensors[0] = vsi_nn_reshape_tensor( graph, - inputs[0], (uint32_t*)shape, new_rank ); + inputs[0], shape, new_rank ); 
rs_tensors[1] = vsi_nn_reshape_tensor( graph, - outputs[0], (uint32_t*)shape, new_rank ); + outputs[0], shape, new_rank ); } - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)rs_tensors[0]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( rs_tensors[0]->attr.size, rs_tensors[0]->attr.dim_num ) ) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/cl/erf_cl.c b/src/tim/vx/internal/src/kernel/cl/erf_cl.c index e817d19..1cd573b 100644 --- a/src/tim/vx/internal/src/kernel/cl/erf_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/erf_cl.c @@ -133,7 +133,7 @@ DEF_KERNEL_INITIALIZER(_erf_initializer) vsi_status status = VSI_FAILURE; vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; - vsi_int_array_t * out_shape = NULL; + vsi_size_array_t * out_shape = NULL; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); @@ -233,8 +233,8 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t node_params[_ERF_PARAM_NUM] = {NULL}; vsi_nn_kernel_node_t node = NULL; vsi_nn_tensor_t* rs_tensors[2] = { NULL }; - int32_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; - int32_t new_rank = 0; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t new_rank = 0; vsi_bool ret = FALSE; vsi_bool image_2d = FALSE; @@ -244,17 +244,17 @@ static vsi_nn_kernel_node_t _setup float outputZP = (float)outputs[0]->attr.dtype.zero_point + 0.5f; ret = vsi_nn_kernel_optimize_element_shape( - (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, + inputs[0]->attr.size, inputs[0]->attr.dim_num, shape, &new_rank ); if ( ret ) { rs_tensors[0] = vsi_nn_reshape_tensor( graph, - inputs[0], (uint32_t*)shape, new_rank ); + inputs[0], shape, new_rank ); rs_tensors[1] = vsi_nn_reshape_tensor( graph, - outputs[0], (uint32_t*)shape, new_rank ); + outputs[0], shape, new_rank ); } - if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)rs_tensors[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( rs_tensors[0]->attr.size, rs_tensors[0]->attr.dim_num ) ) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/cl/floordiv_cl.c b/src/tim/vx/internal/src/kernel/cl/floordiv_cl.c index 11029f6..4ceb1c2 100644 --- a/src/tim/vx/internal/src/kernel/cl/floordiv_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/floordiv_cl.c @@ -126,7 +126,7 @@ DEF_KERNEL_INITIALIZER(_floordiv_initializer) vx_status status = VX_FAILURE; vx_tensor output = (vx_tensor)param[2]; vsi_nn_kernel_tensor_attr_t *output_attr = NULL; - vsi_int_array_t *output_shape = NULL; + vsi_size_array_t *output_shape = NULL; output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); @@ -266,7 +266,7 @@ static vsi_nn_kernel_node_t _setup input0Tail = -(input0Tail * input0Scale); input1Tail = -(input1Tail * input1Scale); - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/cl/gather_cl.c b/src/tim/vx/internal/src/kernel/cl/gather_cl.c index 111f66f..aa5e2e5 100644 --- a/src/tim/vx/internal/src/kernel/cl/gather_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/gather_cl.c @@ -95,16 +95,16 @@ static vx_param_description_t _gather_kernel_param_def[] = static vsi_status cal_gather_tensor_reshape_size ( vsi_nn_tensor_t ** inputs, - int32_t sizes[VSI_NN_MAX_DIM_NUM], + vsi_size_t sizes[VSI_NN_MAX_DIM_NUM], uint32_t block_size, uint32_t 
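/*
 * The eltwise_unary and erf setups above first flatten the tensor with
 * vsi_nn_kernel_optimize_element_shape and then reshape input and output to
 * the same collapsed shape. This is a rough sketch of that flattening idea,
 * assuming a simple split against a maximum image width; the real routine
 * may factor the shape differently. vsi_size_t is approximated with size_t.
 */
#include <stddef.h>

typedef size_t vsi_size_t;              /* stand-in for the library type */
#define MAX_IMAGE_WIDTH (65536)         /* assumed limit for the sketch */

static void flatten_element_shape(const vsi_size_t *shape, vsi_size_t rank,
                                  vsi_size_t out_shape[2], vsi_size_t *out_rank)
{
    vsi_size_t total = 1;
    vsi_size_t i;
    for (i = 0; i < rank; i++)
    {
        total *= shape[i];
    }
    out_shape[0] = total;
    out_shape[1] = 1;
    if (total >= MAX_IMAGE_WIDTH)
    {
        /* naive split: largest divisor below the width limit */
        vsi_size_t w = MAX_IMAGE_WIDTH - 1;
        while (total % w != 0)
        {
            w--;
        }
        out_shape[0] = w;
        out_shape[1] = total / w;
    }
    *out_rank = 2;
}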
idxFlg ) { vsi_status status = VSI_FAILURE; uint32_t dims_num = inputs[0]->attr.dim_num; - uint32_t *input_size = inputs[0]->attr.size; + vsi_size_t *input_size = inputs[0]->attr.size; uint32_t i = 0; - uint32_t elementCnt = 1; + vsi_size_t elementCnt = 1; #define VSI_NN_MAX_IMAGE_WIDTH (65536) for(i = 0; i < dims_num; ++i) @@ -157,12 +157,12 @@ DEF_KERNEL_INITIALIZER(_gather_initializer) }; vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; - vsi_int_array_t * input1_shape = NULL; + vsi_size_array_t * input1_shape = NULL; int32_t block_size = 0; int32_t block_num = 0; - int32_t indices_num = 1; - uint32_t input_dims1 = 0; - vx_uint32 i = 0; + vsi_ssize_t indices_num = 1; + size_t input_dims1 = 0; + size_t i = 0; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); @@ -175,7 +175,7 @@ DEF_KERNEL_INITIALIZER(_gather_initializer) CHECK_STATUS_FAIL_GOTO(status, final ); input1_shape = attr[1]->shape; - input_dims1 = (uint32_t)input1_shape->size; + input_dims1 = input1_shape->size; for (i = 0; i < input_dims1; i++) { indices_num *= input1_shape->data[i]; @@ -270,7 +270,7 @@ static vsi_nn_kernel_node_t _setup vsi_status status = VSI_FAILURE; vsi_nn_kernel_node_param_t node_params[_GATHER_PARAM_NUM] = {NULL}; vsi_nn_kernel_node_t node = NULL; - int32_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}}; + vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}}; int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); int32_t block_num = vsi_nn_kernel_param_get_int32( params, "block_num" ); int32_t axis_num = vsi_nn_kernel_param_get_int32( params, "axis_num" ); @@ -284,7 +284,7 @@ static vsi_nn_kernel_node_t _setup return NULL; } - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/cl/gather_nd_cl.c b/src/tim/vx/internal/src/kernel/cl/gather_nd_cl.c index 1927146..af79d59 100644 --- a/src/tim/vx/internal/src/kernel/cl/gather_nd_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/gather_nd_cl.c @@ -99,7 +99,7 @@ static vx_param_description_t _gather_nd_kernel_param_def[] = static vsi_status cal_gather_nd_tensor_reshape_size ( vsi_nn_tensor_t ** inputs, - int32_t sizes[VSI_NN_MAX_DIM_NUM], + vsi_size_t sizes[VSI_NN_MAX_DIM_NUM], uint32_t block_size, uint32_t coordDim, int32_t* newDim @@ -107,9 +107,9 @@ static vsi_status cal_gather_nd_tensor_reshape_size { vsi_status status = VSI_FAILURE; uint32_t dims_num = inputs[0]->attr.dim_num; - uint32_t *input_size = inputs[0]->attr.size; + vsi_size_t *input_size = inputs[0]->attr.size; uint32_t i = 0; - uint32_t elementCnt = 1; + vsi_size_t elementCnt = 1; #define VSI_NN_MAX_IMAGE_WIDTH (65536) newDim[0] = 0; @@ -181,7 +181,7 @@ DEF_KERNEL_INITIALIZER(_gather_nd_initializer) vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; int32_t block_size = 0; - int32_t indices_num = 1; + vsi_ssize_t indices_num = 1; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); @@ -283,7 +283,7 @@ static vsi_nn_kernel_node_t _setup vsi_status status = VSI_FAILURE; vsi_nn_kernel_node_param_t node_params[_GATHER_ND_PARAM_NUM] = {NULL}; vsi_nn_kernel_node_t node = NULL; - int32_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}}; + vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}}; int32_t block_size = vsi_nn_kernel_param_get_int32( params, 
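/*
 * Sketch of the indices count the gather initializer above now accumulates
 * in a signed size type: a plain product over the indices tensor's shape.
 * vsi_size_t/vsi_ssize_t are approximated with size_t/ptrdiff_t here.
 */
#include <stddef.h>

typedef size_t    idx_size_t;           /* stand-in for vsi_size_t */
typedef ptrdiff_t idx_ssize_t;          /* stand-in for vsi_ssize_t */

static idx_ssize_t shape_element_count(const idx_size_t *shape, size_t rank)
{
    idx_ssize_t count = 1;
    size_t i;
    for (i = 0; i < rank; i++)
    {
        count *= (idx_ssize_t)shape[i];
    }
    return count;
}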
"block_size" ); int32_t coord_dim = vsi_nn_kernel_param_get_int32( params, "coord_dim" ); int32_t rs_in_dim = 0, rs_idx_dim = 0, rs_out_dim = 0; @@ -296,7 +296,7 @@ static vsi_nn_kernel_node_t _setup return NULL; } - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/cl/group_normalization_cl.c b/src/tim/vx/internal/src/kernel/cl/group_normalization_cl.c index f4ecf0e..52110c8 100644 --- a/src/tim/vx/internal/src/kernel/cl/group_normalization_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/group_normalization_cl.c @@ -217,9 +217,9 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sum_sqr_initializer) }; vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; - vsi_int_array_t * input_shape = NULL; - int32_t width = 0; - int32_t chn = 0; + vsi_size_array_t * input_shape = NULL; + vsi_ssize_t width = 0; + vsi_ssize_t chn = 0; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); @@ -274,7 +274,7 @@ DEF_KERNEL_INITIALIZER(_groupnorm_mean_vari_initializer) }; vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; - int32_t chn = 0; + vsi_ssize_t chn = 0; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); @@ -320,10 +320,10 @@ DEF_KERNEL_INITIALIZER(_groupnorm_initializer) }; vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; - vsi_int_array_t * input_shape = NULL; - int32_t width = 0; - int32_t height = 0; - int32_t chn = 0; + vsi_size_array_t * input_shape = NULL; + vsi_ssize_t width = 0; + vsi_ssize_t height = 0; + vsi_ssize_t chn = 0; int32_t is2D = 0; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); @@ -448,15 +448,15 @@ static vsi_status _query_kernel static int32_t _optimize_gn_shape_cl ( vsi_nn_tensor_t ** inputs, - int32_t group_size, + vsi_size_t group_size, int32_t group_num, - int32_t* opt_shape, + vsi_size_t* opt_shape, int32_t* is2D_flg ) { vsi_status status = VSI_SUCCESS; - int32_t group_shape[VSI_NN_MAX_DIM_NUM] = {0}; - int32_t new_rank = 0; + vsi_size_t group_shape[VSI_NN_MAX_DIM_NUM] = {0}; + vsi_size_t new_rank = 0; group_shape[0] = inputs[0]->attr.size[0]; group_shape[1] = inputs[0]->attr.size[1]; group_shape[2] = group_size; @@ -510,17 +510,17 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_t * ikernels[INTERNAL_KERNEL_SIZE] = { NULL }; vsi_nn_tensor_t * tensors[INTERNAL_KERNEL_SIZE] = { NULL }; vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL; - int32_t new_shape[VSI_NN_MAX_DIM_NUM] = { 1, 1, 1, 1 }; + vsi_size_t new_shape[VSI_NN_MAX_DIM_NUM] = { 1, 1, 1, 1 }; int32_t is2D_flg = 0; uint32_t hashkeys[INTERNAL_KERNEL_SIZE] = { 0 }; uint32_t hashkey = 0; int32_t i = 0; float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); int32_t group_num = vsi_nn_kernel_param_get_int32( params, "group_num" ); - int32_t group_size = inputs[0]->attr.size[2] / group_num; + vsi_size_t group_size = inputs[0]->attr.size[2] / group_num; - int32_t width = inputs[0]->attr.size[0]; - int32_t height = inputs[0]->attr.size[1]; + vsi_size_t width = inputs[0]->attr.size[0]; + vsi_size_t height = inputs[0]->attr.size[1]; int32_t group_stride = 1; float input_zp = 0; float input_scale = 1.0f; @@ -531,7 +531,7 @@ static vsi_nn_kernel_node_t _setup float rSpaceOrg = 1.0f / (width * height); float group_ratio = 1.0f / 
(inputs[0]->attr.size[0] * inputs[0]->attr.size[1] * group_size); - if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; @@ -547,7 +547,7 @@ static vsi_nn_kernel_node_t _setup width = new_shape[0]; height = is2D_flg > 0 ? 1 : new_shape[1]; - group_stride = ((width + 15) / 16) * 4; + group_stride = (int32_t)(((width + 15) / 16) * 4); if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) { @@ -693,7 +693,7 @@ static vsi_nn_kernel_node_t _setup int32_t pStride = 0; if (!is2D_flg) { - pStride = inputs[1]->attr.size[0] / new_shape[1]; + pStride = (int32_t)(inputs[1]->attr.size[0] / new_shape[1]); rSpaceOrg = 1.0f / (new_shape[0] / pStride); } node_params[index++] = rs_input; diff --git a/src/tim/vx/internal/src/kernel/cl/instance_normalization_cl.c b/src/tim/vx/internal/src/kernel/cl/instance_normalization_cl.c index fe470a0..8b78ced 100644 --- a/src/tim/vx/internal/src/kernel/cl/instance_normalization_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/instance_normalization_cl.c @@ -196,11 +196,11 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer) }; vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; - vsi_int_array_t * input_shape = NULL; + vsi_size_array_t * input_shape = NULL; int32_t rsFlg = 0; - int32_t width = 0; - int32_t height = 0; - int32_t chn = 0; + vsi_ssize_t width = 0; + vsi_ssize_t height = 0; + vsi_ssize_t chn = 0; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); @@ -263,11 +263,11 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) }; vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; - vsi_int_array_t * input_shape = NULL; + vsi_size_array_t * input_shape = NULL; int32_t rsFlg = 0; - int32_t width = 0; - int32_t height = 0; - int32_t chn = 0; + vsi_ssize_t width = 0; + vsi_ssize_t height = 0; + vsi_ssize_t chn = 0; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); @@ -410,9 +410,9 @@ static vsi_nn_kernel_node_t _setup float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); int32_t reshape_flg = vsi_nn_kernel_param_get_int32( params, "reshape_flg" ); - int32_t width = inputs[0]->attr.size[0]; - int32_t height = inputs[0]->attr.size[1]; - int32_t group_num = (width + 15) / 16; + size_t width = inputs[0]->attr.size[0]; + size_t height = inputs[0]->attr.size[1]; + int32_t group_num = (int32_t)(width + 15) / 16; int32_t input_zp = 0; float input_scale = 1.0f; int32_t input_fl = 0; @@ -460,7 +460,7 @@ static vsi_nn_kernel_node_t _setup output_zp = 0; } - if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; @@ -504,7 +504,7 @@ static vsi_nn_kernel_node_t _setup if (reshape_flg) { - int32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {0}; shape[0] = inputs[0]->attr.size[0]; shape[1] = inputs[0]->attr.size[1] * inputs[0]->attr.size[2]; shape[2] = 1; @@ -519,7 +519,7 @@ static vsi_nn_kernel_node_t _setup } if (inputs[1]->attr.dim_num < 2) { - int32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {0}; shape[0] = inputs[1]->attr.size[0]; shape[1] = 1; shape[2] = 1; @@ -528,7 +528,7 @@ static vsi_nn_kernel_node_t _setup } if (inputs[2]->attr.dim_num < 2) 
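/*
 * Sketch of the ceil-division arithmetic behind group_stride in the
 * group_norm setup above (instance_norm computes the analogous group_num):
 * the row is reduced in chunks of 16 work-items, and the intermediate buffer
 * stride is that chunk count times 4. What the factor of 4 corresponds to is
 * inferred from the surrounding code, not stated in the patch.
 */
#include <stdint.h>
#include <stddef.h>

typedef size_t vsi_size_t;              /* stand-in for the library type */

static int32_t partial_sum_stride(vsi_size_t width)
{
    vsi_size_t chunks = (width + 15) / 16;   /* ceil(width / 16) */
    return (int32_t)(chunks * 4);
}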
{ - int32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {0}; shape[0] = inputs[2]->attr.size[0]; shape[1] = 1; shape[2] = 1; diff --git a/src/tim/vx/internal/src/kernel/cl/l2normalizescale_cl.c b/src/tim/vx/internal/src/kernel/cl/l2normalizescale_cl.c index 8d4d7b3..2250a8d 100644 --- a/src/tim/vx/internal/src/kernel/cl/l2normalizescale_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/l2normalizescale_cl.c @@ -114,7 +114,7 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer) }; int32_t axis = 0; vsi_nn_kernel_tensor_attr_t *output_attr = NULL; - vsi_int_array_t * output_shape = NULL; + vsi_size_array_t * output_shape = NULL; output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); @@ -258,7 +258,7 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t node = NULL; vsi_bool image_2d = FALSE; int32_t axis = 0; - int32_t axis_size = 0; + vsi_size_t axis_size = 0; float outputScale = outputs[0]->attr.dtype.scale == 0.0f ? 1.0f : outputs[0]->attr.dtype.scale; float outputTail = (float)outputs[0]->attr.dtype.zero_point; float inputScale = inputs[0]->attr.dtype.scale == 0.0f ? 1.0f : inputs[0]->attr.dtype.scale; @@ -272,9 +272,9 @@ static vsi_nn_kernel_node_t _setup axis = vsi_nn_kernel_param_get_int32(params, "axis"); - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num ) - || !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + || !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) || axis > 2) { diff --git a/src/tim/vx/internal/src/kernel/cl/layer_normalization_cl.c b/src/tim/vx/internal/src/kernel/cl/layer_normalization_cl.c index 166f779..7824a1e 100644 --- a/src/tim/vx/internal/src/kernel/cl/layer_normalization_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/layer_normalization_cl.c @@ -118,10 +118,10 @@ DEF_KERNEL_INITIALIZER(_layernorm_initializer) }; vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; - vsi_int_array_t * input_shape = NULL; + vsi_size_array_t * input_shape = NULL; //int32_t width = 0; - int32_t height = 0; - int32_t chn = 0; + vsi_ssize_t height = 0; + vsi_ssize_t chn = 0; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); @@ -231,8 +231,8 @@ static vsi_nn_kernel_node_t _setup float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); - int32_t width = inputs[0]->attr.size[0]; - int32_t height = inputs[0]->attr.size[1]; + vsi_size_t width = inputs[0]->attr.size[0]; + vsi_size_t height = inputs[0]->attr.size[1]; int32_t input_fl = 0; float input_zp = 0.0f; float input_scale = 1.0f; @@ -288,7 +288,7 @@ static vsi_nn_kernel_node_t _setup zp2ScaleE2 = input_zp * 2 * e2InScale; sumZpScaleE2 = width * input_zp * input_zp * e2InScale; - if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; @@ -302,7 +302,7 @@ static vsi_nn_kernel_node_t _setup if (inputs[1]->attr.dim_num < 2) { - int32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {0}; shape[0] = inputs[1]->attr.size[0]; shape[1] = 1; shape[2] = 1; @@ -311,7 +311,7 @@ static vsi_nn_kernel_node_t _setup } if (inputs[2]->attr.dim_num < 2) { - int32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; + vsi_size_t 
shape[VSI_NN_MAX_DIM_NUM] = {0}; shape[0] = inputs[2]->attr.size[0]; shape[1] = 1; shape[2] = 1; diff --git a/src/tim/vx/internal/src/kernel/cl/log_softmax_cl.c b/src/tim/vx/internal/src/kernel/cl/log_softmax_cl.c index 8d1e439..81b0d1b 100644 --- a/src/tim/vx/internal/src/kernel/cl/log_softmax_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/log_softmax_cl.c @@ -145,7 +145,7 @@ DEF_KERNEL_INITIALIZER(_log_softmax_initializer) vsi_status status = VSI_FAILURE; vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; - vsi_int_array_t * out_shape = NULL; + vsi_size_array_t * out_shape = NULL; int32_t axis = 0; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); @@ -251,7 +251,7 @@ static vsi_nn_kernel_node_t _setup scaleValue = scaleValue * beta * inputScale; beta = beta * inputScale; - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num ) || axis > 2) { diff --git a/src/tim/vx/internal/src/kernel/cl/logical_not_cl.c b/src/tim/vx/internal/src/kernel/cl/logical_not_cl.c index 1e0780e..bf63043 100644 --- a/src/tim/vx/internal/src/kernel/cl/logical_not_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/logical_not_cl.c @@ -108,7 +108,7 @@ DEF_KERNEL_INITIALIZER(_logical_not_initializer) vx_status status = VX_FAILURE; vx_tensor output = (vx_tensor)param[1]; vsi_nn_kernel_tensor_attr_t *output_attr = NULL; - vsi_int_array_t *output_shape = NULL; + vsi_size_array_t *output_shape = NULL; output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); @@ -213,7 +213,7 @@ static vsi_nn_kernel_node_t _setup vsi_bool image_2d = FALSE; vsi_nn_kernel_node_t node = NULL; - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/cl/logical_ops_cl.c b/src/tim/vx/internal/src/kernel/cl/logical_ops_cl.c index c02e1c1..d21317c 100644 --- a/src/tim/vx/internal/src/kernel/cl/logical_ops_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/logical_ops_cl.c @@ -114,7 +114,7 @@ DEF_KERNEL_INITIALIZER(_logical_ops_initializer) vx_status status = VX_FAILURE; vx_tensor output = (vx_tensor)param[2]; vsi_nn_kernel_tensor_attr_t *output_attr = NULL; - vsi_int_array_t *output_shape = NULL; + vsi_size_array_t *output_shape = NULL; output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); @@ -230,7 +230,7 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t node = NULL; uint32_t ops_type = vsi_nn_kernel_param_get_int32( params, "ops_type" ); - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c b/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c index 5ccc69e..e28a548 100644 --- a/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/matrixmul_cl.c @@ -140,9 +140,9 @@ DEF_KERNEL_INITIALIZER(_matrixmul_initializer) }; vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; - int32_t width = 0; - int32_t height = 0; - int32_t chn = 0; + vsi_ssize_t width = 0; + vsi_ssize_t height = 0; + vsi_ssize_t chn = 0; attr[0] = 
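/*
 * The instance_norm and layer_norm setups above pad the 1-D scale/bias
 * tensors with trailing 1s before reshaping, so they can be bound with the
 * same rank as the data tensor. A minimal sketch of that padding; the helper
 * name and the max-dim value are assumptions made for the example.
 */
#include <stddef.h>

typedef size_t vsi_size_t;              /* stand-in for the library type */
#define MAX_DIM_NUM 8                   /* assumed upper bound for the sketch */

static void pad_shape_to_rank(const vsi_size_t *shape, size_t rank,
                              vsi_size_t out[MAX_DIM_NUM], size_t target_rank)
{
    size_t i;
    for (i = 0; i < target_rank && i < MAX_DIM_NUM; i++)
    {
        out[i] = (i < rank) ? shape[i] : 1;   /* missing dims become 1 */
    }
}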
vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); @@ -178,7 +178,7 @@ static vsi_status _query_kernel vsi_nn_kernel_t * kernel, vsi_nn_tensor_t * const * const inputs, vsi_nn_tensor_t * const * const outputs, - int32_t depth, + vsi_size_t depth, int32_t transa ) { @@ -247,10 +247,10 @@ static vsi_nn_kernel_node_t _setup int32_t transposeA = vsi_nn_kernel_param_get_int32( params, "transposeA" ); int32_t transposeB = vsi_nn_kernel_param_get_int32( params, "transposeB" ); int32_t transFlg = 0; - uint32_t M = inputs[0]->attr.size[1]; - uint32_t K = inputs[0]->attr.size[0]; - uint32_t N = inputs[1]->attr.size[0]; - uint32_t depth = outputs[0]->attr.dim_num > 2 ? outputs[0]->attr.size[2] : 1; + vsi_size_t M = inputs[0]->attr.size[1]; + vsi_size_t K = inputs[0]->attr.size[0]; + vsi_size_t N = inputs[1]->attr.size[0]; + vsi_size_t depth = outputs[0]->attr.dim_num > 2 ? outputs[0]->attr.size[2] : 1; uint32_t ac2zero = 0; uint32_t bc2zero = 0; float scale_a = 1.0f; @@ -260,7 +260,7 @@ static vsi_nn_kernel_node_t _setup float scale_out = 1.0f; float zp_out = 0; - if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/cl/maximum_cl.c b/src/tim/vx/internal/src/kernel/cl/maximum_cl.c index d4ad975..98a175f 100644 --- a/src/tim/vx/internal/src/kernel/cl/maximum_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/maximum_cl.c @@ -137,7 +137,7 @@ DEF_KERNEL_INITIALIZER(_maximum_initializer) vsi_status status = VSI_FAILURE; vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; - vsi_int_array_t * out_shape = NULL; + vsi_size_array_t * out_shape = NULL; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); @@ -248,7 +248,7 @@ static vsi_nn_kernel_node_t _setup outputScale = vsi_abs(outputScale) < 1e-5 ? 0.0f : 1.0f / outputScale; - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/cl/minimum_cl.c b/src/tim/vx/internal/src/kernel/cl/minimum_cl.c index d4f05cb..a730f0b 100644 --- a/src/tim/vx/internal/src/kernel/cl/minimum_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/minimum_cl.c @@ -136,7 +136,7 @@ DEF_KERNEL_INITIALIZER(_minimum_initializer) vsi_status status = VSI_FAILURE; vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; - vsi_int_array_t * out_shape = NULL; + vsi_size_array_t * out_shape = NULL; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); @@ -247,7 +247,7 @@ static vsi_nn_kernel_node_t _setup outputScale = vsi_abs(outputScale) < 1e-5 ? 
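/*
 * The matrixmul setup above reads its GEMM dimensions straight from the
 * shape arrays, where size[0] is the innermost (fastest-varying) dimension.
 * A sketch of that extraction with a local stand-in for the tensor attr;
 * the struct is illustrative, not vsi_nn_tensor_attr_t.
 */
#include <stddef.h>

typedef size_t vsi_size_t;              /* stand-in for the library type */

typedef struct
{
    const vsi_size_t *size;
    size_t            dim_num;
} shape_view_t;

static void gemm_dims(shape_view_t a, shape_view_t b, shape_view_t out,
                      vsi_size_t *M, vsi_size_t *K, vsi_size_t *N,
                      vsi_size_t *depth)
{
    *M = a.size[1];                               /* rows of A */
    *K = a.size[0];                               /* shared dimension */
    *N = b.size[0];                               /* columns of B */
    *depth = (out.dim_num > 2) ? out.size[2] : 1; /* batch depth */
}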
0.0f : 1.0f / outputScale; - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/cl/moments_cl.c b/src/tim/vx/internal/src/kernel/cl/moments_cl.c index 59e3efa..0a04c13 100644 --- a/src/tim/vx/internal/src/kernel/cl/moments_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/moments_cl.c @@ -98,24 +98,19 @@ typedef struct static const _kernel_map_type moments_map[] = { // Register kernel here - TENSOR_MOMENTS_KERNELS(U8, F16, 0, KERNEL_SOURCE_1) - TENSOR_MOMENTS_KERNELS(F16, F16, 0, KERNEL_SOURCE_1) + TENSOR_MOMENTS_KERNELS(U8, F32, 0, KERNEL_SOURCE_1) TENSOR_MOMENTS_KERNELS(F32, F32, 0, KERNEL_SOURCE_1) TENSOR_MOMENTS_KERNELS(I32, F32, 0, KERNEL_SOURCE_1) - TENSOR_MOMENTS_KERNELS(U8, F16, 1, KERNEL_SOURCE_2) - TENSOR_MOMENTS_KERNELS(F16, F16, 1, KERNEL_SOURCE_2) + TENSOR_MOMENTS_KERNELS(U8, F32, 1, KERNEL_SOURCE_2) TENSOR_MOMENTS_KERNELS(F32, F32, 1, KERNEL_SOURCE_2) TENSOR_MOMENTS_KERNELS(I32, F32, 1, KERNEL_SOURCE_2) - TENSOR_MOMENTS_KERNELS(U8, F16, 2, KERNEL_SOURCE_3) - TENSOR_MOMENTS_KERNELS(F16, F16, 2, KERNEL_SOURCE_3) + TENSOR_MOMENTS_KERNELS(U8, F32, 2, KERNEL_SOURCE_3) TENSOR_MOMENTS_KERNELS(F32, F32, 2, KERNEL_SOURCE_3) TENSOR_MOMENTS_KERNELS(I32, F32, 2, KERNEL_SOURCE_3) - TENSOR_MOMENTS_TWO_AXIS_KERNELS(U8, F16, 0, 1, KERNEL_SOURCE_4) - TENSOR_MOMENTS_TWO_AXIS_KERNELS(F16, F16, 0, 1, KERNEL_SOURCE_4) + TENSOR_MOMENTS_TWO_AXIS_KERNELS(U8, F32, 0, 1, KERNEL_SOURCE_4) TENSOR_MOMENTS_TWO_AXIS_KERNELS(F32, F32, 0, 1, KERNEL_SOURCE_4) TENSOR_MOMENTS_TWO_AXIS_KERNELS(I32, F32, 0, 1, KERNEL_SOURCE_4) - TENSOR_MOMENTS_THREE_AXIS_KERNELS(U8, F16, 0, 1, 2, KERNEL_SOURCE_5) - TENSOR_MOMENTS_THREE_AXIS_KERNELS(F16, F16, 0, 1, 2, KERNEL_SOURCE_5) + TENSOR_MOMENTS_THREE_AXIS_KERNELS(U8, F32, 0, 1, 2, KERNEL_SOURCE_5) TENSOR_MOMENTS_THREE_AXIS_KERNELS(F32, F32, 0, 1, 2, KERNEL_SOURCE_5) TENSOR_MOMENTS_THREE_AXIS_KERNELS(I32, F32, 0, 1, 2, KERNEL_SOURCE_5) }; @@ -160,13 +155,13 @@ static int32_t set_constant_border static int32_t get_moments_output_reshape_size ( vsi_nn_tensor_t ** outputs, - int32_t sizes[VSI_NN_MAX_DIM_NUM], + vsi_size_t sizes[VSI_NN_MAX_DIM_NUM], int32_t* axis, int32_t axis_num ) { uint32_t out_dims_num = outputs[0]->attr.dim_num; - uint32_t *output_size = outputs[0]->attr.size; + vsi_size_t *output_size = outputs[0]->attr.size; uint32_t i = 0; int32_t out_rs_flg = 0; @@ -217,10 +212,10 @@ DEF_KERNEL_INITIALIZER(_moments_initializer) }; vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; - vsi_int_array_t * input_shape = NULL; - int32_t width = 0; - int32_t height = 0; - int32_t chn = 0; + vsi_size_array_t * input_shape = NULL; + vsi_ssize_t width = 0; + vsi_ssize_t height = 0; + vsi_ssize_t chn = 0; int32_t axis = 0; int32_t axis_num = 1; @@ -311,6 +306,15 @@ static vsi_status _query_kernel input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + if (input0_dtype == I8 || input0_dtype == I16) + { + input0_dtype = I32; + } + else if (input0_dtype == F16) + { + input0_dtype = F32; + } + output_dtype = output_dtype == F16 ? 
F32 : output_dtype; key = HASH_MOMENTS_KEY( input0_dtype, output_dtype, axis_num, axis[0], axis[1], axis[2], rs_flg ); for( i = 0; i < _cnt_of_array(moments_map); i ++ ) @@ -353,8 +357,8 @@ static vsi_nn_kernel_node_t _setup vsi_status status = VSI_FAILURE; vsi_nn_kernel_node_param_t node_params[_MOMENTS_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_t node = NULL; - int32_t out_shape[VSI_NN_MAX_DIM_NUM] = {0}; - int32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; + vsi_size_t out_shape[VSI_NN_MAX_DIM_NUM] = {0}; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {0}; int32_t out_rs_flg = 0; int32_t axis_num = 0; size_t axis_num_temp = 0; @@ -365,9 +369,9 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_scalar_t scalar_list[INTERNAL_MOMENTS_SCALAR_NUM] = {NULL}; vsi_nn_kernel_tensor_t reshape_tensors[3] = { NULL }; - int32_t width = inputs[0]->attr.size[0]; - int32_t height = inputs[0]->attr.size[1]; - int32_t chn = inputs[0]->attr.size[2]; + vsi_size_t width = inputs[0]->attr.size[0]; + vsi_size_t height = inputs[0]->attr.size[1]; + vsi_size_t chn = inputs[0]->attr.size[2]; int32_t input_zp = inputs[0]->attr.dtype.zero_point; float input_scale = inputs[0]->attr.dtype.scale; float dim_ratio = (float)1.0 / (float)(width * height); @@ -408,7 +412,7 @@ static vsi_nn_kernel_node_t _setup dim_ratio = (float)1.0 / (float)(width * height * chn); } - if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; @@ -449,6 +453,7 @@ static vsi_nn_kernel_node_t _setup if ( node ) { uint32_t index = 0; + int32_t constant_value = 0; /* Pass parameters to node. */ if (reshape_tensors[0]) { @@ -488,7 +493,12 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_tensor_release( &node_params[1] ); vsi_nn_kernel_tensor_release( &node_params[2] ); } - status = set_constant_border(node, inputs[0]->attr.dtype.zero_point); + + if (inputs[0]->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC) + { + constant_value = inputs[0]->attr.dtype.zero_point; + } + status = set_constant_border(node, constant_value); CHECK_STATUS(status); } } @@ -521,4 +531,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( moments, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/cl/one_hot_cl.c b/src/tim/vx/internal/src/kernel/cl/one_hot_cl.c index bfbb653..3aa26fd 100644 --- a/src/tim/vx/internal/src/kernel/cl/one_hot_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/one_hot_cl.c @@ -119,7 +119,7 @@ DEF_KERNEL_INITIALIZER(_one_hot_initializer) vsi_status status = VSI_FAILURE; vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; - vsi_int_array_t * in_shape = NULL; + vsi_size_array_t * in_shape = NULL; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); @@ -228,11 +228,11 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t node_params[_ONE_HOT_PARAM_NUM] = {NULL}; vsi_nn_kernel_node_t node = NULL; vsi_nn_tensor_t* rs_tensors[2] = { NULL }; - int32_t shape[2][VSI_NN_MAX_DIM_NUM] = {{ 0 }}; + vsi_size_t shape[2][VSI_NN_MAX_DIM_NUM] = {{ 0 }}; int32_t i = 0; - int32_t num_elements = vsi_nn_vxGetTensorElementNum(&inputs[0]->attr); - int32_t prefix_dim_size = 1; - int32_t suffix_dim_size = 0; + vsi_size_t num_elements = vsi_nn_vxGetTensorElementNum(&inputs[0]->attr); + vsi_size_t prefix_dim_size = 1; + vsi_size_t suffix_dim_size = 0; int32_t depth = vsi_nn_kernel_param_get_int32( params, "depth" ); 
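/*
 * Sketch of the padding-constant decision added for the moments node above:
 * only affine-asymmetric quantization carries a meaningful zero point, so
 * every other scheme (float, DFP, none) now pads the border with 0 instead
 * of reusing the zero point unconditionally. The enum is an illustrative
 * subset of the real quantization-type enum.
 */
#include <stdint.h>

typedef enum
{
    QNT_NONE = 0,
    QNT_DFP,
    QNT_AFFINE_ASYMMETRIC
} qnt_type_e;

static int32_t border_constant(qnt_type_e qnt, int32_t zero_point)
{
    return (qnt == QNT_AFFINE_ASYMMETRIC) ? zero_point : 0;
}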
vsi_nn_kernel_dtype_e out_dtype; uint32_t data[2] = {0}; @@ -270,11 +270,11 @@ static vsi_nn_kernel_node_t _setup shape[1][2] = prefix_dim_size; rs_tensors[0] = vsi_nn_reshape_tensor( graph, - inputs[0], (uint32_t*)shape[0], 2 ); + inputs[0], shape[0], 2 ); rs_tensors[1] = vsi_nn_reshape_tensor( graph, - outputs[0], (uint32_t*)shape[1], 3 ); + outputs[0], shape[1], 3 ); - if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)rs_tensors[1]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( rs_tensors[1]->attr.size, rs_tensors[1]->attr.dim_num ) ) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/cl/poolwithargmax_cl.c b/src/tim/vx/internal/src/kernel/cl/poolwithargmax_cl.c index 5c1363c..73b264e 100644 --- a/src/tim/vx/internal/src/kernel/cl/poolwithargmax_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/poolwithargmax_cl.c @@ -114,7 +114,7 @@ DEF_KERNEL_INITIALIZER(_poolwithargmax_initializer) vx_status status = VX_FAILURE; vx_tensor output = (vx_tensor)param[1]; vsi_nn_kernel_tensor_attr_t * attr_out = NULL; - vsi_int_array_t * out_shape = NULL; + vsi_size_array_t * out_shape = NULL; vsi_bool image_2d = FALSE; attr_out = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); @@ -262,11 +262,11 @@ static vsi_nn_kernel_node_t _setup return NULL; } - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num ) - || !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + || !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) - || !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[1]->attr.size, + || !vsi_nn_kernel_gpu_check_shape( outputs[1]->attr.size, outputs[1]->attr.dim_num )) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/cl/pow_cl.c b/src/tim/vx/internal/src/kernel/cl/pow_cl.c index dea0bb0..56c0097 100644 --- a/src/tim/vx/internal/src/kernel/cl/pow_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/pow_cl.c @@ -118,7 +118,7 @@ DEF_KERNEL_INITIALIZER(_pow_initializer) vsi_status status = VSI_FAILURE; vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; - vsi_int_array_t * out_shape = NULL; + vsi_size_array_t * out_shape = NULL; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); @@ -220,7 +220,7 @@ static vsi_nn_kernel_node_t _setup vsi_bool image_2d = FALSE; vsi_nn_kernel_node_t node = NULL; - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/cl/prelu_cl.c b/src/tim/vx/internal/src/kernel/cl/prelu_cl.c index 488eed9..7bbfbec 100644 --- a/src/tim/vx/internal/src/kernel/cl/prelu_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/prelu_cl.c @@ -134,7 +134,7 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer) vsi_status status = VSI_FAILURE; vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; - vsi_int_array_t * out_shape = NULL; + vsi_size_array_t * out_shape = NULL; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); @@ -236,8 +236,8 @@ static vsi_nn_kernel_node_t _setup vsi_bool image_2d = FALSE; vsi_nn_kernel_node_t node = NULL; vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; - int32_t shapes[3][VSI_NN_MAX_DIM_NUM] = { { 0 } }; - uint32_t new_rank = 0; + vsi_size_t 
shapes[3][VSI_NN_MAX_DIM_NUM] = { { 0 } }; + vsi_size_t new_rank = 0; vsi_bool ret; float input0Scale = inputs[0]->attr.dtype.scale; @@ -258,26 +258,26 @@ static vsi_nn_kernel_node_t _setup outputScale = vsi_abs(outputScale) < 1e-5 ? 0.0f : 1.0f / outputScale; ret = vsi_nn_kernel_optimize_eltwise_shape( - (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, - (int32_t *)inputs[1]->attr.size, inputs[1]->attr.dim_num, - (int32_t *)outputs[0]->attr.size, outputs[0]->attr.dim_num, + inputs[0]->attr.size, inputs[0]->attr.dim_num, + inputs[1]->attr.size, inputs[1]->attr.dim_num, + outputs[0]->attr.size, outputs[0]->attr.dim_num, shapes[0], shapes[1], shapes[2], &new_rank ); if (ret) { reshape_tensors[0] = vsi_nn_reshape_tensor( graph, - inputs[0], (uint32_t*)shapes[0], new_rank ); + inputs[0], shapes[0], (uint32_t)new_rank ); reshape_tensors[1] = vsi_nn_reshape_tensor( graph, - inputs[1], (uint32_t*)shapes[1], new_rank ); + inputs[1], shapes[1], (uint32_t)new_rank ); reshape_tensors[2] = vsi_nn_reshape_tensor( graph, - outputs[0], (uint32_t*)shapes[2], new_rank ); + outputs[0], shapes[2], (uint32_t)new_rank ); } else { return NULL; } - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)reshape_tensors[2]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[2]->attr.size, reshape_tensors[2]->attr.dim_num ) ) { goto final; diff --git a/src/tim/vx/internal/src/kernel/cl/random_multinomial_cl.c b/src/tim/vx/internal/src/kernel/cl/random_multinomial_cl.c index 4b518be..696303b 100644 --- a/src/tim/vx/internal/src/kernel/cl/random_multinomial_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/random_multinomial_cl.c @@ -151,7 +151,7 @@ DEF_KERNEL_INITIALIZER(_multinomial_initializer) {0, 0, 0} }; vsi_nn_kernel_tensor_attr_t * attr = NULL; - vsi_int_array_t * in_shape = NULL; + vsi_size_array_t * in_shape = NULL; attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr, "Create tensor attr buffer fail.", final ); @@ -193,8 +193,8 @@ DEF_KERNEL_INITIALIZER(_cdf_initializer) {0, 0, 0} }; vsi_nn_kernel_tensor_attr_t * attr = NULL; - vsi_int_array_t * in_shape = NULL; - uint32_t batch = 0; + vsi_size_array_t * in_shape = NULL; + vsi_size_t batch = 0; attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr, "Create tensor attr buffer fail.", final ); @@ -347,13 +347,13 @@ static vsi_nn_kernel_node_t _setup uint32_t hashkeys[INTERNAL_KERNEL_SIZE] = { 0 }; uint32_t hashkey = 0; int32_t i; - uint32_t iteration = (outputs[0]->attr.size[0] + 3) / 4; + uint32_t iteration = (uint32_t)((outputs[0]->attr.size[0] + 3) / 4); float rand_max = (float)(pow(2.0,32)); float re_rand_max = 1 / rand_max; // Check if gpu can support the size if( !vsi_nn_kernel_gpu_check_shape( - (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) + outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; } @@ -380,7 +380,7 @@ static vsi_nn_kernel_node_t _setup attr.size[1] = 1; attr.dim_num = 2; tensors[SEEDS_INDEX] = vsi_nn_reshape_tensor( graph, - inputs[1], (uint32_t*)attr.size, attr.dim_num ); + inputs[1], attr.size, attr.dim_num ); in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); diff --git a/src/tim/vx/internal/src/kernel/cl/reduceall_internal_cl.c b/src/tim/vx/internal/src/kernel/cl/reduceall_internal_cl.c index 09d16b8..9b92246 100644 --- a/src/tim/vx/internal/src/kernel/cl/reduceall_internal_cl.c +++ 
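/*
 * The maximum/minimum/prelu setups above all guard the reciprocal of the
 * output scale the same way: float tensors report scale == 0, so values
 * below the 1e-5 threshold map to 0 instead of dividing by zero (the zero is
 * presumably treated as "no rescaling" downstream; that part is an
 * assumption). The sketch only mirrors the guard itself.
 */
#include <math.h>

static float safe_reciprocal_scale(float scale)
{
    return (fabsf(scale) < 1e-5f) ? 0.0f : (1.0f / scale);
}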
b/src/tim/vx/internal/src/kernel/cl/reduceall_internal_cl.c @@ -103,7 +103,7 @@ DEF_KERNEL_INITIALIZER(_reduceall_internal_initializer) {0, 0, 0} }; vsi_nn_kernel_tensor_attr_t *output_attr = NULL; - vsi_int_array_t * output_shape = NULL; + vsi_size_array_t * output_shape = NULL; output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); @@ -214,9 +214,9 @@ static vsi_nn_kernel_node_t _setup axis = vsi_nn_kernel_param_get_int32(params, "axis"); - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num ) - || !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + || !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) || axis > 2) { diff --git a/src/tim/vx/internal/src/kernel/cl/reduceany_internal_cl.c b/src/tim/vx/internal/src/kernel/cl/reduceany_internal_cl.c index e2d6bf8..b347758 100644 --- a/src/tim/vx/internal/src/kernel/cl/reduceany_internal_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/reduceany_internal_cl.c @@ -103,7 +103,7 @@ DEF_KERNEL_INITIALIZER(_reduceany_internal_initializer) {0, 0, 0} }; vsi_nn_kernel_tensor_attr_t *output_attr = NULL; - vsi_int_array_t * output_shape = NULL; + vsi_size_array_t * output_shape = NULL; output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); @@ -214,9 +214,9 @@ static vsi_nn_kernel_node_t _setup axis = vsi_nn_kernel_param_get_int32(params, "axis"); - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num ) - || !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + || !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) || axis > 2) { diff --git a/src/tim/vx/internal/src/kernel/cl/reducemax_internal_cl.c b/src/tim/vx/internal/src/kernel/cl/reducemax_internal_cl.c index 6f112bd..3e3d4bd 100644 --- a/src/tim/vx/internal/src/kernel/cl/reducemax_internal_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/reducemax_internal_cl.c @@ -119,7 +119,7 @@ DEF_KERNEL_INITIALIZER(_reducemax_internal_initializer) {0, 0, 0} }; vsi_nn_kernel_tensor_attr_t *output_attr = NULL; - vsi_int_array_t * output_shape = NULL; + vsi_size_array_t * output_shape = NULL; output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); @@ -246,9 +246,9 @@ static vsi_nn_kernel_node_t _setup axis = vsi_nn_kernel_param_get_int32(params, "axis"); - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num ) - || !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + || !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) || axis > 2) { diff --git a/src/tim/vx/internal/src/kernel/cl/reducemin_internal_cl.c b/src/tim/vx/internal/src/kernel/cl/reducemin_internal_cl.c index fb8fd84..1658fa4 100644 --- a/src/tim/vx/internal/src/kernel/cl/reducemin_internal_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/reducemin_internal_cl.c @@ -117,7 +117,7 @@ DEF_KERNEL_INITIALIZER(_reducemin_internal_initializer) {0, 0, 0} }; vsi_nn_kernel_tensor_attr_t *output_attr = NULL; - vsi_int_array_t * 
output_shape = NULL; + vsi_size_array_t * output_shape = NULL; output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); @@ -235,9 +235,9 @@ static vsi_nn_kernel_node_t _setup axis = vsi_nn_kernel_param_get_int32(params, "axis"); - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num ) - || !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + || !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) || axis > 2) { diff --git a/src/tim/vx/internal/src/kernel/cl/reduceprod_internal_cl.c b/src/tim/vx/internal/src/kernel/cl/reduceprod_internal_cl.c index 8972e7b..b1feb05 100644 --- a/src/tim/vx/internal/src/kernel/cl/reduceprod_internal_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/reduceprod_internal_cl.c @@ -127,7 +127,7 @@ DEF_KERNEL_INITIALIZER(_reduceprod_internal_initializer) {0, 0, 0} }; vsi_nn_kernel_tensor_attr_t *output_attr = NULL; - vsi_int_array_t * output_shape = NULL; + vsi_size_array_t * output_shape = NULL; output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); @@ -258,9 +258,9 @@ static vsi_nn_kernel_node_t _setup axis = vsi_nn_kernel_param_get_int32(params, "axis"); - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num ) - || !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + || !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) || axis > 2) { diff --git a/src/tim/vx/internal/src/kernel/cl/relu_keras_cl.c b/src/tim/vx/internal/src/kernel/cl/relu_keras_cl.c index 31d3925..d08676c 100644 --- a/src/tim/vx/internal/src/kernel/cl/relu_keras_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/relu_keras_cl.c @@ -124,7 +124,7 @@ DEF_KERNEL_INITIALIZER(_relu_keras_initializer) {0, 0, 0} }; vsi_nn_kernel_tensor_attr_t * output_attr = NULL; - vsi_int_array_t * out_shape = NULL; + vsi_size_array_t * out_shape = NULL; output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); @@ -254,7 +254,7 @@ static vsi_nn_kernel_node_t _setup float threshold = vsi_nn_kernel_param_get_float32( params, "threshold" ); float offset = -alpha * threshold; - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num ) ) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/cl/repeat_cl.c b/src/tim/vx/internal/src/kernel/cl/repeat_cl.c index d133782..c2f28dd 100644 --- a/src/tim/vx/internal/src/kernel/cl/repeat_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/repeat_cl.c @@ -124,8 +124,8 @@ DEF_KERNEL_INITIALIZER(_repeat_initializer) }; vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; - vsi_int_array_t * input_shape = NULL; - int32_t height = 0, width = 0, chn = 0; + vsi_size_array_t * input_shape = NULL; + vsi_ssize_t height = 0, width = 0, chn = 0; int32_t is1d = 0; int32_t axis = 0; @@ -237,9 +237,9 @@ static int32_t _optimize_repeat_shape vsi_nn_tensor_t ** inputs, vsi_nn_tensor_t ** outputs, int32_t* axis, - int32_t* opt_shape_in, - int32_t* opt_shape_out, - int32_t* new_rank + vsi_size_t* opt_shape_in, + vsi_size_t* 
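/*
 * The reduce* setups above all gate on the same three conditions before
 * building a node. A compact sketch of that guard; the stub below stands in
 * for vsi_nn_kernel_gpu_check_shape so the example is self-contained.
 */
#include <stdint.h>
#include <stddef.h>

typedef size_t vsi_size_t;              /* stand-in for the library type */

static int gpu_check_shape_ok(const vsi_size_t *shape, vsi_size_t rank)
{
    (void)shape; (void)rank;            /* placeholder for the real check */
    return 1;
}

static int reduce_setup_supported(const vsi_size_t *in_shape,  vsi_size_t in_rank,
                                  const vsi_size_t *out_shape, vsi_size_t out_rank,
                                  int32_t axis)
{
    return gpu_check_shape_ok(in_shape, in_rank)
        && gpu_check_shape_ok(out_shape, out_rank)
        && axis <= 2;                   /* only the first three axes are handled */
}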
opt_shape_out, + vsi_size_t* new_rank ) { vsi_status status = VSI_SUCCESS; @@ -255,7 +255,7 @@ static int32_t _optimize_repeat_shape } else if (axis[0] == 3) { - vsi_nn_kernel_optimize_element_shape( (int32_t*)inputs[0]->attr.size, 3, opt_shape_in, new_rank ); + vsi_nn_kernel_optimize_element_shape( inputs[0]->attr.size, 3, opt_shape_in, new_rank ); if (opt_shape_in[1] == 1) { opt_shape_in[1] = inputs[0]->attr.size[3]; @@ -300,15 +300,15 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t node_params[_REPEAT_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_t node = NULL; vsi_nn_kernel_tensor_t rs_input = NULL, rs_input1 = NULL, rs_output = NULL; - int32_t new_shape[2][VSI_NN_MAX_DIM_NUM] = {{ 1, 1, 1, 1 }, { 1, 1, 1, 1 }}; - int32_t new_rank[2] = {0, 0}; + vsi_size_t new_shape[2][VSI_NN_MAX_DIM_NUM] = {{ 1, 1, 1, 1 }, { 1, 1, 1, 1 }}; + vsi_size_t new_rank[2] = {0, 0}; int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); - int32_t width = inputs[0]->attr.size[0]; - int32_t height = inputs[0]->attr.dim_num > 1 ? inputs[0]->attr.size[1] : 1; - int32_t channel = inputs[0]->attr.dim_num > 2 ? inputs[0]->attr.size[2] : 1; + vsi_size_t width = inputs[0]->attr.size[0]; + vsi_size_t height = inputs[0]->attr.dim_num > 1 ? inputs[0]->attr.size[1] : 1; + vsi_size_t channel = inputs[0]->attr.dim_num > 2 ? inputs[0]->attr.size[2] : 1; - if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/cl/resize_1d_bilinear_cl.c b/src/tim/vx/internal/src/kernel/cl/resize_1d_bilinear_cl.c index f8ff904..3a189f4 100644 --- a/src/tim/vx/internal/src/kernel/cl/resize_1d_bilinear_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/resize_1d_bilinear_cl.c @@ -114,7 +114,7 @@ DEF_KERNEL_INITIALIZER(_resize_1d_bilinear_initializer) {0, 0, 0} }; vsi_nn_kernel_tensor_attr_t * output_attr = NULL; - vsi_int_array_t * out_shape = NULL; + vsi_size_array_t * out_shape = NULL; output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); @@ -233,8 +233,8 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t node = NULL; int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" ); int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" ); - int32_t in_width = inputs[0]->attr.size[0]; - int32_t out_width = outputs[0]->attr.size[0]; + vsi_size_t in_width = inputs[0]->attr.size[0]; + vsi_size_t out_width = outputs[0]->attr.size[0]; float input_zp = (float)inputs[0]->attr.dtype.zero_point; float input_scale = inputs[0]->attr.dtype.scale; float input_tail = -(input_zp * input_scale); diff --git a/src/tim/vx/internal/src/kernel/cl/resize_1d_nearest_cl.c b/src/tim/vx/internal/src/kernel/cl/resize_1d_nearest_cl.c index 5b0f9a4..e406397 100644 --- a/src/tim/vx/internal/src/kernel/cl/resize_1d_nearest_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/resize_1d_nearest_cl.c @@ -115,7 +115,7 @@ DEF_KERNEL_INITIALIZER(_resize_1d_nearest_initializer) {0, 0, 0} }; vsi_nn_kernel_tensor_attr_t * output_attr = NULL; - vsi_int_array_t * out_shape = NULL; + vsi_size_array_t * out_shape = NULL; output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); @@ -233,8 +233,8 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t 
node = NULL; int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" ); int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" ); - int32_t in_width = inputs[0]->attr.size[0]; - int32_t out_width = outputs[0]->attr.size[0]; + vsi_size_t in_width = inputs[0]->attr.size[0]; + vsi_size_t out_width = outputs[0]->attr.size[0]; float input_zp = (float)inputs[0]->attr.dtype.zero_point; float input_scale = inputs[0]->attr.dtype.scale; float output_scale = (0 == outputs[0]->attr.dtype.scale) ? \ diff --git a/src/tim/vx/internal/src/kernel/cl/resize_bilinear_cl.c b/src/tim/vx/internal/src/kernel/cl/resize_bilinear_cl.c index 26f5051..320a6d9 100644 --- a/src/tim/vx/internal/src/kernel/cl/resize_bilinear_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/resize_bilinear_cl.c @@ -113,7 +113,7 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) {0, 0, 0} }; vsi_nn_kernel_tensor_attr_t * output_attr = NULL; - vsi_int_array_t * out_shape = NULL; + vsi_size_array_t * out_shape = NULL; output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); @@ -233,10 +233,10 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t node = NULL; int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" ); int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" ); - int32_t in_width = inputs[0]->attr.size[0]; - int32_t in_height = inputs[0]->attr.size[1]; - int32_t out_width = outputs[0]->attr.size[0]; - int32_t out_height = outputs[0]->attr.size[1]; + vsi_size_t in_width = inputs[0]->attr.size[0]; + vsi_size_t in_height = inputs[0]->attr.size[1]; + vsi_size_t out_width = outputs[0]->attr.size[0]; + vsi_size_t out_height = outputs[0]->attr.size[1]; float input_zp = (float)inputs[0]->attr.dtype.zero_point; float input_scale = inputs[0]->attr.dtype.scale; float input_tail = -(input_zp * input_scale); diff --git a/src/tim/vx/internal/src/kernel/cl/resize_nearest_cl.c b/src/tim/vx/internal/src/kernel/cl/resize_nearest_cl.c index b071fdf..588b527 100644 --- a/src/tim/vx/internal/src/kernel/cl/resize_nearest_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/resize_nearest_cl.c @@ -119,7 +119,7 @@ DEF_KERNEL_INITIALIZER(_resize_nearest_initializer) {0, 0, 0} }; vsi_nn_kernel_tensor_attr_t * output_attr = NULL; - vsi_int_array_t * out_shape = NULL; + vsi_size_array_t * out_shape = NULL; output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); @@ -237,10 +237,10 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t node = NULL; int32_t align_corners = vsi_nn_kernel_param_get_int32( params, "align_corners" ); int32_t half_pixel_centers = vsi_nn_kernel_param_get_int32( params, "half_pixel_centers" ); - int32_t in_width = inputs[0]->attr.size[0]; - int32_t in_height = inputs[0]->attr.size[1]; - int32_t out_width = outputs[0]->attr.size[0]; - int32_t out_height = outputs[0]->attr.size[1]; + vsi_size_t in_width = inputs[0]->attr.size[0]; + vsi_size_t in_height = inputs[0]->attr.size[1]; + vsi_size_t out_width = outputs[0]->attr.size[0]; + vsi_size_t out_height = outputs[0]->attr.size[1]; float input_zp = (float)inputs[0]->attr.dtype.zero_point; float input_scale = inputs[0]->attr.dtype.scale; float output_scale = (0 == outputs[0]->attr.dtype.scale) ? 
\ diff --git a/src/tim/vx/internal/src/kernel/cl/roi_align_cl.c b/src/tim/vx/internal/src/kernel/cl/roi_align_cl.c index 3f4402a..47c896c 100644 --- a/src/tim/vx/internal/src/kernel/cl/roi_align_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/roi_align_cl.c @@ -119,8 +119,8 @@ DEF_KERNEL_INITIALIZER(_roi_align_initializer) }; vsi_nn_kernel_tensor_attr_t * rois_attr = NULL; vsi_nn_kernel_tensor_attr_t * output_attr = NULL; - vsi_int_array_t * rois_shape = NULL; - vsi_int_array_t * out_shape = NULL; + vsi_size_array_t * rois_shape = NULL; + vsi_size_array_t * out_shape = NULL; rois_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( rois_attr, "Create tensor attr buffer fail.", final ); @@ -235,7 +235,7 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t node = NULL; vsi_bool image_2d = FALSE; uint32_t rank[_IO_NUM] = {0}; - int32_t shapes[_IO_NUM][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; + vsi_size_t shapes[_IO_NUM][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; vsi_nn_tensor_t* reshape_tensors[_IO_NUM] = { NULL }; int32_t i = 0; float width_ratio = vsi_nn_kernel_param_get_float32( params, "width_ratio" ); @@ -250,26 +250,26 @@ static vsi_nn_kernel_node_t _setup float rcp_of_out_height = 1.0f / (float)(outputs[0]->attr.size[1]); float sampling_x_ratio = width_sample_num > 0 ? (float)width_sample_num : 0; float sampling_y_ratio = height_sample_num > 0 ? (float)height_sample_num : 0; - int depth = inputs[0]->attr.size[2]; + vsi_size_t depth = inputs[0]->attr.size[2]; - vsi_nn_kernel_optimize_nchw2xhw_shape( (const int32_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num, + vsi_nn_kernel_optimize_nchw2xhw_shape( (const vsi_size_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num, shapes[0], &rank[0]); - vsi_nn_kernel_optimize_1d_tensor_shape( (const int32_t*)inputs[1]->attr.size, inputs[1]->attr.dim_num, + vsi_nn_kernel_optimize_1d_tensor_shape( (const vsi_size_t*)inputs[1]->attr.size, inputs[1]->attr.dim_num, shapes[1], &rank[1]); - vsi_nn_kernel_optimize_1d_tensor_shape( (const int32_t*)inputs[2]->attr.size, inputs[2]->attr.dim_num, + vsi_nn_kernel_optimize_1d_tensor_shape( (const vsi_size_t*)inputs[2]->attr.size, inputs[2]->attr.dim_num, shapes[2], &rank[2]); - vsi_nn_kernel_optimize_nchw2xhw_shape( (const int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num, + vsi_nn_kernel_optimize_nchw2xhw_shape( (const vsi_size_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num, shapes[3], &rank[3]); for (i = 0; i < _INPUT_NUM; i++) { reshape_tensors[i] = vsi_nn_reshape_tensor( graph, - inputs[i], (uint32_t*)shapes[i], rank[i] ); + inputs[i], shapes[i], rank[i] ); } reshape_tensors[_INPUT_NUM] = vsi_nn_reshape_tensor( graph, - outputs[0], (uint32_t*)shapes[_INPUT_NUM], rank[_INPUT_NUM] ); + outputs[0], shapes[_INPUT_NUM], rank[_INPUT_NUM] ); - if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)reshape_tensors[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[0]->attr.size, inputs[0]->attr.dim_num ) ) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/cl/scatter_nd_cl.c b/src/tim/vx/internal/src/kernel/cl/scatter_nd_cl.c index 21f82b8..1eba1c2 100644 --- a/src/tim/vx/internal/src/kernel/cl/scatter_nd_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/scatter_nd_cl.c @@ -96,19 +96,19 @@ static vx_param_description_t _scatter_nd_kernel_param_def[] = static vsi_status cal_scatter_nd_tensor_reshape_size ( vsi_nn_tensor_t ** inputs, - int32_t sizes[VSI_NN_MAX_DIM_NUM], + vsi_size_t sizes[VSI_NN_MAX_DIM_NUM], uint32_t block_size, uint32_t coordDim, - uint32_t* width, - 
uint32_t* area, + vsi_size_t* width, + vsi_size_t* area, int32_t* newDim ) { vsi_status status = VSI_FAILURE; uint32_t dims_num = inputs[0]->attr.dim_num; - uint32_t *input_size = inputs[0]->attr.size; + vsi_size_t *input_size = inputs[0]->attr.size; uint32_t i = 0; - uint32_t elementCnt = 1; + vsi_size_t elementCnt = 1; if(coordDim != 0 && (width == NULL || area == NULL)) { @@ -180,8 +180,8 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_initializer) }; vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; - int32_t block_size = 0; - int32_t height = 0; + vsi_ssize_t block_size = 0; + vsi_ssize_t height = 0; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); @@ -280,12 +280,12 @@ static vsi_nn_kernel_node_t _setup vsi_status status = VSI_FAILURE; vsi_nn_kernel_node_param_t node_params[_SCATTER_ND_PARAM_NUM] = {NULL}; vsi_nn_kernel_node_t node = NULL; - int32_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}}; + vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}}; int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); int32_t coord_dim = vsi_nn_kernel_param_get_int32( params, "coord_dim" ); int32_t idx_num = vsi_nn_kernel_param_get_int32( params, "idx_num" ); int32_t rs_in_dim = 0, rs_idx_dim = 0, rs_out_dim = 0; - uint32_t width = 0, area = 0; + vsi_size_t width = 0, area = 0; status = cal_scatter_nd_tensor_reshape_size(&inputs[0], shapes[0], coord_dim, 0, NULL, NULL, &rs_in_dim); status |= cal_scatter_nd_tensor_reshape_size(&inputs[1], shapes[1], block_size, 0, NULL, NULL, &rs_idx_dim); @@ -296,7 +296,7 @@ static vsi_nn_kernel_node_t _setup return NULL; } - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/cl/scatter_nd_update_cl.c b/src/tim/vx/internal/src/kernel/cl/scatter_nd_update_cl.c index 4f2e8c7..2ab4a16 100644 --- a/src/tim/vx/internal/src/kernel/cl/scatter_nd_update_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/scatter_nd_update_cl.c @@ -88,20 +88,20 @@ static vx_param_description_t _scatter_nd_update_kernel_param_def[] = static vsi_status cal_scatter_nd_update_tensor_reshape_size ( vsi_nn_tensor_t ** inputs, - int32_t sizes[VSI_NN_MAX_DIM_NUM], + vsi_size_t sizes[VSI_NN_MAX_DIM_NUM], uint32_t block_size, uint32_t coordDim, - uint32_t* width, - uint32_t* area, - uint32_t* vol, + vsi_size_t* width, + vsi_size_t* area, + vsi_size_t* vol, int32_t* newDim ) { vsi_status status = VSI_FAILURE; uint32_t dims_num = inputs[0]->attr.dim_num; - uint32_t *input_size = inputs[0]->attr.size; + vsi_size_t *input_size = inputs[0]->attr.size; uint32_t i = 0; - uint32_t elementCnt = 1; + vsi_size_t elementCnt = 1; if (coordDim != 0 && (width == NULL || area == NULL)) { @@ -185,8 +185,8 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_initializer) }; vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; - int32_t block_size = 0; - int32_t height = 0; + vsi_ssize_t block_size = 0; + vsi_ssize_t height = 0; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); @@ -274,12 +274,12 @@ static vsi_nn_kernel_node_t _setup vsi_status status = VSI_FAILURE; vsi_nn_kernel_node_param_t node_params[_SCATTER_ND_UPDATE_PARAM_NUM] = {NULL}; vsi_nn_kernel_node_t node = NULL; - int32_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}}; + vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] 
= {{0}}; int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); int32_t coord_dim = vsi_nn_kernel_param_get_int32( params, "coord_dim" ); int32_t idx_num = vsi_nn_kernel_param_get_int32( params, "idx_num" ); int32_t rs_in_dim = 0, rs_idx_dim = 0, rs_out_dim = 0; - uint32_t width = 0, area = 0, vol = 0; + vsi_size_t width = 0, area = 0, vol = 0; int32_t offsetX = 0, offsetY = 0, offsetZ = 0, offsetW = 0, offset_idx = 0; status = cal_scatter_nd_update_tensor_reshape_size(&inputs[1], shapes[0], @@ -293,7 +293,7 @@ static vsi_nn_kernel_node_t _setup return NULL; } - if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; @@ -305,21 +305,21 @@ static vsi_nn_kernel_node_t _setup } if (coord_dim == 4 || coord_dim == 5) { - offsetX = vol; - offsetY = area; - offsetZ = width; + offsetX = (int32_t)vol; + offsetY = (int32_t)area; + offsetZ = (int32_t)width; offsetW = 1; } else if (coord_dim == 3) { - offsetX = area; - offsetY = width; + offsetX = (int32_t)area; + offsetY = (int32_t)width; offsetZ = 1; offsetW = 0; } else if (coord_dim == 2) { - offsetX = width; + offsetX = (int32_t)width; offsetY = 1; offsetZ = 0; offsetW = 0; diff --git a/src/tim/vx/internal/src/kernel/cl/select_cl.c b/src/tim/vx/internal/src/kernel/cl/select_cl.c index 42c0caa..9c00e23 100644 --- a/src/tim/vx/internal/src/kernel/cl/select_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/select_cl.c @@ -123,7 +123,7 @@ DEF_KERNEL_INITIALIZER(_select_initializer) vx_tensor output = (vx_tensor)param[3]; vsi_nn_kernel_tensor_attr_t *output_attr = NULL; - vsi_int_array_t *output_shape = NULL; + vsi_size_array_t *output_shape = NULL; output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output); CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); @@ -252,7 +252,7 @@ static vsi_nn_kernel_node_t _setup input0Tail = outputZP - input0Tail * input0Scale; input1Tail = outputZP - input1Tail * input1Scale; - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/cl/sequence_mask_cl.c b/src/tim/vx/internal/src/kernel/cl/sequence_mask_cl.c index 45e606e..58fed76 100644 --- a/src/tim/vx/internal/src/kernel/cl/sequence_mask_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/sequence_mask_cl.c @@ -114,7 +114,7 @@ DEF_KERNEL_INITIALIZER(_sequence_mask_initializer) vsi_status status = VSI_FAILURE; vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; - vsi_int_array_t * out_shape = NULL; + vsi_size_array_t * out_shape = NULL; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); @@ -194,14 +194,14 @@ static int32_t _optimize_mask_shape vsi_nn_tensor_t ** inputs, vsi_nn_tensor_t ** outputs, int32_t max_len, - int32_t* opt_shape_in, - int32_t* opt_shape_out, + vsi_size_t* opt_shape_in, + vsi_size_t* opt_shape_out, int32_t* is2Dflg ) { vsi_status status = VSI_SUCCESS; - int32_t in_shape[VSI_NN_MAX_DIM_NUM] = {0}; - int32_t new_rank = 0; + vsi_size_t in_shape[VSI_NN_MAX_DIM_NUM] = {0}; + vsi_size_t new_rank = 0; uint32_t i = 0; for(i = 0; i < inputs[0]->attr.dim_num; i++) @@ -242,7 +242,7 @@ static vsi_nn_kernel_node_t _setup vsi_status status = VSI_FAILURE; vsi_nn_kernel_node_param_t node_params[_CL_PARAM_NUM] = {NULL}; 
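For context on the casts above: width, area and vol are the collapsed strides returned by cal_scatter_nd_update_tensor_reshape_size, and the coord_dim branches simply hand them to the kernel as per-axis offsets. A minimal standalone sketch of how such offsets turn an index tuple into a flat element offset follows; the sketch_size_t typedef, the helper name and the coordinate order are assumptions made for illustration, not part of ovxlib or of this patch.

/* Illustrative sketch only -- not part of the patch. */
#include <stdint.h>
#include <stdio.h>

typedef uint64_t sketch_size_t;   /* stand-in for vsi_size_t */

static sketch_size_t flat_offset_4d(const int32_t coord[4],
                                    sketch_size_t vol,
                                    sketch_size_t area,
                                    sketch_size_t width)
{
    /* coord_dim == 4: strides are (vol, area, width, 1), matching the
     * offsetX, offsetY, offsetZ, offsetW values set in the kernel setup above. */
    return (sketch_size_t)coord[0] * vol
         + (sketch_size_t)coord[1] * area
         + (sketch_size_t)coord[2] * width
         + (sketch_size_t)coord[3];
}

int main(void)
{
    /* e.g. an update tensor collapsed to width = 8, area = 8*4, vol = 8*4*2 */
    const int32_t coord[4] = { 1, 0, 2, 5 };
    printf("flat offset = %llu\n",
           (unsigned long long)flat_offset_4d(coord, 64u, 32u, 8u));
    return 0;
}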
vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL; - int32_t new_shape[2][VSI_NN_MAX_DIM_NUM] = {{ 1, 1, 1, 1 }, { 1, 1, 1, 1 }}; + vsi_size_t new_shape[2][VSI_NN_MAX_DIM_NUM] = {{ 1, 1, 1, 1 }, { 1, 1, 1, 1 }}; int32_t max_len = vsi_nn_kernel_param_get_int32( params, "max_len" ); vsi_nn_kernel_node_t node = NULL; int32_t is2Dflg = 0; @@ -255,7 +255,7 @@ static vsi_nn_kernel_node_t _setup int32_t input_fl = 0; int32_t output_fl = 0; - if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/cl/signal_frame_cl.c b/src/tim/vx/internal/src/kernel/cl/signal_frame_cl.c index f73089c..7aee0e0 100644 --- a/src/tim/vx/internal/src/kernel/cl/signal_frame_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/signal_frame_cl.c @@ -93,7 +93,7 @@ DEF_KERNEL_INITIALIZER(_signal_frame_initializer) vsi_status status = VSI_FAILURE; vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; - vsi_int_array_t * out_shape = NULL; + vsi_size_array_t * out_shape = NULL; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); @@ -190,14 +190,14 @@ static vsi_nn_kernel_node_t _setup int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); int32_t pad_end = vsi_nn_kernel_param_get_int32( params, "pad_end" ); float pad_value = vsi_nn_kernel_param_get_float32( params, "pad_val" ); - int32_t num_frames = outputs[0]->attr.size[axis + 1]; + vsi_size_t num_frames = outputs[0]->attr.size[axis + 1]; int32_t rank = inputs[0]->attr.dim_num; - int32_t inner = 1; - int32_t outer = 1; - int32_t length_samples = inputs[0]->attr.size[axis]; + vsi_size_t inner = 1; + vsi_size_t outer = 1; + vsi_size_t length_samples = inputs[0]->attr.size[axis]; int32_t i = 0; vsi_nn_tensor_t* rs_tensors[2] = { NULL }; - int32_t shape[2][VSI_NN_MAX_DIM_NUM] = {{ 0 }}; + vsi_size_t shape[2][VSI_NN_MAX_DIM_NUM] = {{ 0 }}; for (i = 0; i < axis; i++) { @@ -220,11 +220,11 @@ static vsi_nn_kernel_node_t _setup shape[1][3] = outer; rs_tensors[0] = vsi_nn_reshape_tensor( graph, - inputs[0], (uint32_t*)shape[0], 4 ); + inputs[0], shape[0], 4 ); rs_tensors[1] = vsi_nn_reshape_tensor( graph, - outputs[0], (uint32_t*)shape[1], 4 ); + outputs[0], shape[1], 4 ); - if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)rs_tensors[1]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( rs_tensors[1]->attr.size, rs_tensors[1]->attr.dim_num ) ) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/cl/slice_cl.c b/src/tim/vx/internal/src/kernel/cl/slice_cl.c index f0da3cb..ed83e5f 100644 --- a/src/tim/vx/internal/src/kernel/cl/slice_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/slice_cl.c @@ -126,7 +126,7 @@ DEF_KERNEL_INITIALIZER(_slice_initializer) {0, 0, 0} }; vsi_nn_kernel_tensor_attr_t * output_attr = NULL; - vsi_int_array_t * out_shape = NULL; + vsi_size_array_t * out_shape = NULL; output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); CHECK_PTR_FAIL_GOTO( output_attr, "Create tensor attr buffer fail.", final ); @@ -235,11 +235,11 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t node = NULL; vsi_bool image_2d = FALSE; uint32_t rank[_IO_NUM] = {0}; - int32_t shapes[_IO_NUM][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; + vsi_size_t shapes[_IO_NUM][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; vsi_nn_tensor_t* reshape_tensors[_IO_NUM] = { NULL }; int32_t i = 0; - int32_t input_batch = 
inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1; - int32_t output_batch = outputs[0]->attr.dim_num > 3 ? outputs[0]->attr.size[3] : 1; + vsi_size_t input_batch = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1; + vsi_size_t output_batch = outputs[0]->attr.dim_num > 3 ? outputs[0]->attr.size[3] : 1; float inputScale = inputs[0]->attr.dtype.scale; float inputTail = (float)inputs[0]->attr.dtype.zero_point * inputScale; float outputScale = outputs[0]->attr.dtype.scale; @@ -247,22 +247,22 @@ static vsi_nn_kernel_node_t _setup outputScale = vsi_abs(outputScale) < 1e-5 ? 0.0f : 1.0f / outputScale; - vsi_nn_kernel_optimize_1d_tensor_shape( (const int32_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num, + vsi_nn_kernel_optimize_1d_tensor_shape( (const vsi_size_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num, shapes[0], &rank[0]); - vsi_nn_kernel_optimize_1d_tensor_shape( (const int32_t*)inputs[1]->attr.size, inputs[1]->attr.dim_num, + vsi_nn_kernel_optimize_1d_tensor_shape( (const vsi_size_t*)inputs[1]->attr.size, inputs[1]->attr.dim_num, shapes[1], &rank[1]); - vsi_nn_kernel_optimize_1d_tensor_shape( (const int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num, + vsi_nn_kernel_optimize_1d_tensor_shape( (const vsi_size_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num, shapes[2], &rank[2]); for (i = 0; i < _INPUT_NUM; i++) { reshape_tensors[i] = vsi_nn_reshape_tensor( graph, - inputs[i], (uint32_t*)shapes[i], rank[i] ); + inputs[i], shapes[i], rank[i] ); } reshape_tensors[_INPUT_NUM] = vsi_nn_reshape_tensor( graph, - outputs[0], (uint32_t*)shapes[_INPUT_NUM], rank[_INPUT_NUM] ); + outputs[0], shapes[_INPUT_NUM], rank[_INPUT_NUM] ); - if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)reshape_tensors[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[0]->attr.size, inputs[0]->attr.dim_num ) || input_batch != output_batch ) { goto final; diff --git a/src/tim/vx/internal/src/kernel/cl/space2depth_internal_cl.c b/src/tim/vx/internal/src/kernel/cl/space2depth_internal_cl.c index b021962..65b09ee 100644 --- a/src/tim/vx/internal/src/kernel/cl/space2depth_internal_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/space2depth_internal_cl.c @@ -109,10 +109,10 @@ DEF_KERNEL_INITIALIZER(_space2depth_internal_initializer) vsi_status status = VSI_FAILURE; vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; - vsi_int_array_t * in_shape = NULL; - int32_t width = 0; - int32_t height = 0; - int32_t chn = 0; + vsi_size_array_t * in_shape = NULL; + vsi_ssize_t width = 0; + vsi_ssize_t height = 0; + vsi_ssize_t chn = 0; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); @@ -255,7 +255,7 @@ static vsi_nn_kernel_node_t _setup scaleInOut = inputScale / outputScale; zpInOut = outputZp - inputZp * scaleInOut; - if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/cl/swish_cl.c b/src/tim/vx/internal/src/kernel/cl/swish_cl.c index 9012c93..4a4283e 100644 --- a/src/tim/vx/internal/src/kernel/cl/swish_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/swish_cl.c @@ -36,7 +36,7 @@ #include "utils/vsi_nn_util.h" #include "utils/vsi_nn_math.h" #include "kernel/vsi_nn_kernel.h" -#include "libnnext/vx_lib_nnext.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" __BEGIN_DECLS @@ -168,7 +168,7 @@ 
DEF_KERNEL_INITIALIZER(_swish_initializer) vx_status status = VX_FAILURE; vx_tensor output = (vx_tensor)param[1]; vsi_nn_kernel_tensor_attr_t * attr_out = NULL; - vsi_int_array_t * out_shape = NULL; + vsi_size_array_t * out_shape = NULL; attr_out = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output ); CHECK_PTR_FAIL_GOTO( attr_out, "vsi_nn_kernel_tensor_attr_create fail.", final ); @@ -273,8 +273,8 @@ static vsi_nn_kernel_node_t _setup { vsi_status status = VSI_FAILURE; vsi_nn_kernel_node_param_t node_params[_SWISH_PARAM_NUM] = {NULL}; - int32_t shapes[2][VSI_NN_MAX_DIM_NUM] = {{0}}; - uint32_t new_rank = 0; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {0}; + vsi_size_t new_rank = 0; vsi_bool image_2d = FALSE; vsi_nn_kernel_node_t node = NULL; int32_t swish_type = vsi_nn_kernel_param_get_int32( params, "type" ); @@ -284,6 +284,7 @@ static vsi_nn_kernel_node_t _setup float outputScale = outputs[0]->attr.dtype.scale == 0.0f ? 0.0f : 1.0f / outputs[0]->attr.dtype.scale; float outputZP = (float)outputs[0]->attr.dtype.zero_point + 0.5f; vx_float32 logE = (vx_float32)(log10(exp(1.0f)) / log10(2.0f)); + vsi_bool ret = FALSE; #if (VX_ACTIVATION_EXT_SUPPORT) if (VSI_NN_HW_EVIS_2 == graph->ctx->config.evis.ver) @@ -292,10 +293,17 @@ static vsi_nn_kernel_node_t _setup } #endif - vsi_nn_OptimizedEltOPShape(inputs[0], (uint32_t *)(shapes[0]), &new_rank); - vsi_nn_OptimizedEltOPShape(outputs[0], (uint32_t *)(shapes[1]), &new_rank); + ret = vsi_nn_kernel_optimize_element_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, + shape, &new_rank ); - if( !vsi_nn_kernel_gpu_check_shape( shapes[0], new_rank ) ) + if( ret ) + { + node_params[0] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shape, new_rank ); + node_params[1] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shape, new_rank ); + } + + if( !vsi_nn_kernel_gpu_check_shape( shape, new_rank ) ) { return NULL; } @@ -318,8 +326,6 @@ static vsi_nn_kernel_node_t _setup if( node ) { size_t node_params_num = _SWISH_PARAM_NUM; - node_params[0] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[0], new_rank ); - node_params[1] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[1], new_rank ); node_params[SCALAR_INPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &inputScale ); node_params[SCALAR_INPUT_TAIL] = vsi_nn_kernel_scalar_create(graph, F32, &inputTail ); node_params[SCALAR_OUTPUT_SCALE] = vsi_nn_kernel_scalar_create( graph, F32, &outputScale ); @@ -338,8 +344,6 @@ static vsi_nn_kernel_node_t _setup status = vsi_nn_kernel_node_pass_param( node, node_params, node_params_num ); VSI_ASSERT( status == VSI_SUCCESS ); - vsi_nn_kernel_tensor_release( &node_params[0] ); - vsi_nn_kernel_tensor_release( &node_params[1] ); vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_SCALE] ); vsi_nn_kernel_scalar_release( &node_params[SCALAR_INPUT_TAIL] ); vsi_nn_kernel_scalar_release( &node_params[SCALAR_OUTPUT_SCALE] ); @@ -353,6 +357,15 @@ static vsi_nn_kernel_node_t _setup } } + if(node_params[0]) + { + vsi_nn_kernel_tensor_release( &node_params[0] ); + } + if(node_params[1]) + { + vsi_nn_kernel_tensor_release( &node_params[1] ); + } + return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/cl/tile_cl.c b/src/tim/vx/internal/src/kernel/cl/tile_cl.c index be790f2..dab13f7 100644 --- a/src/tim/vx/internal/src/kernel/cl/tile_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/tile_cl.c @@ -161,7 +161,7 @@ DEF_KERNEL_INITIALIZER(_tile_initializer) vsi_status status = VSI_FAILURE; vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; - 
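The reworked swish _setup above creates the reshaped kernel-tensor parameters before the node is built and releases them through NULL-guarded cleanup at the very end, so the release is safe on every early-return path. A minimal standalone sketch of that create/guarded-release pattern is below; the handle type and helper names are hypothetical stand-ins, not the ovxlib API.

/* Illustrative sketch only -- not part of the patch. */
#include <stdio.h>
#include <stdlib.h>

typedef struct { int dummy; } sketch_tensor_t;

static sketch_tensor_t * sketch_reshape(void)      /* stand-in for vsi_nn_kernel_tensor_reshape */
{
    return (sketch_tensor_t *)calloc(1, sizeof(sketch_tensor_t));
}

static void sketch_release(sketch_tensor_t ** t)   /* stand-in for vsi_nn_kernel_tensor_release */
{
    if (t && *t) { free(*t); *t = NULL; }
}

int main(void)
{
    sketch_tensor_t * params[2] = { NULL, NULL };
    int optimized = 1;   /* imagine the element-shape optimization succeeded */

    if (optimized)
    {
        params[0] = sketch_reshape();   /* reshaped input  */
        params[1] = sketch_reshape();   /* reshaped output */
    }

    /* ... node creation and parameter passing would happen here ... */

    /* Cleanup is unconditional but NULL-guarded, so it works whether or
     * not the reshape happened and whether or not node creation failed. */
    if (params[0]) sketch_release(&params[0]);
    if (params[1]) sketch_release(&params[1]);

    printf("done\n");
    return 0;
}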
vsi_int_array_t * in_shape = NULL; + vsi_size_array_t * in_shape = NULL; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); @@ -234,9 +234,9 @@ static vsi_status _query_kernel return status; } /* _query_kernel() */ -static vsi_bool _is_supported_axis(int32_t* multiples, uint32_t multiples_num) +static vsi_bool _is_supported_axis(vsi_size_t* multiples, vsi_size_t multiples_num) { - uint32_t i = 0; + vsi_size_t i = 0; if ( multiples_num < 4) { @@ -274,12 +274,12 @@ static vsi_nn_kernel_node_t _setup vsi_bool image_2d = FALSE; vsi_nn_kernel_node_t node = NULL; vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; - int32_t shapes[3][VSI_NN_MAX_DIM_NUM] = { { 0 } }; + vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = { { 0 } }; uint32_t i = 0; - uint32_t new_rank = 0; + vsi_size_t new_rank = 0; vsi_bool ret = FALSE; uint32_t dim = inputs[0]->attr.dim_num; - int32_t multiples[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t multiples[VSI_NN_MAX_DIM_NUM] = { 0 }; for ( i = 0; i < dim; i++) { @@ -287,9 +287,9 @@ static vsi_nn_kernel_node_t _setup } ret = vsi_nn_kernel_optimize_tile_shape( - (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, - (int32_t *)multiples, inputs[0]->attr.dim_num, - (int32_t *)outputs[0]->attr.size, outputs[0]->attr.dim_num, + inputs[0]->attr.size, inputs[0]->attr.dim_num, + multiples, inputs[0]->attr.dim_num, + outputs[0]->attr.size, outputs[0]->attr.dim_num, shapes[0], shapes[1], shapes[2], &new_rank ); if (ret) @@ -300,16 +300,16 @@ static vsi_nn_kernel_node_t _setup } reshape_tensors[0] = vsi_nn_reshape_tensor( graph, - inputs[0], (uint32_t*)shapes[0], new_rank ); + inputs[0], shapes[0], new_rank ); reshape_tensors[1] = vsi_nn_reshape_tensor( graph, - outputs[0], (uint32_t*)shapes[2], new_rank ); + outputs[0], shapes[2], new_rank ); } else { return NULL; } - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)reshape_tensors[1]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[1]->attr.size, outputs[0]->attr.dim_num )) { goto final; @@ -323,9 +323,9 @@ static vsi_nn_kernel_node_t _setup if( node ) { - uint32_t depthIn = new_rank > 2 ? reshape_tensors[0]->attr.size[2] : 1; - uint32_t depthOut = new_rank > 2 ? reshape_tensors[1]->attr.size[2] : 1; - uint32_t batchIn = new_rank > 3 ? reshape_tensors[0]->attr.size[3] : 1; + uint32_t depthIn = (uint32_t)(new_rank > 2 ? reshape_tensors[0]->attr.size[2] : 1); + uint32_t depthOut = (uint32_t)(new_rank > 2 ? reshape_tensors[1]->attr.size[2] : 1); + uint32_t batchIn = (uint32_t)(new_rank > 3 ? 
reshape_tensors[0]->attr.size[3] : 1); vsi_nn_kernel_node_pack_io( node_params, _CL_PARAM_NUM, &reshape_tensors[0], 1, &reshape_tensors[1], 1 ); diff --git a/src/tim/vx/internal/src/kernel/cl/upsample_cl.c b/src/tim/vx/internal/src/kernel/cl/upsample_cl.c index 3124f7b..f2e990c 100644 --- a/src/tim/vx/internal/src/kernel/cl/upsample_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/upsample_cl.c @@ -126,7 +126,7 @@ DEF_KERNEL_INITIALIZER(_upsample_initializer) vx_status status = VX_FAILURE; vx_tensor input = (vx_tensor)param[0]; vsi_nn_kernel_tensor_attr_t * attr_in = NULL; - vsi_int_array_t * in_shape = NULL; + vsi_size_array_t * in_shape = NULL; vsi_bool image_2d = FALSE; attr_in = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input ); @@ -270,11 +270,11 @@ static vsi_nn_kernel_node_t _setup return NULL; } - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num ) - || !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[1]->attr.size, + || !vsi_nn_kernel_gpu_check_shape( inputs[1]->attr.size, inputs[1]->attr.dim_num ) - || !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + || !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num )) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/cpu/add_mean_std_norm_cpu.c b/src/tim/vx/internal/src/kernel/cpu/add_mean_std_norm_cpu.c index d750765..293304d 100644 --- a/src/tim/vx/internal/src/kernel/cpu/add_mean_std_norm_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/add_mean_std_norm_cpu.c @@ -78,13 +78,13 @@ DEF_KERNEL_EXECUTOR(_compute) float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; - size_t in_stride_size[_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - size_t out_elements[_OUTPUT_NUM] = {0}; - size_t out_bytes[_OUTPUT_NUM] = {0}; + vsi_size_t in_stride_size[_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_elements[_OUTPUT_NUM] = {0}; + vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; int32_t i; float mean = .0f, stddev_inv = .0f, variance = .0f, input_d = .0f, data = .0f, eps = .0f; - int32_t v_size, n_batch, batch; + vsi_ssize_t v_size, n_batch, batch; /* prepare data */ for(i = 0; i < _INPUT_NUM; i ++) { @@ -114,10 +114,10 @@ DEF_KERNEL_EXECUTOR(_compute) { float sum = 0.0f; float sum_sq = 0.0f; - int32_t index_base = batch * v_size; + vsi_ssize_t index_base = batch * v_size; for (i = 0; i < v_size; ++i) { - int32_t index = i + index_base; + vsi_ssize_t index = i + index_base; input_d = f32_in_buffer[0][index] + f32_in_buffer[1][index]; sum += input_d; sum_sq += input_d * input_d; @@ -138,7 +138,7 @@ DEF_KERNEL_EXECUTOR(_compute) for (i = 0; i < v_size; ++i) { - int32_t index = i + index_base; + vsi_ssize_t index = i + index_base; input_d = f32_in_buffer[0][index] + f32_in_buffer[1][index]; data = (input_d - mean) * stddev_inv; f32_out_buffer[0][index] = data; diff --git a/src/tim/vx/internal/src/kernel/cpu/argmax_cpu.c b/src/tim/vx/internal/src/kernel/cpu/argmax_cpu.c index e9cb96f..52a5572 100644 --- a/src/tim/vx/internal/src/kernel/cpu/argmax_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/argmax_cpu.c @@ -60,11 +60,11 @@ DEF_KERNEL_EXECUTOR(_argmax_exec) vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; int32_t i; int32_t axis = 0; - int32_t outerSize = 1; 
- int32_t axisSize = 1; - int32_t innerSize = 1; - int32_t inner = 0; - int32_t outer = 0; + vsi_ssize_t outerSize = 1; + vsi_ssize_t axisSize = 1; + vsi_ssize_t innerSize = 1; + vsi_ssize_t inner = 0; + vsi_ssize_t outer = 0; tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; diff --git a/src/tim/vx/internal/src/kernel/cpu/argmin_cpu.c b/src/tim/vx/internal/src/kernel/cpu/argmin_cpu.c index f6b092b..09aa235 100644 --- a/src/tim/vx/internal/src/kernel/cpu/argmin_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/argmin_cpu.c @@ -61,11 +61,11 @@ DEF_KERNEL_EXECUTOR(_argmin_exec) vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; int32_t i; int32_t axis = 0; - int32_t outerSize = 1; - int32_t axisSize = 1; - int32_t innerSize = 1; - int32_t inner = 0; - int32_t outer = 0; + vsi_ssize_t outerSize = 1; + vsi_ssize_t axisSize = 1; + vsi_ssize_t innerSize = 1; + vsi_ssize_t inner = 0; + vsi_ssize_t outer = 0; tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; diff --git a/src/tim/vx/internal/src/kernel/cpu/axis_aligned_bbox_transform_cpu.c b/src/tim/vx/internal/src/kernel/cpu/axis_aligned_bbox_transform_cpu.c index 448eb33..8903139 100644 --- a/src/tim/vx/internal/src/kernel/cpu/axis_aligned_bbox_transform_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/axis_aligned_bbox_transform_cpu.c @@ -110,16 +110,16 @@ DEF_KERNEL_EXECUTOR(_compute) float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; - size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - size_t out_elements[_OUTPUT_NUM] = {0}; - size_t out_bytes[_OUTPUT_NUM] = {0}; + vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_elements[_OUTPUT_NUM] = {0}; + vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; uint32_t i; const uint32_t roiLength = 4; const uint32_t imageLength = 2; - uint32_t numClasses = 0; - uint32_t numRois = 0; - uint32_t j; - uint32_t roiIndex; + vsi_size_t numClasses = 0; + vsi_size_t numRois = 0; + vsi_size_t j; + vsi_size_t roiIndex; /* prepare data */ for (i = 0; i < _INPUT_NUM; i ++) @@ -163,7 +163,7 @@ DEF_KERNEL_EXECUTOR(_compute) vsi_nn_box_encoding_center roi_ctr; vsi_nn_box_encoding_corner roiAfter; vsi_nn_box_encoding_corner cliped; - uint32_t index = (roiIndex * numClasses + j) * roiLength; + vsi_size_t index = (roiIndex * numClasses + j) * roiLength; roi_ctr.w = (float)(exp(f32_in_buffer[1][index + 2]) * roiBefore.w); roi_ctr.h = (float)(exp(f32_in_buffer[1][index + 3]) * roiBefore.h); diff --git a/src/tim/vx/internal/src/kernel/cpu/batchnorm_single_cpu.c b/src/tim/vx/internal/src/kernel/cpu/batchnorm_single_cpu.c index ca6164b..0edbb7f 100644 --- a/src/tim/vx/internal/src/kernel/cpu/batchnorm_single_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/batchnorm_single_cpu.c @@ -46,21 +46,21 @@ __BEGIN_DECLS #define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) #define _KERNEL_NAME CVIVANTE_NAMESPACE("batch_norm_sw") -static int32_t _expand_offset +static vsi_ssize_t _expand_offset ( - int32_t index, - int32_t * shape, size_t rank, - size_t * strides, int32_t * out_shape + vsi_ssize_t index, + vsi_size_t * shape, vsi_size_t rank, + vsi_size_t * strides, vsi_size_t * out_shape ) { - uint32_t i; - int32_t offset = 0; + vsi_size_t i; + vsi_ssize_t offset = 0; for( i = 0; i < rank && index; i ++ ) { if( shape[i] == out_shape[i] ) { - offset += (int32_t)strides[i] * ( index % out_shape[i] ); + offset += (vsi_ssize_t)strides[i] * ( index % out_shape[i] ); } index /= out_shape[i]; } @@ -77,8 +77,8 @@ 
DEF_KERNEL_EXECUTOR(_batch_norm_exec) vsi_status status = VX_SUCCESS; vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; float * buffer[_CPU_IO_NUM] = { NULL }; - size_t out_elements = 0; - size_t stride_size[_CPU_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{0}}; + vsi_size_t out_elements = 0; + vsi_size_t stride_size[_CPU_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{0}}; vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; uint32_t i = 0; float eps = 0.f; @@ -107,7 +107,7 @@ DEF_KERNEL_EXECUTOR(_batch_norm_exec) for( i = 0; i < out_elements; i ++ ) { - int32_t in_offset[5] = {0}; + vsi_ssize_t in_offset[5] = {0}; int32_t j = 0; float src = 0.f; float mean = 0.f; @@ -117,7 +117,7 @@ DEF_KERNEL_EXECUTOR(_batch_norm_exec) for ( j = 0; j < 5; j++) { - in_offset[j] = _expand_offset( i, attr[j]->shape->data, attr[j]->shape->size, + in_offset[j] = _expand_offset( i, attr[j]->shape->data, (vsi_size_t)attr[j]->shape->size, stride_size[j], attr[5]->shape->data ); } diff --git a/src/tim/vx/internal/src/kernel/cpu/box_with_nms_limit_cpu.c b/src/tim/vx/internal/src/kernel/cpu/box_with_nms_limit_cpu.c index c471f16..53af44c 100644 --- a/src/tim/vx/internal/src/kernel/cpu/box_with_nms_limit_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/box_with_nms_limit_cpu.c @@ -227,9 +227,9 @@ DEF_KERNEL_EXECUTOR(_compute) int32_t* int32_out_buffer[_OUTPUT_NUM] = {0}; vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; - size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - size_t out_elements[_OUTPUT_NUM] = {0}; - size_t out_bytes[_OUTPUT_NUM] = {0}; + vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_elements[_OUTPUT_NUM] = {0}; + vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; uint32_t i = 0; float score_threshold = 0; int32_t max_num_detections = 0; @@ -300,8 +300,8 @@ DEF_KERNEL_EXECUTOR(_compute) CHECK_STATUS_FAIL_GOTO(status, final ); #undef VSI_NN_KERNEL_READ_SCALAR - numRois = in_attr[0]->shape->data[1]; - numClasses = in_attr[0]->shape->data[0]; + numRois = (uint32_t)in_attr[0]->shape->data[1]; + numClasses = (uint32_t)in_attr[0]->shape->data[0]; batch_data = (uint32_t*)malloc(numRois * sizeof(uint32_t)); CHECK_PTR_FAIL_GOTO( batch_data, "Create batch_data fail.", final ); diff --git a/src/tim/vx/internal/src/kernel/cpu/cast_cpu.c b/src/tim/vx/internal/src/kernel/cpu/cast_cpu.c index c716d94..fdc462f 100644 --- a/src/tim/vx/internal/src/kernel/cpu/cast_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/cast_cpu.c @@ -74,9 +74,9 @@ DEF_KERNEL_EXECUTOR(_compute) float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; - size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{0}}; - size_t out_elements[_OUTPUT_NUM] = {0}; - size_t out_bytes[_OUTPUT_NUM] = {0}; + vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{0}}; + vsi_size_t out_elements[_OUTPUT_NUM] = {0}; + vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; uint32_t i; double max_value = 0.0f, min_value = 0.0f; vsi_bool clamp_flag = FALSE; diff --git a/src/tim/vx/internal/src/kernel/cpu/clip_cpu.c b/src/tim/vx/internal/src/kernel/cpu/clip_cpu.c index ea416a4..70c40cb 100644 --- a/src/tim/vx/internal/src/kernel/cpu/clip_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/clip_cpu.c @@ -79,9 +79,9 @@ DEF_KERNEL_EXECUTOR(_compute) float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; - size_t 
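Several CPU kernels in this patch (batch norm above, and the comparisons, floordiv and logical-ops kernels further down) carry the same _expand_offset helper, now expressed in vsi_size_t/vsi_ssize_t. It walks the flat output index dimension by dimension and only accumulates a stride when that dimension is not broadcast. A self-contained restatement with stand-in typedefs follows; it is an illustration for the reader, not the ovxlib source.

/* Illustrative sketch only -- not part of the patch. */
#include <stdint.h>
#include <stdio.h>

typedef uint64_t sketch_size_t;    /* stand-in for vsi_size_t  */
typedef int64_t  sketch_ssize_t;   /* stand-in for vsi_ssize_t */

static sketch_ssize_t expand_offset(sketch_ssize_t index,
                                    const sketch_size_t * shape, sketch_size_t rank,
                                    const sketch_size_t * strides,
                                    const sketch_size_t * out_shape)
{
    sketch_size_t  i;
    sketch_ssize_t offset = 0;
    for (i = 0; i < rank && index; i++)
    {
        if (shape[i] == out_shape[i])   /* dimension is not broadcast */
        {
            offset += (sketch_ssize_t)strides[i] * (index % out_shape[i]);
        }
        index /= out_shape[i];          /* peel off this dimension's coordinate */
    }
    return offset;
}

int main(void)
{
    /* Input [3,1] broadcast against an output [3,4]: the input's element
     * strides are {1,3}, and every output column reuses the same 3 inputs. */
    const sketch_size_t in_shape[2]  = { 3, 1 };
    const sketch_size_t strides[2]   = { 1, 3 };
    const sketch_size_t out_shape[2] = { 3, 4 };
    sketch_ssize_t i;
    for (i = 0; i < 12; i++)
    {
        printf("out %2lld -> in %lld\n", (long long)i,
               (long long)expand_offset(i, in_shape, 2, strides, out_shape));
    }
    return 0;
}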
out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - size_t out_elements[_OUTPUT_NUM] = {0}; - size_t out_bytes[_OUTPUT_NUM] = {0}; + vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_elements[_OUTPUT_NUM] = {0}; + vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; uint32_t i; float min_value = 0.0f; float max_value = 0.0f; diff --git a/src/tim/vx/internal/src/kernel/cpu/comparisons_cpu.c b/src/tim/vx/internal/src/kernel/cpu/comparisons_cpu.c index 8397b30..7e27725 100644 --- a/src/tim/vx/internal/src/kernel/cpu/comparisons_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/comparisons_cpu.c @@ -55,21 +55,21 @@ typedef enum } relational_type_e; -static int32_t _expand_offset +static vsi_ssize_t _expand_offset ( - int32_t index, - int32_t * shape, size_t rank, - size_t * strides, int32_t * out_shape + vsi_ssize_t index, + vsi_size_t * shape, vsi_size_t rank, + vsi_size_t * strides, vsi_size_t * out_shape ) { - uint32_t i; - int32_t offset = 0; + vsi_size_t i; + vsi_ssize_t offset = 0; for( i = 0; i < rank && index; i ++ ) { if( shape[i] == out_shape[i] ) { - offset += (int32_t)strides[i] * ( index % out_shape[i] ); + offset += (vsi_ssize_t)strides[i] * ( index % out_shape[i] ); } index /= out_shape[i]; } @@ -88,7 +88,7 @@ DEF_KERNEL_EXECUTOR(_comparisons_exec) float * buffer[_CPU_IO_NUM] = { NULL }; size_t out_elements = 0; vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; - size_t stride_size[_CPU_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{0}}; + vsi_size_t stride_size[_CPU_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{0}}; int32_t i = 0; int32_t operation = 0; @@ -124,15 +124,15 @@ DEF_KERNEL_EXECUTOR(_comparisons_exec) for (i = 0; i < (int32_t)out_elements; i++) { - int32_t in0_offset = 0; - int32_t in1_offset = 0; + vsi_ssize_t in0_offset = 0; + vsi_ssize_t in1_offset = 0; float val1 = 0.f; float val2 = 0.f; vsi_bool data = 0; - in0_offset = _expand_offset( i, attr[0]->shape->data, attr[0]->shape->size, + in0_offset = _expand_offset( i, attr[0]->shape->data, (vsi_size_t)attr[0]->shape->size, stride_size[0], attr[2]->shape->data ); - in1_offset = _expand_offset( i, attr[1]->shape->data, attr[1]->shape->size, + in1_offset = _expand_offset( i, attr[1]->shape->data, (vsi_size_t)attr[1]->shape->size, stride_size[1], attr[2]->shape->data ); val1 = buffer[0][in0_offset]; diff --git a/src/tim/vx/internal/src/kernel/cpu/conv1d_ovxlib_cpu.c b/src/tim/vx/internal/src/kernel/cpu/conv1d_ovxlib_cpu.c index f1b1b9e..4bdba10 100644 --- a/src/tim/vx/internal/src/kernel/cpu/conv1d_ovxlib_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/conv1d_ovxlib_cpu.c @@ -120,16 +120,16 @@ DEF_KERNEL_EXECUTOR(_compute) CHECK_STATUS_FAIL_GOTO(status, final); { - int32_t batch = attr[0]->shape->data[2]; - int32_t input_channel = attr[0]->shape->data[1]; - int32_t input_height = attr[0]->shape->data[0]; - int32_t kernel_size = attr[1]->shape->data[0]; - int32_t output_channel = attr[1]->shape->data[2]; - int32_t output_height = attr[3]->shape->data[0]; - int32_t batch_index = 0; - int32_t input_channel_index = 0; - int32_t output_channel_index = 0; - int32_t output_h_index = 0; + vsi_ssize_t batch = attr[0]->shape->data[2]; + vsi_ssize_t input_channel = attr[0]->shape->data[1]; + vsi_ssize_t input_height = attr[0]->shape->data[0]; + vsi_ssize_t kernel_size = attr[1]->shape->data[0]; + vsi_ssize_t output_channel = attr[1]->shape->data[2]; + vsi_ssize_t output_height = attr[3]->shape->data[0]; + vsi_ssize_t batch_index = 0; + vsi_ssize_t input_channel_index = 0; + vsi_ssize_t output_channel_index = 0; + 
vsi_ssize_t output_h_index = 0; for(batch_index = 0; batch_index < batch; batch_index++) { diff --git a/src/tim/vx/internal/src/kernel/cpu/depth2space_internal_cpu.c b/src/tim/vx/internal/src/kernel/cpu/depth2space_internal_cpu.c index aa96ba3..8217783 100644 --- a/src/tim/vx/internal/src/kernel/cpu/depth2space_internal_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/depth2space_internal_cpu.c @@ -85,21 +85,21 @@ DEF_KERNEL_EXECUTOR(_depth2space_crd_exec) memset( buffer[1], 0, out_elements * sizeof(float) ); { - uint32_t output_batch = attr[1]->shape->size > 3 ? attr[1]->shape->data[3] : 1; - uint32_t output_depth = attr[1]->shape->data[2]; - uint32_t output_height = attr[1]->shape->data[1]; - uint32_t output_width = attr[1]->shape->data[0]; - uint32_t input_depth = attr[0]->shape->data[2]; - uint32_t input_height = attr[0]->shape->data[1]; - uint32_t input_width = attr[0]->shape->data[0]; - uint32_t batch = 0, out_h = 0, out_w = 0; + vsi_size_t output_batch = attr[1]->shape->size > 3 ? attr[1]->shape->data[3] : 1; + vsi_size_t output_depth = attr[1]->shape->data[2]; + vsi_size_t output_height = attr[1]->shape->data[1]; + vsi_size_t output_width = attr[1]->shape->data[0]; + vsi_size_t input_depth = attr[0]->shape->data[2]; + vsi_size_t input_height = attr[0]->shape->data[1]; + vsi_size_t input_width = attr[0]->shape->data[0]; + vsi_size_t batch = 0, out_h = 0, out_w = 0; for (batch = 0; batch < output_batch; ++ batch) { - uint32_t output_batch_index = batch * output_height * output_width * output_depth; - uint32_t input_batch_index = batch * input_height * input_width * input_depth; - uint32_t out_d = 0; - uint32_t block_e2 = block_size * block_size; + vsi_size_t output_batch_index = batch * output_height * output_width * output_depth; + vsi_size_t input_batch_index = batch * input_height * input_width * input_depth; + vsi_size_t out_d = 0; + vsi_size_t block_e2 = block_size * block_size; for (out_d = 0; out_d < output_depth; out_d ++) { @@ -107,13 +107,13 @@ DEF_KERNEL_EXECUTOR(_depth2space_crd_exec) { for (out_w = 0; out_w < output_width; out_w ++) { - uint32_t in_w = out_w / block_size; - uint32_t in_h = out_h / block_size; - uint32_t in_d = (out_w % block_size) + (out_h % block_size) * block_size + out_d * block_e2; + vsi_size_t in_w = out_w / block_size; + vsi_size_t in_h = out_h / block_size; + vsi_size_t in_d = (out_w % block_size) + (out_h % block_size) * block_size + out_d * block_e2; - uint32_t in_index = in_w + in_h * input_width + in_d * input_width * input_height + vsi_size_t in_index = in_w + in_h * input_width + in_d * input_width * input_height + input_batch_index; - uint32_t out_index = out_w + out_h * output_width + out_d * output_height * output_width + vsi_size_t out_index = out_w + out_h * output_width + out_d * output_height * output_width + output_batch_index; buffer[1][out_index] = buffer[0][in_index]; diff --git a/src/tim/vx/internal/src/kernel/cpu/detect_post_box_cpu.c b/src/tim/vx/internal/src/kernel/cpu/detect_post_box_cpu.c index 72d9dd7..8fd8da0 100644 --- a/src/tim/vx/internal/src/kernel/cpu/detect_post_box_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/detect_post_box_cpu.c @@ -84,11 +84,11 @@ DEF_KERNEL_EXECUTOR(_compute) float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; - size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - size_t out_elements[_OUTPUT_NUM] = {0}; - size_t out_bytes[_OUTPUT_NUM] = {0}; + vsi_size_t 
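The depth2space (CRD) loops above map each output pixel back to an input element with in_w = out_w / block, in_h = out_h / block, and in_d = (out_w % block) + (out_h % block) * block + out_d * block^2. A standalone restatement of that index mapping with a stand-in size type follows, as an illustration only.

/* Illustrative sketch only -- not part of the patch. */
#include <stdint.h>
#include <stdio.h>

typedef uint64_t sketch_size_t;   /* stand-in for vsi_size_t */

static sketch_size_t d2s_crd_input_index(sketch_size_t out_w, sketch_size_t out_h,
                                         sketch_size_t out_d, sketch_size_t block,
                                         sketch_size_t in_width, sketch_size_t in_height)
{
    sketch_size_t in_w = out_w / block;
    sketch_size_t in_h = out_h / block;
    /* CRD ordering: the sub-block column/row index selects the source depth slice. */
    sketch_size_t in_d = (out_w % block) + (out_h % block) * block + out_d * block * block;
    return in_w + in_h * in_width + in_d * in_width * in_height;
}

int main(void)
{
    /* block_size = 2, input 2x2xC: output pixel (1,1,0) reads input (0,0) of depth slice 3 */
    printf("%llu\n", (unsigned long long)d2s_crd_input_index(1, 1, 0, 2, 2, 2));
    return 0;
}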
out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_elements[_OUTPUT_NUM] = {0}; + vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; uint32_t i; - uint32_t n, a, numBatches, numAnchors, lengthBoxEncoding; + vsi_size_t n, a, numBatches, numAnchors, lengthBoxEncoding; uint32_t kRoiDim = 4; float inv_scale_y = 0.0f; float inv_scale_x = 0.0f; @@ -127,8 +127,8 @@ DEF_KERNEL_EXECUTOR(_compute) for ( n = 0; n < numBatches; n++ ) { - int32_t batch_in_offset = n * numAnchors * lengthBoxEncoding; - int32_t batch_out_offset = n * numAnchors * kRoiDim; + vsi_ssize_t batch_in_offset = n * numAnchors * lengthBoxEncoding; + vsi_ssize_t batch_out_offset = n * numAnchors * kRoiDim; for ( a = 0; a < numAnchors; a++ ) { float yCtr = f32_in_buffer[1][a * kRoiDim] + f32_in_buffer[1][a * kRoiDim + 2] diff --git a/src/tim/vx/internal/src/kernel/cpu/detect_post_nms_cpu.c b/src/tim/vx/internal/src/kernel/cpu/detect_post_nms_cpu.c index eaa01d2..4952873 100644 --- a/src/tim/vx/internal/src/kernel/cpu/detect_post_nms_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/detect_post_nms_cpu.c @@ -201,11 +201,11 @@ DEF_KERNEL_EXECUTOR(_compute) float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; - size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - size_t out_elements[_OUTPUT_NUM] = {0}; - size_t out_bytes[_OUTPUT_NUM] = {0}; + vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_elements[_OUTPUT_NUM] = {0}; + vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; uint32_t i, j; - uint32_t n, a, c, b, numBatches, numAnchors, numClasses; + vsi_size_t n, a, c, b, numBatches, numAnchors, numClasses; int32_t nms_type = 0; int32_t max_num_detections = 0; int32_t maximum_class_per_detection = 0; @@ -213,7 +213,7 @@ DEF_KERNEL_EXECUTOR(_compute) float score_threshold = 0.0f; float iou_threshold = 0.0f; int32_t is_bg_in_label = 0; - uint32_t numOutDetection = 0; + vsi_size_t numOutDetection = 0; /* prepare data */ for ( i = 0; i < _INPUT_NUM; i++ ) @@ -250,11 +250,11 @@ DEF_KERNEL_EXECUTOR(_compute) numOutDetection = out_attr[0]->shape->data[0]; { - uint32_t scores_index = 0; - uint32_t scores_out_index = 0; + vsi_size_t scores_index = 0; + vsi_size_t scores_out_index = 0; uint32_t kRoiDim = 4; - uint32_t roi_out_index = 0; - uint32_t class_out_index = 0; + vsi_size_t roi_out_index = 0; + vsi_size_t class_out_index = 0; uint32_t* select = (uint32_t*)malloc(numAnchors * numClasses * sizeof(uint32_t)); float* maxScores = (float*)malloc(numAnchors * sizeof(float)); uint32_t* scoreInds = (uint32_t*)malloc((numClasses - 1) * sizeof(uint32_t)); @@ -273,10 +273,10 @@ DEF_KERNEL_EXECUTOR(_compute) select_start = select_size; for ( b = 0; b < numAnchors; b++ ) { - const uint32_t index = b * numClasses + c; + const vsi_size_t index = b * numClasses + c; float score = f32_in_buffer[0][scores_index + index]; if (score > score_threshold) { - select[select_size] = index; + select[select_size] = (uint32_t)index; select_size++; } } @@ -297,8 +297,8 @@ DEF_KERNEL_EXECUTOR(_compute) // Calculate IoU of the rest, swap to the end (disgard) if needed. 
for ( i = j + 1; i < select_len; i++ ) { - int32_t roiBase0 = (select[select_start + i] / numClasses) * kRoiDim; - int32_t roiBase1 = (select[select_start + j] / numClasses) * kRoiDim; + vsi_ssize_t roiBase0 = (select[select_start + i] / numClasses) * kRoiDim; + vsi_ssize_t roiBase1 = (select[select_start + j] / numClasses) * kRoiDim; float iou = _getIoUAxisAligned(&(roiBuffer[roiBase0]), &(roiBuffer[roiBase1])); @@ -335,7 +335,7 @@ DEF_KERNEL_EXECUTOR(_compute) } else { - uint32_t numOutClasses = vsi_nn_min(numClasses - 1, (uint32_t)maximum_class_per_detection); + vsi_size_t numOutClasses = vsi_nn_min(numClasses - 1, (uint32_t)maximum_class_per_detection); uint32_t select_size = 0; uint32_t select_start = 0; uint32_t select_len = 0; @@ -344,10 +344,10 @@ DEF_KERNEL_EXECUTOR(_compute) { // exclude background class: 0 maxScores[a] = _max_element_value(&(f32_in_buffer[0] - [scores_index + a * numClasses + 1]), numClasses - 1); + [scores_index + a * numClasses + 1]), (uint32_t)(numClasses - 1)); if (maxScores[a] > score_threshold) { - select[select_size] = a; + select[select_size] = (uint32_t)a; select_size++; } } @@ -385,9 +385,9 @@ DEF_KERNEL_EXECUTOR(_compute) for ( i = 0; i < select_len; i++ ) { - _iota((int32_t*)scoreInds, numClasses - 1, 1); + _iota((int32_t*)scoreInds, (uint32_t)(numClasses - 1), 1); _sort_element_by_score(&(f32_in_buffer[0][scores_index + select[i] * numClasses]), - scoreInds, numClasses - 1); + scoreInds, (uint32_t)(numClasses - 1)); for (c = 0; c < numOutClasses; c++) { f32_out_buffer[0][scores_out_index + i * numOutClasses + c] = diff --git a/src/tim/vx/internal/src/kernel/cpu/erf_cpu.c b/src/tim/vx/internal/src/kernel/cpu/erf_cpu.c index b3d1562..12c789c 100644 --- a/src/tim/vx/internal/src/kernel/cpu/erf_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/erf_cpu.c @@ -76,9 +76,9 @@ DEF_KERNEL_EXECUTOR(_compute) float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; - size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - size_t out_elements[_OUTPUT_NUM] = {0}; - size_t out_bytes[_OUTPUT_NUM] = {0}; + vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_elements[_OUTPUT_NUM] = {0}; + vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; size_t i = 0; /* prepare data */ diff --git a/src/tim/vx/internal/src/kernel/cpu/extra_ending_cpu.c b/src/tim/vx/internal/src/kernel/cpu/extra_ending_cpu.c index 978010c..0625cd6 100644 --- a/src/tim/vx/internal/src/kernel/cpu/extra_ending_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/extra_ending_cpu.c @@ -76,9 +76,9 @@ DEF_KERNEL_EXECUTOR(_compute) uint8_t *u8_out_buffer[_OUTPUT_NUM] = {NULL}; vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; - size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - size_t out_elements[_OUTPUT_NUM] = {0}; - size_t out_bytes[_OUTPUT_NUM] = {0}; + vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_elements[_OUTPUT_NUM] = {0}; + vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; int32_t i = 0; /* prepare data */ diff --git a/src/tim/vx/internal/src/kernel/cpu/floordiv_cpu.c b/src/tim/vx/internal/src/kernel/cpu/floordiv_cpu.c index 8b3f27d..f8c0ed8 100644 --- a/src/tim/vx/internal/src/kernel/cpu/floordiv_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/floordiv_cpu.c @@ -58,21 +58,21 @@ static vx_param_description_t _floordiv_kernel_param_def[] = }; #define _FLOORDIV_PARAM_NUM 
_cnt_of_array( _floordiv_kernel_param_def ) -static int32_t _expand_offset +static vsi_ssize_t _expand_offset ( - int32_t index, - int32_t * shape, size_t rank, - size_t * strides, int32_t * out_shape + vsi_ssize_t index, + vsi_size_t * shape, vsi_size_t rank, + vsi_size_t * strides, vsi_size_t * out_shape ) { - uint32_t i; - int32_t offset = 0; + vsi_size_t i; + vsi_ssize_t offset = 0; for( i = 0; i < rank && index; i ++ ) { if( shape[i] == out_shape[i] ) { - offset += (int32_t)strides[i] * ( index % out_shape[i] ); + offset += (vsi_ssize_t)strides[i] * ( index % out_shape[i] ); } index /= out_shape[i]; } @@ -96,10 +96,10 @@ DEF_KERNEL_EXECUTOR(_compute) float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; - size_t in_stride_size[_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - size_t out_elements[_OUTPUT_NUM] = {0}; - size_t out_bytes[_OUTPUT_NUM] = {0}; + vsi_size_t in_stride_size[_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_elements[_OUTPUT_NUM] = {0}; + vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; uint32_t i; /* prepare data */ @@ -126,14 +126,14 @@ DEF_KERNEL_EXECUTOR(_compute) for (i = 0; i < out_elements[0]; i++) { - int32_t in0_offset = 0; - int32_t in1_offset = 0; + vsi_ssize_t in0_offset = 0; + vsi_ssize_t in1_offset = 0; float in0 = 0; float in1 = 0; - in0_offset = _expand_offset( i, in_attr[0]->shape->data, in_attr[0]->shape->size, + in0_offset = _expand_offset( i, in_attr[0]->shape->data, (vsi_size_t)in_attr[0]->shape->size, in_stride_size[0], out_attr[0]->shape->data ); - in1_offset = _expand_offset( i, in_attr[1]->shape->data, in_attr[1]->shape->size, + in1_offset = _expand_offset( i, in_attr[1]->shape->data, (vsi_size_t)in_attr[1]->shape->size, in_stride_size[1], out_attr[0]->shape->data ); in0 = f32_in_buffer[0][in0_offset]; in1 = f32_in_buffer[1][in1_offset]; diff --git a/src/tim/vx/internal/src/kernel/cpu/gather_cpu.c b/src/tim/vx/internal/src/kernel/cpu/gather_cpu.c index 076b6b8..2ea12a5 100644 --- a/src/tim/vx/internal/src/kernel/cpu/gather_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/gather_cpu.c @@ -62,8 +62,9 @@ DEF_KERNEL_EXECUTOR(_gather_exec) uint32_t* buffer_idx = NULL; size_t in_elements = 0, out_elements = 0; vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; - uint32_t i = 0, j = 0; - int32_t block_size = 1, block_num = 1, indices_num = 1, axis_num = 0; + vsi_size_t i = 0, j = 0; + int32_t block_size = 1, block_num = 1, axis_num = 0; + vsi_size_t indices_num = 1; tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; @@ -102,15 +103,15 @@ DEF_KERNEL_EXECUTOR(_gather_exec) indices_num *= attr[1]->shape->data[i]; } - for(i = 0; i < (uint32_t)block_num; i++) + for(i = 0; i < (vsi_size_t)block_num; i++) { - for(j = 0; j < (uint32_t)indices_num; j++) + for(j = 0; j < indices_num; j++) { uint32_t indice = buffer_idx[j]; - uint32_t in_index = (i * axis_num + indice) * block_size; + vsi_size_t in_index = (i * axis_num + indice) * block_size; if(in_index < in_elements) { - uint32_t out_index = (i * indices_num + j) * block_size; + vsi_size_t out_index = (i * indices_num + j) * block_size; memcpy(&(buffer[1][out_index]), &(buffer[0][in_index]), block_size * sizeof(float)); } else diff --git a/src/tim/vx/internal/src/kernel/cpu/gather_nd_cpu.c 
b/src/tim/vx/internal/src/kernel/cpu/gather_nd_cpu.c index 3d912b8..13d10e7 100644 --- a/src/tim/vx/internal/src/kernel/cpu/gather_nd_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/gather_nd_cpu.c @@ -63,7 +63,8 @@ DEF_KERNEL_EXECUTOR(_gather_nd_exec) size_t out_elements = 0; vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; int32_t i = 0; - int32_t block_size = 1, indices_num = 1; + int32_t block_size = 1; + vsi_ssize_t indices_num = 1; int32_t coord_stride = 1; tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; @@ -101,7 +102,7 @@ DEF_KERNEL_EXECUTOR(_gather_nd_exec) if(coord_stride <= 3) // reshape 3D { - int32_t stride[3] = {block_size, 0, 0}; + vsi_ssize_t stride[3] = {block_size, 0, 0}; for(i = 1; i < coord_stride; ++i) { stride[i] = stride[i - 1] * attr[0]->shape->data[i]; @@ -109,9 +110,9 @@ DEF_KERNEL_EXECUTOR(_gather_nd_exec) for(i = 0; i < indices_num; i++) { - uint32_t out_index = i * block_size; + vsi_size_t out_index = i * block_size; uint32_t coord[3] = {0}; - uint32_t in_index = 0; + vsi_size_t in_index = 0; int32_t j = 0; for(j = 0; j < coord_stride; j++) diff --git a/src/tim/vx/internal/src/kernel/cpu/group_normalization_cpu.c b/src/tim/vx/internal/src/kernel/cpu/group_normalization_cpu.c index 17f45d7..570a1a2 100644 --- a/src/tim/vx/internal/src/kernel/cpu/group_normalization_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/group_normalization_cpu.c @@ -102,19 +102,19 @@ DEF_KERNEL_EXECUTOR(_group_norm_exec) memset( buffer[3], 0, out_elements * sizeof(float) ); { - uint32_t b = 0, c = 0; - uint32_t height = attr[0]->shape->data[1]; - uint32_t width = attr[0]->shape->data[0]; - uint32_t ch = attr[0]->shape->size > 2 ? attr[0]->shape->data[2] : 1; - uint32_t bh = attr[0]->shape->size > 3 ? attr[0]->shape->data[3] : 1; - uint32_t spatial = height * width; + vsi_size_t b = 0, c = 0; + vsi_size_t height = attr[0]->shape->data[1]; + vsi_size_t width = attr[0]->shape->data[0]; + vsi_size_t ch = attr[0]->shape->size > 2 ? attr[0]->shape->data[2] : 1; + vsi_size_t bh = attr[0]->shape->size > 3 ? 
attr[0]->shape->data[3] : 1; + vsi_size_t spatial = height * width; for (b = 0; b < bh; b++) { for (c = 0; c < ch; c++) { - uint32_t page = c * spatial + b * (spatial * ch); - uint32_t paraIdx = c * attr[1]->shape->data[0]; + vsi_size_t page = c * spatial + b * (spatial * ch); + vsi_size_t paraIdx = c * attr[1]->shape->data[0]; float sum = .0f; float sumsq = .0f; float mean = .0f; @@ -123,14 +123,14 @@ DEF_KERNEL_EXECUTOR(_group_norm_exec) for (i = 0; i < spatial; i++) { - uint32_t index = page + i; + vsi_size_t index = page + i; sum += buffer[0][index]; } mean = sum / spatial; for (i = 0; i < spatial; i++) { - uint32_t index = page + i; + vsi_size_t index = page + i; data = buffer[0][index] - mean; sumsq += data * data; } @@ -141,8 +141,8 @@ DEF_KERNEL_EXECUTOR(_group_norm_exec) for (i = 0; i < spatial; i++) { float normVal = 0; - uint32_t index = page + i; - uint32_t tmpIdx = paraIdx + i / spaceOrg; + vsi_size_t index = page + i; + vsi_size_t tmpIdx = paraIdx + i / spaceOrg; float scaleVal = buffer[2][tmpIdx]; float biasVal = buffer[1][tmpIdx]; @@ -215,14 +215,14 @@ static vsi_status _query_kernel static int32_t _optimize_gn_shape_cpu ( vsi_nn_tensor_t ** inputs, - int32_t group_size, + vsi_size_t group_size, int32_t group_num, - int32_t* opt_shape + vsi_size_t* opt_shape ) { vsi_status status = VSI_SUCCESS; - int32_t group_shape[VSI_NN_MAX_DIM_NUM] = {0}; - int32_t new_rank = 0; + vsi_size_t group_shape[VSI_NN_MAX_DIM_NUM] = {0}; + vsi_size_t new_rank = 0; group_shape[0] = inputs[0]->attr.size[0]; group_shape[1] = inputs[0]->attr.size[1]; group_shape[2] = group_size; @@ -257,10 +257,10 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; vsi_nn_kernel_node_t node = NULL; vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL; - int32_t new_shape[VSI_NN_MAX_DIM_NUM] = { 1, 1, 1, 1 }; + vsi_size_t new_shape[VSI_NN_MAX_DIM_NUM] = { 1, 1, 1, 1 }; int32_t group_num = vsi_nn_kernel_param_get_int32( params, "group_num" ); - int32_t group_size = inputs[0]->attr.size[2] / group_num; - int32_t spaceOrg = inputs[0]->attr.size[0] * inputs[0]->attr.size[1]; + vsi_size_t group_size = inputs[0]->attr.size[2] / group_num; + int32_t spaceOrg = (int32_t)(inputs[0]->attr.size[0] * inputs[0]->attr.size[1]); status = _optimize_gn_shape_cpu(inputs, group_size, group_num, new_shape); if ( VSI_SUCCESS != status ) diff --git a/src/tim/vx/internal/src/kernel/cpu/grucell_activation_cpu.c b/src/tim/vx/internal/src/kernel/cpu/grucell_activation_cpu.c index e4fd5cb..d3dc822 100644 --- a/src/tim/vx/internal/src/kernel/cpu/grucell_activation_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/grucell_activation_cpu.c @@ -94,9 +94,9 @@ DEF_KERNEL_EXECUTOR(_compute) size_t param_size ) { - int32_t i = 0; - int32_t batch = 0; - int32_t hidden_units = 0; + vsi_ssize_t i = 0; + vsi_ssize_t batch = 0; + vsi_ssize_t hidden_units = 0; float * buffer[_IO_COUNT_DEFAULT] = { NULL }; vsi_status status = VSI_FAILURE; vsi_nn_activation_e gate_activation; @@ -170,9 +170,9 @@ DEF_KERNEL_EXECUTOR(_compute_separated) size_t param_size ) { - int32_t i = 0, j = 0; - int32_t batch = 0; - int32_t hidden_units = 0; + vsi_ssize_t i = 0, j = 0; + vsi_ssize_t batch = 0; + vsi_ssize_t hidden_units = 0; float * buffer[_IO_COUNT_SEPARATED] = { NULL }; vsi_status status = VSI_FAILURE; vsi_nn_activation_e gate_activation; diff --git a/src/tim/vx/internal/src/kernel/cpu/grucell_activation_sma_cpu.c b/src/tim/vx/internal/src/kernel/cpu/grucell_activation_sma_cpu.c index 5b6f715..0d9c46c 100644 --- 
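The group-norm executor above accumulates a mean and a centered sum of squares over each spatial page, then normalizes it following the same (x - mean) * rsqrt(var + eps) * scale + bias pattern that the instance-norm hunks below spell out in full. A minimal standalone sketch of that per-page normalization follows; function and variable names are illustrative, and the real kernel additionally maps each element to its per-channel scale/bias via spaceOrg.

/* Illustrative sketch only -- not part of the patch. */
#include <math.h>
#include <stddef.h>
#include <stdio.h>

static void normalize_page(const float * x, float * y, size_t n,
                           float scale, float bias, float eps)
{
    size_t i;
    float sum = 0.0f, sumsq = 0.0f, mean, inv_std;

    for (i = 0; i < n; i++) { sum += x[i]; }
    mean = sum / (float)n;

    for (i = 0; i < n; i++) { float d = x[i] - mean; sumsq += d * d; }
    inv_std = 1.0f / sqrtf(sumsq / (float)n + eps);

    for (i = 0; i < n; i++) { y[i] = (x[i] - mean) * inv_std * scale + bias; }
}

int main(void)
{
    const float x[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
    float y[4];
    size_t i;
    normalize_page(x, y, 4, 1.0f, 0.0f, 1e-5f);
    for (i = 0; i < 4; i++) { printf("%f\n", y[i]); }
    return 0;
}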
a/src/tim/vx/internal/src/kernel/cpu/grucell_activation_sma_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/grucell_activation_sma_cpu.c @@ -69,9 +69,9 @@ DEF_KERNEL_EXECUTOR(_compute) ) { vsi_status status = VSI_FAILURE; - int32_t i = 0; - int32_t batch = 0; - int32_t hidden_units = 0; + vsi_size_t i = 0; + vsi_size_t batch = 0; + vsi_size_t hidden_units = 0; float * buffer[_IO_NUM] = { NULL }; vsi_nn_kernel_tensor_t tensors[_IO_NUM] = { NULL }; vsi_nn_kernel_tensor_attr_t* attr[_IO_NUM] = { NULL }; diff --git a/src/tim/vx/internal/src/kernel/cpu/heatmap_max_keypoint_cpu.c b/src/tim/vx/internal/src/kernel/cpu/heatmap_max_keypoint_cpu.c index 791926b..9af5fa4 100644 --- a/src/tim/vx/internal/src/kernel/cpu/heatmap_max_keypoint_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/heatmap_max_keypoint_cpu.c @@ -126,15 +126,15 @@ DEF_KERNEL_EXECUTOR(_compute) float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; - size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - size_t out_elements[_OUTPUT_NUM] = {0}; - size_t out_bytes[_OUTPUT_NUM] = {0}; + vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_elements[_OUTPUT_NUM] = {0}; + vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; uint32_t i = 0; uint32_t j = 0; uint32_t k = 0; - uint32_t numBoxes = 0; - uint32_t heatmapSize = 0; - uint32_t numKeypoints = 0; + vsi_size_t numBoxes = 0; + vsi_size_t heatmapSize = 0; + vsi_size_t numKeypoints = 0; uint32_t boxInfoLength = 4; uint32_t output_score_index = 0; uint32_t output_keypoint_index = 0; @@ -171,8 +171,8 @@ DEF_KERNEL_EXECUTOR(_compute) { uint32_t maxIndex = 0; float maxScore = -FLT_MAX; - uint32_t maxIndexWidth; - uint32_t maxIndexHeight; + vsi_size_t maxIndexWidth; + vsi_size_t maxIndexHeight; float localGrid[3][3] = {{0}}; int32_t dh, dw; float delta[2] = {0.0f, 0.0f}, deltaScore; @@ -186,7 +186,7 @@ DEF_KERNEL_EXECUTOR(_compute) float hRelativePos; for (k = 0; k < heatmapSize * heatmapSize; k++) { - uint32_t index = i * heatmapSize * heatmapSize * numKeypoints + vsi_size_t index = i * heatmapSize * heatmapSize * numKeypoints + k * numKeypoints + j; float val = f32_in_buffer[0][index]; if (maxScore < val) @@ -204,18 +204,18 @@ DEF_KERNEL_EXECUTOR(_compute) for (dw = -1; dw <= 1; dw++) { // cast uint32_t to int32_t - int32_t h = (int32_t)(maxIndexHeight) + dh; - int32_t w = (int32_t)(maxIndexWidth) + dw; - uint32_t heatmapIndex; + vsi_ssize_t h = (vsi_ssize_t)(maxIndexHeight) + dh; + vsi_ssize_t w = (vsi_ssize_t)(maxIndexWidth) + dw; + vsi_size_t heatmapIndex; // use mirroring for out of bound indexing // need to ensure heatmapSize >= 2 - h = h < 0 ? 1 : (h >= (int32_t)heatmapSize ? heatmapSize - 2 : h); - w = w < 0 ? 1 : (w >= (int32_t)heatmapSize ? heatmapSize - 2 : w); + h = h < 0 ? 1 : (h >= (vsi_ssize_t)heatmapSize ? heatmapSize - 2 : h); + w = w < 0 ? 1 : (w >= (vsi_ssize_t)heatmapSize ? 
heatmapSize - 2 : w); heatmapIndex = i * heatmapSize * heatmapSize * numKeypoints + - (uint32_t)(h) * heatmapSize * numKeypoints + - (uint32_t)(w) * numKeypoints + j; + (vsi_size_t)(h) * heatmapSize * numKeypoints + + (vsi_size_t)(w) * numKeypoints + j; localGrid[dh + 1][dw + 1] = f32_in_buffer[0][heatmapIndex]; } } diff --git a/src/tim/vx/internal/src/kernel/cpu/instance_normalization_cpu.c b/src/tim/vx/internal/src/kernel/cpu/instance_normalization_cpu.c index c9b665c..2744643 100644 --- a/src/tim/vx/internal/src/kernel/cpu/instance_normalization_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/instance_normalization_cpu.c @@ -97,17 +97,17 @@ DEF_KERNEL_EXECUTOR(_instance_norm_exec) memset( buffer[3], 0, out_elements * sizeof(float) ); { - uint32_t b = 0, c = 0, h = 0, w = 0; - uint32_t height = attr[0]->shape->data[1]; - uint32_t width = attr[0]->shape->data[0]; - uint32_t ch = attr[0]->shape->size > 2 ? attr[0]->shape->data[2] : 1; - uint32_t bh = attr[0]->shape->size > 3 ? attr[0]->shape->data[3] : 1; + vsi_size_t b = 0, c = 0, h = 0, w = 0; + vsi_size_t height = attr[0]->shape->data[1]; + vsi_size_t width = attr[0]->shape->data[0]; + vsi_size_t ch = attr[0]->shape->size > 2 ? attr[0]->shape->data[2] : 1; + vsi_size_t bh = attr[0]->shape->size > 3 ? attr[0]->shape->data[3] : 1; for (b = 0; b < bh; b++) { for (c = 0; c < ch; c++) { - uint32_t page = c * (height * width) + b * (height * width * ch); + vsi_size_t page = c * (height * width) + b * (height * width * ch); float sum = .0f; float sumsq = .0f; float mean = .0f; @@ -118,21 +118,21 @@ DEF_KERNEL_EXECUTOR(_instance_norm_exec) for (h = 0; h < height; h++) { - uint32_t len = page + h * width; + vsi_size_t len = page + h * width; for (w = 0; w < width; w++) { - uint32_t index = len + w; + vsi_size_t index = len + w; sum += buffer[0][index]; } } mean = sum / (width * height); for (h = 0; h < height; h++) { - uint32_t len = page + h * width; + vsi_size_t len = page + h * width; for (w = 0; w < width; w++) { - uint32_t index = len + w; + vsi_size_t index = len + w; data = buffer[0][index] - mean; sumsq += data * data; } @@ -141,11 +141,11 @@ DEF_KERNEL_EXECUTOR(_instance_norm_exec) vari = (float)(1.0 / sqrtf(vari + eps)); for (h = 0; h < height; h++) { - uint32_t len = page + h * width; + vsi_size_t len = page + h * width; for (w = 0; w < width; w++) { float normVal = 0; - uint32_t index = len + w; + vsi_size_t index = len + w; data = buffer[0][index] - mean; normVal = data * vari * scaleVal + biasVal; diff --git a/src/tim/vx/internal/src/kernel/cpu/l2normalizescale_cpu.c b/src/tim/vx/internal/src/kernel/cpu/l2normalizescale_cpu.c index f3f096a..79a0e30 100644 --- a/src/tim/vx/internal/src/kernel/cpu/l2normalizescale_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/l2normalizescale_cpu.c @@ -79,15 +79,16 @@ DEF_KERNEL_EXECUTOR(_compute) float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; - size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - size_t out_elements[_OUTPUT_NUM] = {0}; - size_t out_bytes[_OUTPUT_NUM] = {0}; - uint32_t i, index; + vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_elements[_OUTPUT_NUM] = {0}; + vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; + uint32_t i; + vsi_ssize_t index; int32_t axis = 0; - int32_t outerSize = 1; - int32_t axisSize = 1; - int32_t innerSize = 1; - int32_t inner = 0; + vsi_ssize_t outerSize = 1; + vsi_ssize_t axisSize = 1; + vsi_ssize_t innerSize = 1; + 
vsi_ssize_t inner = 0; int32_t outer = 0; float rsqrt = 0.0f, scaleValue = 0.0f; float epsilon = (float)10e-12; diff --git a/src/tim/vx/internal/src/kernel/cpu/layer_normalization_cpu.c b/src/tim/vx/internal/src/kernel/cpu/layer_normalization_cpu.c index c8c82bf..dffd119 100644 --- a/src/tim/vx/internal/src/kernel/cpu/layer_normalization_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/layer_normalization_cpu.c @@ -61,7 +61,7 @@ DEF_KERNEL_EXECUTOR(_layer_norm_exec) float * buffer[_CPU_IO_NUM] = { NULL }; size_t out_elements = 0; vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; - uint32_t i = 0; + vsi_size_t i = 0; float eps = .0f; tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; @@ -97,25 +97,25 @@ DEF_KERNEL_EXECUTOR(_layer_norm_exec) memset( buffer[3], 0, out_elements * sizeof(float) ); { - uint32_t axis_first = 0; - uint32_t axis_num = 1; - uint32_t outerSize = 1; - uint32_t axisSize = 1; - uint32_t innerSize = 1; - uint32_t inner = 0; - uint32_t outer = 0; + vsi_size_t axis_first = 0; + vsi_size_t axis_num = 1; + vsi_size_t outerSize = 1; + vsi_size_t axisSize = 1; + vsi_size_t innerSize = 1; + vsi_size_t inner = 0; + vsi_size_t outer = 0; - for (i = 0; i < (uint32_t)axis_first; i++) + for (i = 0; i < axis_first; i++) { innerSize *= attr[0]->shape->data[i]; } - for(i = 0; i < (uint32_t)axis_num; i++) + for(i = 0; i < axis_num; i++) { axisSize *= attr[0]->shape->data[axis_first + i]; } - for (i = (uint32_t)axis_first + axis_num; i < attr[0]->shape->size; i++) + for (i = axis_first + axis_num; i < attr[0]->shape->size; i++) { outerSize *= attr[0]->shape->data[i]; } @@ -129,7 +129,7 @@ DEF_KERNEL_EXECUTOR(_layer_norm_exec) float mean = .0f; float vari = .0f; - for (i = 0; i < (uint32_t)axisSize; ++i) + for (i = 0; i < axisSize; ++i) { float value = buffer[0][(outer * axisSize + i) * innerSize + inner]; sum += value; @@ -139,9 +139,9 @@ DEF_KERNEL_EXECUTOR(_layer_norm_exec) vari = sumsq / (axisSize) - mean * mean; vari = (float)(1.0 / sqrtf(vari + eps)); - for (i = 0; i < (uint32_t)axisSize; ++i) + for (i = 0; i < axisSize; ++i) { - int idx = (outer * axisSize + i) * innerSize + inner; + vsi_ssize_t idx = (outer * axisSize + i) * innerSize + inner; float data = buffer[0][idx] - mean; float scaleVal = buffer[2][i]; float biasVal = buffer[1][i]; diff --git a/src/tim/vx/internal/src/kernel/cpu/log_softmax_cpu.c b/src/tim/vx/internal/src/kernel/cpu/log_softmax_cpu.c index 2ef240f..1733ac9 100644 --- a/src/tim/vx/internal/src/kernel/cpu/log_softmax_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/log_softmax_cpu.c @@ -60,12 +60,12 @@ DEF_KERNEL_EXECUTOR(_log_softmax_exec) vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; int32_t axis = 0; float beta = 0; - int32_t outerSize = 1; - int32_t axisSize = 1; - int32_t innerSize = 1; - int32_t i = 0; - int32_t inner = 0; - int32_t outer = 0; + vsi_ssize_t outerSize = 1; + vsi_ssize_t axisSize = 1; + vsi_ssize_t innerSize = 1; + vsi_ssize_t i = 0; + vsi_ssize_t inner = 0; + vsi_ssize_t outer = 0; tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; @@ -93,7 +93,7 @@ DEF_KERNEL_EXECUTOR(_log_softmax_exec) axisSize = attr[0]->shape->data[axis]; - for (i = axis + 1; i < (int32_t)attr[0]->shape->size; i++) + for (i = axis + 1; i < (vsi_ssize_t)attr[0]->shape->size; i++) { outerSize *= attr[0]->shape->data[i]; } diff --git a/src/tim/vx/internal/src/kernel/cpu/logical_not_cpu.c b/src/tim/vx/internal/src/kernel/cpu/logical_not_cpu.c index c3917a6..b03a413 100644 --- 
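The layer-norm and log-softmax hunks above (like the argmax, argmin and l2normalizescale kernels earlier in this patch) all walk one axis of a flat buffer through the same inner/axis/outer decomposition, indexing elements as (outer * axisSize + a) * innerSize + inner. A standalone sketch of that decomposition for a single axis follows, with a stand-in size type, as an illustration only.

/* Illustrative sketch only -- not part of the patch. */
#include <stdint.h>
#include <stdio.h>

typedef uint64_t sketch_size_t;   /* stand-in for vsi_size_t */

int main(void)
{
    const sketch_size_t shape[3] = { 4, 5, 6 };   /* innermost dimension first */
    const sketch_size_t rank = 3, axis = 1;
    sketch_size_t innerSize = 1, axisSize = 1, outerSize = 1, i;

    for (i = 0; i < axis; i++)        { innerSize *= shape[i]; }   /* dims below the axis */
    axisSize = shape[axis];                                        /* the reduced axis    */
    for (i = axis + 1; i < rank; i++) { outerSize *= shape[i]; }   /* dims above the axis */

    {
        /* Flat index of element (inner, a, outer), matching the kernels above. */
        sketch_size_t inner = 2, a = 3, outer = 4;
        sketch_size_t idx = (outer * axisSize + a) * innerSize + inner;
        printf("inner=%llu axis=%llu outer=%llu -> idx=%llu\n",
               (unsigned long long)innerSize, (unsigned long long)axisSize,
               (unsigned long long)outerSize, (unsigned long long)idx);
    }
    return 0;
}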
a/src/tim/vx/internal/src/kernel/cpu/logical_not_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/logical_not_cpu.c @@ -74,9 +74,9 @@ DEF_KERNEL_EXECUTOR(_compute) float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; - size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - size_t out_elements[_OUTPUT_NUM] = {0}; - size_t out_bytes[_OUTPUT_NUM] = {0}; + vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_elements[_OUTPUT_NUM] = {0}; + vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; uint32_t i; /* prepare data */ for(i = 0; i < _INPUT_NUM; i ++) diff --git a/src/tim/vx/internal/src/kernel/cpu/logical_ops_cpu.c b/src/tim/vx/internal/src/kernel/cpu/logical_ops_cpu.c index 8e89e7a..d90d1af 100644 --- a/src/tim/vx/internal/src/kernel/cpu/logical_ops_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/logical_ops_cpu.c @@ -60,21 +60,21 @@ static vx_param_description_t _logical_ops_kernel_param_def[] = #define _LOGICAL_OPS_PARAM_NUM _cnt_of_array( _logical_ops_kernel_param_def ) -static int32_t _expand_offset +static vsi_ssize_t _expand_offset ( - int32_t index, - int32_t * shape, size_t rank, - size_t * strides, int32_t * out_shape + vsi_ssize_t index, + vsi_size_t * shape, vsi_size_t rank, + vsi_size_t * strides, vsi_size_t * out_shape ) { - uint32_t i; - int32_t offset = 0; + vsi_size_t i; + vsi_ssize_t offset = 0; for( i = 0; i < rank && index; i ++ ) { if( shape[i] == out_shape[i] ) { - offset += (int32_t)strides[i] * ( index % out_shape[i] ); + offset += (vsi_ssize_t)strides[i] * ( index % out_shape[i] ); } index /= out_shape[i]; } @@ -98,10 +98,10 @@ DEF_KERNEL_EXECUTOR(_compute) float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; - size_t in_stride_size[_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - size_t out_elements[_OUTPUT_NUM] = {0}; - size_t out_bytes[_OUTPUT_NUM] = {0}; + vsi_size_t in_stride_size[_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_elements[_OUTPUT_NUM] = {0}; + vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; uint32_t i; uint32_t ops_type_int = 0; vsi_nn_logical_ops_type_t ops_type = VSI_NN_LOGICAL_OR; @@ -136,15 +136,15 @@ DEF_KERNEL_EXECUTOR(_compute) for (i = 0; i < out_elements[0]; i++) { - int32_t in0_offset = 0; - int32_t in1_offset = 0; - int32_t in0 = 0; - int32_t in1 = 0; + vsi_ssize_t in0_offset = 0; + vsi_ssize_t in1_offset = 0; + vsi_ssize_t in0 = 0; + vsi_ssize_t in1 = 0; - in0_offset = _expand_offset( i, in_attr[0]->shape->data, in_attr[0]->shape->size, + in0_offset = _expand_offset( i, in_attr[0]->shape->data, (vsi_size_t)in_attr[0]->shape->size, in_stride_size[0], out_attr[0]->shape->data ); - in1_offset = _expand_offset( i, in_attr[1]->shape->data, in_attr[1]->shape->size, + in1_offset = _expand_offset( i, in_attr[1]->shape->data, (vsi_size_t)in_attr[1]->shape->size, in_stride_size[1], out_attr[0]->shape->data ); in0 = (!!(f32_in_buffer[0][in0_offset])); in1 = (!!(f32_in_buffer[1][in1_offset])); diff --git a/src/tim/vx/internal/src/kernel/cpu/lstmunit_activation_cpu.c b/src/tim/vx/internal/src/kernel/cpu/lstmunit_activation_cpu.c index 8bd6d3b..2cc2209 100644 --- a/src/tim/vx/internal/src/kernel/cpu/lstmunit_activation_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/lstmunit_activation_cpu.c @@ -131,19 
+131,19 @@ DEF_KERNEL_EXECUTOR(_compute) float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; - size_t in_stride_size[_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - size_t out_elements[_OUTPUT_NUM] = {0}; - size_t out_bytes[_OUTPUT_NUM] = {0}; - uint32_t i, b; + vsi_size_t in_stride_size[_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_elements[_OUTPUT_NUM] = {0}; + vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; + vsi_size_t i, b; int32_t _is_ln = 0; int32_t _is_cifg = 0; int32_t _is_proj = 0; int32_t _is_hybrid = 0; int32_t recurrent_activation; vsi_nn_activation_e activation_mode; - uint32_t n_batch = 0; - uint32_t n_cell = 0; + vsi_size_t n_batch = 0; + vsi_size_t n_cell = 0; float forget_bias; /* prepare data */ for( i = 0; i < _INPUT_NUM; i++ ) @@ -195,7 +195,7 @@ DEF_KERNEL_EXECUTOR(_compute) { for (i = 0; i < n_cell; i++) { - uint32_t index = i + n_cell * b; + vsi_size_t index = i + n_cell * b; float data_i_t = 0; float data_f_t = 0; float data_g_t = 0; diff --git a/src/tim/vx/internal/src/kernel/cpu/matrixmul_cpu.c b/src/tim/vx/internal/src/kernel/cpu/matrixmul_cpu.c index c263ff7..20130bb 100644 --- a/src/tim/vx/internal/src/kernel/cpu/matrixmul_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/matrixmul_cpu.c @@ -61,10 +61,10 @@ DEF_KERNEL_EXECUTOR(_matrixmul_exec) float * buffer[3] = { NULL }; size_t out_elements = 0; vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; - int32_t i = 0; - int32_t M = 0, K = 0, N = 0; + vsi_size_t i = 0; + vsi_size_t M = 0, K = 0, N = 0; int32_t transposeA = 0, transposeB = 0; - vx_size strides0[2] = {0, 0}, strides1[2] = {0, 0}; + size_t strides0[2] = {0, 0}, strides1[2] = {0, 0}; tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; @@ -110,14 +110,14 @@ DEF_KERNEL_EXECUTOR(_matrixmul_exec) strides1[1] = transposeB? K:1; { - int32_t batch = attr[2]->shape->size > 3 ? attr[2]->shape->data[3] : 1; - int32_t depth = attr[2]->shape->size > 2 ? attr[2]->shape->data[2] : 1; - int32_t a_depth = attr[0]->shape->size > 2 ? attr[0]->shape->data[2] : 1; - int32_t b_depth = attr[1]->shape->size > 2 ? attr[1]->shape->data[2] : 1; - int32_t b = 0, c = 0, j = 0, y = 0; - int32_t offsetA = 0, offsetB = 0, offsetD = 0; - int32_t ac2zero = 1; - int32_t bc2zero = 1; + vsi_size_t batch = attr[2]->shape->size > 3 ? attr[2]->shape->data[3] : 1; + vsi_size_t depth = attr[2]->shape->size > 2 ? attr[2]->shape->data[2] : 1; + vsi_size_t a_depth = attr[0]->shape->size > 2 ? attr[0]->shape->data[2] : 1; + vsi_size_t b_depth = attr[1]->shape->size > 2 ? 
attr[1]->shape->data[2] : 1; + vsi_size_t b = 0, c = 0, j = 0, y = 0; + vsi_size_t offsetA = 0, offsetB = 0, offsetD = 0; + vsi_size_t ac2zero = 1; + vsi_size_t bc2zero = 1; if((attr[0]->shape->size > attr[1]->shape->size) || (attr[0]->shape->data[2] > attr[1]->shape->data[2] diff --git a/src/tim/vx/internal/src/kernel/cpu/maximum_cpu.c b/src/tim/vx/internal/src/kernel/cpu/maximum_cpu.c index 4795735..74a06ae 100644 --- a/src/tim/vx/internal/src/kernel/cpu/maximum_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/maximum_cpu.c @@ -47,21 +47,21 @@ __BEGIN_DECLS #define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) #define _KERNEL_NAME CVIVANTE_NAMESPACE("maximum_sw") -static int32_t _expand_offset +static vsi_ssize_t _expand_offset ( - int32_t index, - int32_t * shape, size_t rank, - size_t * strides, int32_t * out_shape + vsi_ssize_t index, + vsi_size_t * shape, vsi_size_t rank, + vsi_size_t * strides, vsi_size_t * out_shape ) { - uint32_t i; - int32_t offset = 0; + vsi_size_t i; + vsi_ssize_t offset = 0; for( i = 0; i < rank && index; i ++ ) { if( shape[i] == out_shape[i] ) { - offset += (uint32_t)strides[i] * ( index % out_shape[i] ); + offset += (vsi_ssize_t)strides[i] * ( index % out_shape[i] ); } index /= out_shape[i]; } @@ -78,8 +78,8 @@ DEF_KERNEL_EXECUTOR(_maximum_exec) vsi_status status = VX_SUCCESS; vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; float * buffer[_CPU_IO_NUM] = { NULL }; - size_t out_elements = 0; - size_t stride_size[_CPU_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{0}}; + vsi_size_t out_elements = 0; + vsi_size_t stride_size[_CPU_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{0}}; vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; uint32_t i; @@ -108,14 +108,14 @@ DEF_KERNEL_EXECUTOR(_maximum_exec) for (i = 0; i < out_elements; i++) { - int32_t in0_offset = 0; - int32_t in1_offset = 0; + vsi_ssize_t in0_offset = 0; + vsi_ssize_t in1_offset = 0; float val1 = 0.f; float val2 = 0.f; - in0_offset = _expand_offset( i, attr[0]->shape->data, attr[0]->shape->size, + in0_offset = _expand_offset( i, attr[0]->shape->data, (vsi_size_t)attr[0]->shape->size, stride_size[0], attr[2]->shape->data ); - in1_offset = _expand_offset( i, attr[1]->shape->data, attr[1]->shape->size, + in1_offset = _expand_offset( i, attr[1]->shape->data, (vsi_size_t)attr[1]->shape->size, stride_size[1], attr[2]->shape->data ); val1 = buffer[0][in0_offset]; diff --git a/src/tim/vx/internal/src/kernel/cpu/minimum_cpu.c b/src/tim/vx/internal/src/kernel/cpu/minimum_cpu.c index 6908a1e..cfac7cd 100644 --- a/src/tim/vx/internal/src/kernel/cpu/minimum_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/minimum_cpu.c @@ -43,21 +43,21 @@ __BEGIN_DECLS #define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) #define _KERNEL_NAME CVIVANTE_NAMESPACE("minimum_sw") -static int32_t _expand_offset +static vsi_ssize_t _expand_offset ( - int32_t index, - int32_t * shape, size_t rank, - size_t * strides, int32_t * out_shape + vsi_ssize_t index, + vsi_size_t * shape, vsi_size_t rank, + vsi_size_t * strides, vsi_size_t * out_shape ) { - uint32_t i; - int32_t offset = 0; + vsi_size_t i; + vsi_ssize_t offset = 0; for( i = 0; i < rank && index; i ++ ) { if( shape[i] == out_shape[i] ) { - offset += (uint32_t)strides[i] * ( index % out_shape[i] ); + offset += (vsi_ssize_t)strides[i] * ( index % out_shape[i] ); } index /= out_shape[i]; } @@ -74,8 +74,8 @@ DEF_KERNEL_EXECUTOR(_minimum_exec) vsi_status status = VX_SUCCESS; vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; float * buffer[_CPU_IO_NUM] = { NULL }; - size_t out_elements = 0; - 
size_t stride_size[_CPU_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{0}}; + vsi_size_t out_elements = 0; + vsi_size_t stride_size[_CPU_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{0}}; vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; uint32_t i; @@ -104,14 +104,14 @@ DEF_KERNEL_EXECUTOR(_minimum_exec) for( i = 0; i < out_elements; i ++ ) { - int32_t in0_offset = 0; - int32_t in1_offset = 0; + vsi_ssize_t in0_offset = 0; + vsi_ssize_t in1_offset = 0; float val1 = 0.f; float val2 = 0.f; - in0_offset = _expand_offset( i, attr[0]->shape->data, attr[0]->shape->size, + in0_offset = _expand_offset( i, attr[0]->shape->data, (vsi_size_t)attr[0]->shape->size, stride_size[0], attr[2]->shape->data ); - in1_offset = _expand_offset( i, attr[1]->shape->data, attr[1]->shape->size, + in1_offset = _expand_offset( i, attr[1]->shape->data, (vsi_size_t)attr[1]->shape->size, stride_size[1], attr[2]->shape->data ); val1 = buffer[0][in0_offset]; diff --git a/src/tim/vx/internal/src/kernel/cpu/moments_cpu.c b/src/tim/vx/internal/src/kernel/cpu/moments_cpu.c index ad46c58..981342c 100644 --- a/src/tim/vx/internal/src/kernel/cpu/moments_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/moments_cpu.c @@ -61,7 +61,7 @@ DEF_KERNEL_EXECUTOR(_moments_exec) float * buffer[_CPU_IO_NUM] = { NULL }; size_t out_elements = 0; vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; - uint32_t i = 0; + vsi_size_t i = 0; int32_t axis_first = 0; int32_t axis_num = 0; uint32_t mask = 0; @@ -99,23 +99,23 @@ DEF_KERNEL_EXECUTOR(_moments_exec) if(mask == 0) { - int32_t outerSize = 1; - int32_t axisSize = 1; - int32_t innerSize = 1; - int32_t inner = 0; - int32_t outer = 0; + vsi_size_t outerSize = 1; + vsi_size_t axisSize = 1; + vsi_size_t innerSize = 1; + vsi_size_t inner = 0; + vsi_size_t outer = 0; - for (i = 0; i < (uint32_t)axis_first; i++) + for (i = 0; i < (vsi_size_t)axis_first; i++) { innerSize *= attr[0]->shape->data[i]; } - for(i = 0; i < (uint32_t)axis_num; i++) + for(i = 0; i < (vsi_size_t)axis_num; i++) { axisSize *= attr[0]->shape->data[axis_first + i]; } - for (i = (uint32_t)axis_first + axis_num; i < attr[0]->shape->size; i++) + for (i = (vsi_size_t)axis_first + axis_num; i < attr[0]->shape->size; i++) { outerSize *= attr[0]->shape->data[i]; } @@ -129,7 +129,7 @@ DEF_KERNEL_EXECUTOR(_moments_exec) float mean = .0f; float vari = .0f; - for (i = 0; i < (uint32_t)axisSize; ++i) + for (i = 0; i < axisSize; ++i) { float value = buffer[0][(outer * axisSize + i) * innerSize + inner]; sum += value; @@ -144,17 +144,17 @@ DEF_KERNEL_EXECUTOR(_moments_exec) } else { - int32_t width = attr[0]->shape->data[0]; - int32_t height = attr[0]->shape->size > 1 ? attr[0]->shape->data[1] : 1; - int32_t channel = attr[0]->shape->size > 2 ? attr[0]->shape->data[2] : 1; - int32_t batch = attr[0]->shape->size > 3 ? attr[0]->shape->data[3] : 1; - int32_t width_o = attr[1]->shape->data[0]; - int32_t height_o = attr[1]->shape->size > 1 ? attr[1]->shape->data[1] : 1; - int32_t channel_o = attr[1]->shape->size > 2 ? attr[1]->shape->data[2] : 1; - int32_t b = 0, c = 0, h = 0; - int32_t wh_offset = width * height; - int32_t axisSize = width * channel; - int32_t vol = width_o * height_o * channel_o; + vsi_size_t width = attr[0]->shape->data[0]; + vsi_size_t height = attr[0]->shape->size > 1 ? attr[0]->shape->data[1] : 1; + vsi_size_t channel = attr[0]->shape->size > 2 ? attr[0]->shape->data[2] : 1; + vsi_size_t batch = attr[0]->shape->size > 3 ? 
attr[0]->shape->data[3] : 1; + vsi_size_t width_o = attr[1]->shape->data[0]; + vsi_size_t height_o = attr[1]->shape->size > 1 ? attr[1]->shape->data[1] : 1; + vsi_size_t channel_o = attr[1]->shape->size > 2 ? attr[1]->shape->data[2] : 1; + vsi_size_t b = 0, c = 0, h = 0; + vsi_size_t wh_offset = width * height; + vsi_size_t axisSize = width * channel; + vsi_size_t vol = width_o * height_o * channel_o; for(b = 0; b < batch; b++) { @@ -164,11 +164,11 @@ DEF_KERNEL_EXECUTOR(_moments_exec) float sumsq = .0f; float mean = .0f; float vari = .0f; - int h_offset = h * width; + vsi_size_t h_offset = h * width; for(c = 0; c < channel; c++) { - int offset = h_offset + c * wh_offset; - for(i = 0; i < (uint32_t)width; i++) + vsi_size_t offset = h_offset + c * wh_offset; + for(i = 0; i < width; i++) { float value = buffer[0][i + offset]; sum += value; diff --git a/src/tim/vx/internal/src/kernel/cpu/nms_cpu.c b/src/tim/vx/internal/src/kernel/cpu/nms_cpu.c index 8924f7b..62e695f 100644 --- a/src/tim/vx/internal/src/kernel/cpu/nms_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/nms_cpu.c @@ -172,8 +172,8 @@ DEF_KERNEL_EXECUTOR(_compute) vsi_nn_kernel_tensor_t output[_OUTPUT_NUM] = {NULL}; float * buffer[_INPUT_NUM] = { NULL }; float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; - size_t stride_size[_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{0}}; - size_t out_elements[_OUTPUT_NUM] = {0}; + vsi_size_t stride_size[_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{0}}; + vsi_size_t out_elements[_OUTPUT_NUM] = {0}; vsi_nn_kernel_tensor_attr_t * attr[_INPUT_NUM] = { NULL }; vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; int32_t i = 0; @@ -225,7 +225,7 @@ DEF_KERNEL_EXECUTOR(_compute) memset( f32_out_buffer[i], 0, out_elements[i] * sizeof(float) ); } - num_boxes = attr[0]->shape->data[1]; + num_boxes = (int32_t)attr[0]->shape->data[1]; boxes = buffer[0]; scores = buffer[1]; selected_indices = f32_out_buffer[0]; diff --git a/src/tim/vx/internal/src/kernel/cpu/one_hot_cpu.c b/src/tim/vx/internal/src/kernel/cpu/one_hot_cpu.c index 6a46178..146b093 100644 --- a/src/tim/vx/internal/src/kernel/cpu/one_hot_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/one_hot_cpu.c @@ -83,16 +83,16 @@ DEF_KERNEL_EXECUTOR(_compute) float * buffer[_IO_NUM] = { NULL }; size_t out_elements = 0; vsi_nn_kernel_tensor_attr_t * attr[_IO_NUM] = { NULL }; - int32_t i = 0; - int32_t j = 0; - int32_t k = 0; + vsi_size_t i = 0; + int32_t j = 0, m = 0; + vsi_size_t k = 0; int32_t index = 0; int32_t depth = 0; float on_value = 0; float off_value = 0; int32_t axis = 0; - int32_t prefix_dim_size = 1; - int32_t suffix_dim_size = 0; + vsi_size_t prefix_dim_size = 1; + vsi_size_t suffix_dim_size = 0; int32_t num_elements = 0; tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; @@ -120,9 +120,9 @@ DEF_KERNEL_EXECUTOR(_compute) axis = axis == -1 ? 
(int32_t)attr[0]->shape->size : (int32_t)attr[0]->shape->size - axis; - for (i = 0; i < axis; i++) + for (m = 0; m < axis; m++) { - prefix_dim_size *= attr[0]->shape->data[i]; + prefix_dim_size *= attr[0]->shape->data[m]; } suffix_dim_size = num_elements / prefix_dim_size; diff --git a/src/tim/vx/internal/src/kernel/cpu/poolwithargmax_cpu.c b/src/tim/vx/internal/src/kernel/cpu/poolwithargmax_cpu.c index 84b0ff8..3a8feca 100644 --- a/src/tim/vx/internal/src/kernel/cpu/poolwithargmax_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/poolwithargmax_cpu.c @@ -88,20 +88,20 @@ DEF_KERNEL_EXECUTOR(_compute) float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; - size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - size_t out_elements[_OUTPUT_NUM] = {0}; - size_t out_bytes[_OUTPUT_NUM] = {0}; - int32_t i, j, b, p; - int32_t batch, depth_v, height_o, width_o, height, width; + vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_elements[_OUTPUT_NUM] = {0}; + vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; + vsi_size_t i, j, b, p; + vsi_size_t batch, depth_v, height_o, width_o, height, width; int32_t ksize_x = 0; int32_t ksize_y = 0; int32_t stride_x = 0; int32_t stride_y = 0; int32_t pad_x = 0; int32_t pad_y = 0; - int32_t output_base = 0; - int32_t input_base = 0; - int32_t max_index = 0; + vsi_size_t output_base = 0; + vsi_size_t input_base = 0; + vsi_ssize_t max_index = 0; vsi_nn_kernel_dtype_e out1_dtype; vsi_bool is_relative_coord = FALSE; @@ -159,15 +159,15 @@ DEF_KERNEL_EXECUTOR(_compute) { for (i = 0; i < width_o; i ++) { - int32_t hstart = j * stride_y - pad_y; - int32_t wstart = i * stride_x - pad_x; - int32_t hoffset = 0; - int32_t woffset = 0; - int32_t hend = vsi_nn_min(hstart + ksize_y, height); - int32_t wend = vsi_nn_min(wstart + ksize_x, width); - int32_t pool_index = 0; - int32_t h, w = 0; - int32_t cur_index = 0; + vsi_ssize_t hstart = j * stride_y - pad_y; + vsi_ssize_t wstart = i * stride_x - pad_x; + vsi_size_t hoffset = 0; + vsi_size_t woffset = 0; + vsi_size_t hend = vsi_nn_min(hstart + ksize_y, (vsi_ssize_t)height); + vsi_size_t wend = vsi_nn_min(wstart + ksize_x, (vsi_ssize_t)width); + vsi_size_t pool_index = 0; + vsi_size_t h, w = 0; + vsi_size_t cur_index = 0; float d_f32 = 0.0f; if (hstart < 0) @@ -191,7 +191,7 @@ DEF_KERNEL_EXECUTOR(_compute) cur_index = (h - hstart + hoffset) * ksize_x + woffset; for (w = wstart; w < wend; ++ w) { - int32_t index = input_base + h * width + w; + vsi_ssize_t index = input_base + h * width + w; float d; d = f32_in_buffer[0][index]; diff --git a/src/tim/vx/internal/src/kernel/cpu/pow_cpu.c b/src/tim/vx/internal/src/kernel/cpu/pow_cpu.c index 902d40e..77d1036 100644 --- a/src/tim/vx/internal/src/kernel/cpu/pow_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/pow_cpu.c @@ -43,21 +43,21 @@ __BEGIN_DECLS #define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) #define _KERNEL_NAME CVIVANTE_NAMESPACE("pow_sw") -static int32_t _expand_offset +static vsi_ssize_t _expand_offset ( - int32_t index, - int32_t * shape, size_t rank, - size_t * strides, int32_t * out_shape + vsi_ssize_t index, + vsi_size_t * shape, vsi_size_t rank, + vsi_size_t * strides, vsi_size_t * out_shape ) { - uint32_t i = 0; - int32_t offset = 0; + vsi_size_t i = 0; + vsi_ssize_t offset = 0; for( i = 0; i < rank && index; i ++ ) { if( shape[i] == out_shape[i] ) { - offset += (int32_t)strides[i] * ( index % out_shape[i] ); + offset += (vsi_ssize_t)strides[i] 
* ( index % out_shape[i] ); } index /= out_shape[i]; } @@ -74,8 +74,8 @@ DEF_KERNEL_EXECUTOR(_pow_exec) vsi_status status = VX_SUCCESS; vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; float * buffer[_CPU_IO_NUM] = { NULL }; - size_t out_elements = 0; - size_t stride_size[_CPU_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{0}}; + vsi_size_t out_elements = 0; + vsi_size_t stride_size[_CPU_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{0}}; vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; uint32_t i = 0; @@ -107,14 +107,14 @@ DEF_KERNEL_EXECUTOR(_pow_exec) for( i = 0; i < out_elements; i ++ ) { - int32_t in0_offset = 0; - int32_t in1_offset = 0; + vsi_ssize_t in0_offset = 0; + vsi_ssize_t in1_offset = 0; float val1 = 0.f; float val2 = 0.f; - in0_offset = _expand_offset( i, attr[0]->shape->data, attr[0]->shape->size, + in0_offset = _expand_offset( i, attr[0]->shape->data, (vsi_size_t)attr[0]->shape->size, stride_size[0], attr[2]->shape->data ); - in1_offset = _expand_offset( i, attr[1]->shape->data, attr[1]->shape->size, + in1_offset = _expand_offset( i, attr[1]->shape->data, (vsi_size_t)attr[1]->shape->size, stride_size[1], attr[2]->shape->data ); val1 = buffer[0][in0_offset]; diff --git a/src/tim/vx/internal/src/kernel/cpu/pre_process_bgra_cpu.c b/src/tim/vx/internal/src/kernel/cpu/pre_process_bgra_cpu.c index d31f2fc..6a78ee9 100644 --- a/src/tim/vx/internal/src/kernel/cpu/pre_process_bgra_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/pre_process_bgra_cpu.c @@ -115,12 +115,12 @@ DEF_KERNEL_EXECUTOR(_pre_process_bgra_exec) int32_t gline1[2], gline2[2]; int32_t bline1[2], bline2[2]; int32_t dx = 0, dy = 0, dz = 0; - int32_t src_stride = attr[0]->shape->data[0]; - int32_t src_width = src_stride / elementSize; - int32_t src_height = attr[0]->shape->data[1]; - int32_t dst_width = trans ? attr[1]->shape->data[1] : attr[1]->shape->data[0]; - int32_t dst_height = trans ? attr[1]->shape->data[2] : attr[1]->shape->data[1]; - int32_t stride = dst_width * dst_height; + int32_t src_stride = (int32_t)attr[0]->shape->data[0]; + int32_t src_width = (int32_t)(src_stride / elementSize); + int32_t src_height = (int32_t)attr[0]->shape->data[1]; + int32_t dst_width = (int32_t)(trans ? attr[1]->shape->data[1] : attr[1]->shape->data[0]); + int32_t dst_height = (int32_t)(trans ? 
attr[1]->shape->data[2] : attr[1]->shape->data[1]); + int32_t stride = (int32_t)(dst_width * dst_height); int32_t bOffset = 0; int32_t gOffset = 1 * stride; int32_t rOffset = 2 * stride; @@ -235,8 +235,8 @@ DEF_KERNEL_EXECUTOR(_pre_process_bgra_exec) if(trans) { - uint32_t shape[] = {attr[1]->shape->data[0], attr[1]->shape->data[1], attr[1]->shape->data[2], 1}; - uint32_t perm[] = {1, 2, 0, 3}; + vsi_size_t shape[] = {attr[1]->shape->data[0], attr[1]->shape->data[1], attr[1]->shape->data[2], 1}; + vsi_size_t perm[] = {1, 2, 0, 3}; vsi_nn_Transpose((uint8_t*)outBuffer, (uint8_t*)buffer[1], shape, (uint32_t)attr[1]->shape->size, perm, VSI_NN_TYPE_FLOAT32); diff --git a/src/tim/vx/internal/src/kernel/cpu/pre_process_gray_cpu.c b/src/tim/vx/internal/src/kernel/cpu/pre_process_gray_cpu.c index c615f68..088b6a5 100644 --- a/src/tim/vx/internal/src/kernel/cpu/pre_process_gray_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/pre_process_gray_cpu.c @@ -95,10 +95,10 @@ DEF_KERNEL_EXECUTOR(_pre_process_gray_exec) { int32_t line1[2], line2[2]; int32_t dx = 0, dy = 0, dz = 0; - int32_t src_width = attr[0]->shape->data[0]; - int32_t src_height = attr[0]->shape->data[1]; - int32_t dst_width = attr[1]->shape->data[0]; - int32_t dst_height = attr[1]->shape->data[1]; + int32_t src_width = (int32_t)attr[0]->shape->data[0]; + int32_t src_height = (int32_t)attr[0]->shape->data[1]; + int32_t dst_width = (int32_t)attr[1]->shape->data[0]; + int32_t dst_height = (int32_t)attr[1]->shape->data[1]; uint8_t result = 0; for ( dz = 0; dz < 1; dz ++) diff --git a/src/tim/vx/internal/src/kernel/cpu/pre_process_nv12_cpu.c b/src/tim/vx/internal/src/kernel/cpu/pre_process_nv12_cpu.c index 1e4d48d..78c7cf5 100644 --- a/src/tim/vx/internal/src/kernel/cpu/pre_process_nv12_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/pre_process_nv12_cpu.c @@ -117,11 +117,10 @@ DEF_KERNEL_EXECUTOR(_pre_process_nv12_exec) { int32_t dx, dy, dz; - int32_t src_width = attr[0]->shape->data[0]; - int32_t src_height = attr[0]->shape->data[1]; - int32_t dst_width = trans ? attr[2]->shape->data[1] : attr[2]->shape->data[0]; - int32_t dst_height = trans ? attr[2]->shape->data[2] : attr[2]->shape->data[1]; - int32_t stride = dst_width * dst_height; + int32_t src_width = (int32_t)attr[0]->shape->data[0]; + int32_t dst_width = (int32_t)(trans ? attr[2]->shape->data[1] : attr[2]->shape->data[0]); + int32_t dst_height = (int32_t)(trans ? 
attr[2]->shape->data[2] : attr[2]->shape->data[1]); + int32_t stride = (int32_t)(dst_width * dst_height); int32_t rOffset = 0; int32_t gOffset = 1 * stride; int32_t bOffset = 2 * stride; @@ -132,9 +131,10 @@ DEF_KERNEL_EXECUTOR(_pre_process_nv12_exec) float* src_y_slice = NULL; float* src_uv_yScanline = NULL; - - uint32_t xrIntFloat_16 = (src_width << 16) / dst_width + 1; - uint32_t yrIntFloat_16 = (src_height << 16) / dst_height + 1; + uint32_t roi_width = (xRatio * dst_width) >> 15; + uint32_t roi_height = (yRatio * dst_height) >> 15; + uint32_t xrIntFloat_16 = (roi_width << 16) / dst_width + 1; + uint32_t yrIntFloat_16 = (roi_height << 16) / dst_height + 1; uint32_t srcy = 0, srcx = 0; if(attr[2]->dtype == I8) @@ -207,8 +207,8 @@ DEF_KERNEL_EXECUTOR(_pre_process_nv12_exec) if(trans) { - uint32_t shape[] = {attr[2]->shape->data[0], attr[2]->shape->data[1], attr[2]->shape->data[2], 1}; - uint32_t perm[] = {1, 2, 0, 3}; + vsi_size_t shape[] = {attr[2]->shape->data[0], attr[2]->shape->data[1], attr[2]->shape->data[2], 1}; + vsi_size_t perm[] = {1, 2, 0, 3}; vsi_nn_Transpose((uint8_t*)outBuffer, (uint8_t*)buffer[2], shape, (uint32_t)attr[2]->shape->size, perm, VSI_NN_TYPE_FLOAT32); diff --git a/src/tim/vx/internal/src/kernel/cpu/pre_process_rgb_cpu.c b/src/tim/vx/internal/src/kernel/cpu/pre_process_rgb_cpu.c index 972172f..b505d0c 100644 --- a/src/tim/vx/internal/src/kernel/cpu/pre_process_rgb_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/pre_process_rgb_cpu.c @@ -114,12 +114,12 @@ DEF_KERNEL_EXECUTOR(_pre_process_rgb_exec) int32_t gline1[2], gline2[2]; int32_t bline1[2], bline2[2]; int32_t dx = 0, dy = 0, dz = 0; - int32_t src_stride = attr[0]->shape->data[0]; - int32_t src_width = src_stride / 3; - int32_t src_height = attr[0]->shape->data[1]; - int32_t dst_width = trans ? attr[1]->shape->data[1] : attr[1]->shape->data[0]; - int32_t dst_height = trans ? attr[1]->shape->data[2] : attr[1]->shape->data[1]; - int32_t stride = dst_width * dst_height; + int32_t src_stride = (int32_t)attr[0]->shape->data[0]; + int32_t src_width = (int32_t)(src_stride / 3); + int32_t src_height = (int32_t)attr[0]->shape->data[1]; + int32_t dst_width = (int32_t)(trans ? attr[1]->shape->data[1] : attr[1]->shape->data[0]); + int32_t dst_height = (int32_t)(trans ? 
attr[1]->shape->data[2] : attr[1]->shape->data[1]); + int32_t stride = (int32_t)(dst_width * dst_height); int32_t rOffset = 0; int32_t gOffset = 1 * stride; int32_t bOffset = 2 * stride; @@ -234,8 +234,8 @@ DEF_KERNEL_EXECUTOR(_pre_process_rgb_exec) if(trans) { - uint32_t shape[] = {attr[1]->shape->data[0], attr[1]->shape->data[1], attr[1]->shape->data[2], 1}; - uint32_t perm[] = {1, 2, 0, 3}; + vsi_size_t shape[] = {attr[1]->shape->data[0], attr[1]->shape->data[1], attr[1]->shape->data[2], 1}; + vsi_size_t perm[] = {1, 2, 0, 3}; vsi_nn_Transpose((uint8_t*)outBuffer, (uint8_t*)buffer[1], shape, (uint32_t)attr[1]->shape->size, perm, VSI_NN_TYPE_FLOAT32); diff --git a/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv420_cpu.c b/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv420_cpu.c index 8132778..efd2b60 100644 --- a/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv420_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv420_cpu.c @@ -126,12 +126,12 @@ DEF_KERNEL_EXECUTOR(_pre_process_yuv420_exec) uint8_t gline1[2], gline2[2]; uint8_t bline1[2], bline2[2]; int32_t dx, dy, dz; - int32_t src_width = attr[0]->shape->data[0]; - int32_t src_height = attr[0]->shape->data[1]; + int32_t src_width = (int32_t)attr[0]->shape->data[0]; + int32_t src_height = (int32_t)attr[0]->shape->data[1]; int32_t subWidth = src_width >> 1; int32_t subHeight = src_height >> 1; - int32_t dst_width = trans ? attr[3]->shape->data[1] : attr[3]->shape->data[0]; - int32_t dst_height = trans ? attr[3]->shape->data[2] : attr[3]->shape->data[1]; + int32_t dst_width = (int32_t)(trans ? attr[3]->shape->data[1] : attr[3]->shape->data[0]); + int32_t dst_height = (int32_t)(trans ? attr[3]->shape->data[2] : attr[3]->shape->data[1]); int32_t stride = dst_width * dst_height; int32_t rOffset = 0; int32_t gOffset = 1 * stride; @@ -281,8 +281,8 @@ DEF_KERNEL_EXECUTOR(_pre_process_yuv420_exec) if(trans) { - uint32_t shape[] = {attr[3]->shape->data[0], attr[3]->shape->data[1], attr[3]->shape->data[2], 1}; - uint32_t perm[] = {1, 2, 0, 3}; + vsi_size_t shape[] = {attr[3]->shape->data[0], attr[3]->shape->data[1], attr[3]->shape->data[2], 1}; + vsi_size_t perm[] = {1, 2, 0, 3}; vsi_nn_Transpose((uint8_t*)outBuffer, (uint8_t*)buffer[3], shape, (uint32_t)attr[3]->shape->size, perm, VSI_NN_TYPE_FLOAT32); diff --git a/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv444_cpu.c b/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv444_cpu.c index a19e5ae..c5e8d6e 100644 --- a/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv444_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/pre_process_yuv444_cpu.c @@ -126,10 +126,10 @@ DEF_KERNEL_EXECUTOR(_pre_process_yuv444_exec) uint8_t gline1[2], gline2[2]; uint8_t bline1[2], bline2[2]; int32_t dx, dy, dz; - int32_t src_width = attr[0]->shape->data[0]; - int32_t src_height = attr[0]->shape->data[1]; - int32_t dst_width = trans ? attr[3]->shape->data[1] : attr[3]->shape->data[0]; - int32_t dst_height = trans ? attr[3]->shape->data[2] : attr[3]->shape->data[1]; + int32_t src_width = (int32_t)attr[0]->shape->data[0]; + int32_t src_height = (int32_t)attr[0]->shape->data[1]; + int32_t dst_width = (int32_t)(trans ? attr[3]->shape->data[1] : attr[3]->shape->data[0]); + int32_t dst_height = (int32_t)(trans ? 
attr[3]->shape->data[2] : attr[3]->shape->data[1]); int32_t stride = dst_width * dst_height; int32_t rOffset = 0; int32_t gOffset = 1 * stride; @@ -274,8 +274,8 @@ DEF_KERNEL_EXECUTOR(_pre_process_yuv444_exec) if(trans) { - uint32_t shape[] = {attr[3]->shape->data[0], attr[3]->shape->data[1], attr[3]->shape->data[2], 1}; - uint32_t perm[] = {1, 2, 0, 3}; + vsi_size_t shape[] = {attr[3]->shape->data[0], attr[3]->shape->data[1], attr[3]->shape->data[2], 1}; + vsi_size_t perm[] = {1, 2, 0, 3}; vsi_nn_Transpose((uint8_t*)outBuffer, (uint8_t*)buffer[3], shape, (uint32_t)attr[3]->shape->size, perm, VSI_NN_TYPE_FLOAT32); diff --git a/src/tim/vx/internal/src/kernel/cpu/prelu_cpu.c b/src/tim/vx/internal/src/kernel/cpu/prelu_cpu.c index b7e97c2..94b64d6 100644 --- a/src/tim/vx/internal/src/kernel/cpu/prelu_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/prelu_cpu.c @@ -43,21 +43,21 @@ __BEGIN_DECLS #define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) #define _KERNEL_NAME CVIVANTE_NAMESPACE("prelu_sw") -static int32_t _expand_offset +static vsi_ssize_t _expand_offset ( - int32_t index, - int32_t * shape, size_t rank, - size_t * strides, int32_t * out_shape + vsi_ssize_t index, + vsi_size_t * shape, vsi_size_t rank, + vsi_size_t * strides, vsi_size_t * out_shape ) { - uint32_t i; - int32_t offset = 0; + vsi_size_t i; + vsi_ssize_t offset = 0; for( i = 0; i < rank && index; i ++ ) { if( shape[i] == out_shape[i] ) { - offset += (int32_t)strides[i] * ( index % out_shape[i] ); + offset += (vsi_ssize_t)strides[i] * ( index % out_shape[i] ); } index /= out_shape[i]; } @@ -74,8 +74,8 @@ DEF_KERNEL_EXECUTOR(_prelu_exec) vsi_status status = VX_SUCCESS; vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; float * buffer[_CPU_IO_NUM] = { NULL }; - size_t out_elements = 0; - size_t stride_size[_CPU_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{0}}; + vsi_size_t out_elements = 0; + vsi_size_t stride_size[_CPU_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{0}}; vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; uint32_t i; @@ -104,14 +104,14 @@ DEF_KERNEL_EXECUTOR(_prelu_exec) for( i = 0; i < out_elements; i ++ ) { - int32_t in0_offset = 0; - int32_t in1_offset = 0; + vsi_ssize_t in0_offset = 0; + vsi_ssize_t in1_offset = 0; float val1 = 0.f; float val2 = 0.f; - in0_offset = _expand_offset( i, attr[0]->shape->data, attr[0]->shape->size, + in0_offset = _expand_offset( i, attr[0]->shape->data, (vsi_size_t)attr[0]->shape->size, stride_size[0], attr[2]->shape->data ); - in1_offset = _expand_offset( i, attr[1]->shape->data, attr[1]->shape->size, + in1_offset = _expand_offset( i, attr[1]->shape->data, (vsi_size_t)attr[1]->shape->size, stride_size[1], attr[2]->shape->data ); val1 = buffer[0][in0_offset]; diff --git a/src/tim/vx/internal/src/kernel/cpu/random_multinomial_cpu.c b/src/tim/vx/internal/src/kernel/cpu/random_multinomial_cpu.c index 15d1b51..8d3e8b5 100644 --- a/src/tim/vx/internal/src/kernel/cpu/random_multinomial_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/random_multinomial_cpu.c @@ -91,8 +91,8 @@ DEF_KERNEL_EXECUTOR(_compute) vsi_status status = VSI_FAILURE; vsi_nn_kernel_tensor_t tensors[_CPU_IO_NUM] = { NULL }; float * buffer[_CPU_IO_NUM] = { NULL }; - size_t out_elements = 0; - size_t stride_size[_CPU_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{0}}; + vsi_size_t out_elements = 0; + vsi_size_t stride_size[_CPU_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{0}}; vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; uint32_t *random_integer = NULL; float *random_float = NULL; @@ -114,9 +114,9 @@ DEF_KERNEL_EXECUTOR(_compute) attr[2] 
= vsi_nn_kernel_tensor_attr_create( tensors[2] ); CHECK_PTR_FAIL_GOTO( attr[2], "Create tensor attr buffer fail.", final ); - sample_num = attr[2]->shape->data[0]; - batch = attr[0]->shape->data[1]; - class_size = attr[0]->shape->data[0]; + sample_num = (int32_t)attr[2]->shape->data[0]; + batch = (int32_t)attr[0]->shape->data[1]; + class_size = (int32_t)attr[0]->shape->data[0]; vsi_nn_kernel_tensor_attr_get_stride( attr[0], stride_size[0] ); vsi_nn_kernel_tensor_attr_get_stride( attr[1], stride_size[1] ); diff --git a/src/tim/vx/internal/src/kernel/cpu/reduceall_internal_cpu.c b/src/tim/vx/internal/src/kernel/cpu/reduceall_internal_cpu.c index de13bba..a994f5b 100644 --- a/src/tim/vx/internal/src/kernel/cpu/reduceall_internal_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/reduceall_internal_cpu.c @@ -77,16 +77,16 @@ DEF_KERNEL_EXECUTOR(_compute) float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; - size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - size_t out_elements[_OUTPUT_NUM] = {0}; - size_t out_bytes[_OUTPUT_NUM] = {0}; + vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_elements[_OUTPUT_NUM] = {0}; + vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; uint32_t i; int32_t axis = 0; - int32_t outerSize = 1; - int32_t axisSize = 1; - int32_t innerSize = 1; - int32_t inner = 0; - int32_t outer = 0; + vsi_ssize_t outerSize = 1; + vsi_ssize_t axisSize = 1; + vsi_ssize_t innerSize = 1; + vsi_ssize_t inner = 0; + vsi_ssize_t outer = 0; int32_t all_result = 0; for(i = 0; i < _INPUT_NUM; i ++) diff --git a/src/tim/vx/internal/src/kernel/cpu/reduceany_internal_cpu.c b/src/tim/vx/internal/src/kernel/cpu/reduceany_internal_cpu.c index 25cfc86..b15437d 100644 --- a/src/tim/vx/internal/src/kernel/cpu/reduceany_internal_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/reduceany_internal_cpu.c @@ -77,16 +77,16 @@ DEF_KERNEL_EXECUTOR(_compute) float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; - size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - size_t out_elements[_OUTPUT_NUM] = {0}; - size_t out_bytes[_OUTPUT_NUM] = {0}; + vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_elements[_OUTPUT_NUM] = {0}; + vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; uint32_t i; int32_t axis = 0; - int32_t outerSize = 1; - int32_t axisSize = 1; - int32_t innerSize = 1; - int32_t inner = 0; - int32_t outer = 0; + vsi_ssize_t outerSize = 1; + vsi_ssize_t axisSize = 1; + vsi_ssize_t innerSize = 1; + vsi_ssize_t inner = 0; + vsi_ssize_t outer = 0; int32_t any_result = 0; for(i = 0; i < _INPUT_NUM; i ++) diff --git a/src/tim/vx/internal/src/kernel/cpu/reducemax_internal_cpu.c b/src/tim/vx/internal/src/kernel/cpu/reducemax_internal_cpu.c index 643e126..06479eb 100644 --- a/src/tim/vx/internal/src/kernel/cpu/reducemax_internal_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/reducemax_internal_cpu.c @@ -77,16 +77,16 @@ DEF_KERNEL_EXECUTOR(_compute) float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; - size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - size_t out_elements[_OUTPUT_NUM] = {0}; - size_t out_bytes[_OUTPUT_NUM] = {0}; + vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_elements[_OUTPUT_NUM] = {0}; + 
vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; uint32_t i; int32_t axis = 0; - int32_t outerSize = 1; - int32_t axisSize = 1; - int32_t innerSize = 1; - int32_t inner = 0; - int32_t outer = 0; + vsi_ssize_t outerSize = 1; + vsi_ssize_t axisSize = 1; + vsi_ssize_t innerSize = 1; + vsi_ssize_t inner = 0; + vsi_ssize_t outer = 0; float maxValue = 0.0f; for(i = 0; i < _INPUT_NUM; i ++) diff --git a/src/tim/vx/internal/src/kernel/cpu/reducemin_internal_cpu.c b/src/tim/vx/internal/src/kernel/cpu/reducemin_internal_cpu.c index 8f3728d..22f9b68 100644 --- a/src/tim/vx/internal/src/kernel/cpu/reducemin_internal_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/reducemin_internal_cpu.c @@ -77,16 +77,16 @@ DEF_KERNEL_EXECUTOR(_compute) float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; - size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - size_t out_elements[_OUTPUT_NUM] = {0}; - size_t out_bytes[_OUTPUT_NUM] = {0}; + vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_elements[_OUTPUT_NUM] = {0}; + vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; uint32_t i; int32_t axis = 0; - int32_t outerSize = 1; - int32_t axisSize = 1; - int32_t innerSize = 1; - int32_t inner = 0; - int32_t outer = 0; + vsi_ssize_t outerSize = 1; + vsi_ssize_t axisSize = 1; + vsi_ssize_t innerSize = 1; + vsi_ssize_t inner = 0; + vsi_ssize_t outer = 0; float minValue = 0.0f; for(i = 0; i < _INPUT_NUM; i ++) diff --git a/src/tim/vx/internal/src/kernel/cpu/reduceprod_internal_cpu.c b/src/tim/vx/internal/src/kernel/cpu/reduceprod_internal_cpu.c index 3e59616..d5cd781 100644 --- a/src/tim/vx/internal/src/kernel/cpu/reduceprod_internal_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/reduceprod_internal_cpu.c @@ -76,16 +76,16 @@ DEF_KERNEL_EXECUTOR(_compute) float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; - size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - size_t out_elements[_OUTPUT_NUM] = {0}; - size_t out_bytes[_OUTPUT_NUM] = {0}; + vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_elements[_OUTPUT_NUM] = {0}; + vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; uint32_t i; int32_t axis = 0; - int32_t outerSize = 1; - int32_t axisSize = 1; - int32_t innerSize = 1; - int32_t inner = 0; - int32_t outer = 0; + vsi_ssize_t outerSize = 1; + vsi_ssize_t axisSize = 1; + vsi_ssize_t innerSize = 1; + vsi_ssize_t inner = 0; + vsi_ssize_t outer = 0; float prodValue = 0.0f; for(i = 0; i < _INPUT_NUM; i ++) diff --git a/src/tim/vx/internal/src/kernel/cpu/relu_keras_cpu.c b/src/tim/vx/internal/src/kernel/cpu/relu_keras_cpu.c index ecedffe..ffee85d 100644 --- a/src/tim/vx/internal/src/kernel/cpu/relu_keras_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/relu_keras_cpu.c @@ -81,9 +81,9 @@ DEF_KERNEL_EXECUTOR(_compute) float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; - size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - size_t out_elements[_OUTPUT_NUM] = {0}; - size_t out_bytes[_OUTPUT_NUM] = {0}; + vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_elements[_OUTPUT_NUM] = {0}; + vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; uint32_t i; float alpha = 0.0f; float max_value = 0.0f; diff --git a/src/tim/vx/internal/src/kernel/cpu/repeat_cpu.c 
b/src/tim/vx/internal/src/kernel/cpu/repeat_cpu.c index ceb1684..f434ad1 100644 --- a/src/tim/vx/internal/src/kernel/cpu/repeat_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/repeat_cpu.c @@ -61,12 +61,12 @@ DEF_KERNEL_EXECUTOR(_repeat_exec) float * buffer[_CPU_IO_NUM] = { NULL }; size_t out_elements = 0; vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; - int32_t i = 0, j = 0, b = 0, c = 0; + vsi_ssize_t i = 0, j = 0, b = 0, c = 0; int32_t axis = 0; - int32_t outerSize = 1; - int32_t outIdx = 0; - int32_t width = 0, height = 0, channel = 0, batch = 0; - int32_t spatial = 0, vol = 0; + vsi_ssize_t outerSize = 1; + vsi_ssize_t outIdx = 0; + vsi_ssize_t width = 0, height = 0, channel = 0, batch = 0; + vsi_ssize_t spatial = 0, vol = 0; tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; tensors[1] = (vsi_nn_kernel_tensor_t)param[1]; @@ -126,8 +126,8 @@ DEF_KERNEL_EXECUTOR(_repeat_exec) { for(i = 0; i < height; i++) { - int32_t len = (int32_t)buffer[1][i]; - int32_t offset = i * width + c * spatial + b * vol; + vsi_ssize_t len = (int32_t)buffer[1][i]; + vsi_ssize_t offset = i * width + c * spatial + b * vol; for(j = 0; j < len; j++) { memcpy(buffer[2] + outIdx, buffer[0] + offset, sizeof(float) * width); @@ -145,12 +145,12 @@ DEF_KERNEL_EXECUTOR(_repeat_exec) { for(i = 0; i < height; i++) { - int32_t offset = i * width + c * spatial + b * vol; + vsi_ssize_t offset = i * width + c * spatial + b * vol; for(j = 0; j < width; j++) { - int32_t len = (int32_t)buffer[1][j]; + vsi_ssize_t len = (vsi_ssize_t)buffer[1][j]; float data = buffer[0][offset + j]; - int32_t k = 0; + vsi_ssize_t k = 0; for(k = 0; k < len; k++) { buffer[2][outIdx++] = data; @@ -166,8 +166,8 @@ DEF_KERNEL_EXECUTOR(_repeat_exec) { for(c = 0; c < channel; c++) { - int32_t len = (int32_t)buffer[1][c]; - int32_t offset = c * spatial + b * vol; + vsi_ssize_t len = (vsi_ssize_t)buffer[1][c]; + vsi_ssize_t offset = c * spatial + b * vol; for(j = 0; j < len; j++) { diff --git a/src/tim/vx/internal/src/kernel/cpu/resize_1d_bilinear_cpu.c b/src/tim/vx/internal/src/kernel/cpu/resize_1d_bilinear_cpu.c index df91d90..161d2c2 100644 --- a/src/tim/vx/internal/src/kernel/cpu/resize_1d_bilinear_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/resize_1d_bilinear_cpu.c @@ -79,19 +79,19 @@ DEF_KERNEL_EXECUTOR(_compute) float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; - size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - size_t out_elements[_OUTPUT_NUM] = {0}; - size_t out_bytes[_OUTPUT_NUM] = {0}; + vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_elements[_OUTPUT_NUM] = {0}; + vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; uint32_t i = 0; int32_t align_corners = 0; int32_t half_pixel_centers = 0; float width_scale = 1.0f; - uint32_t input_width = 0, output_width = 0; + vsi_size_t input_width = 0, output_width = 0; uint32_t w = 0, out = 0; - uint32_t output_dims = 0; + vsi_size_t output_dims = 0; float data00 = .0f, data01 = .0f, interpolation = .0f; - uint32_t index = 0; - uint32_t outer = 0; + vsi_size_t index = 0; + vsi_size_t outer = 0; /* prepare data */ for (i = 0; i < _INPUT_NUM; i ++) @@ -118,7 +118,7 @@ DEF_KERNEL_EXECUTOR(_compute) vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_HALF_PIXEL], &(half_pixel_centers)); input_width = in_attr[0]->shape->data[0]; output_width = out_attr[0]->shape->data[0]; - output_dims = (uint32_t)out_attr[0]->shape->size; + output_dims = 
(vsi_size_t)out_attr[0]->shape->size; if (align_corners && output_width > 1) { @@ -138,13 +138,13 @@ DEF_KERNEL_EXECUTOR(_compute) for (out = 0; out < outer; out++) { - vx_int32 input_base = out * input_width; - vx_int32 output_base = out * output_width; + vsi_ssize_t input_base = out * input_width; + vsi_ssize_t output_base = out * output_width; for (w = 0; w < output_width; w ++) { vx_float32 input_w; - vx_int32 w0; - vx_int32 w1; + vsi_ssize_t w0; + vsi_ssize_t w1; if (half_pixel_centers) { input_w = ((vx_float32)w + 0.5f) * width_scale - 0.5f; @@ -153,8 +153,8 @@ DEF_KERNEL_EXECUTOR(_compute) { input_w = w * width_scale; } - w0 = (vx_int32)input_w; - w1 = input_w < 0 ? 0 : vsi_nn_min(w0 + 1, (vx_int32)(input_width - 1)); + w0 = (vsi_ssize_t)input_w; + w1 = input_w < 0 ? 0 : vsi_nn_min(w0 + 1, (vsi_ssize_t)(input_width - 1)); index = input_base + w0; data00 = f32_in_buffer[0][index]; index = input_base + w1; diff --git a/src/tim/vx/internal/src/kernel/cpu/resize_1d_nearest_cpu.c b/src/tim/vx/internal/src/kernel/cpu/resize_1d_nearest_cpu.c index 44e45a7..3f500bd 100644 --- a/src/tim/vx/internal/src/kernel/cpu/resize_1d_nearest_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/resize_1d_nearest_cpu.c @@ -79,17 +79,17 @@ DEF_KERNEL_EXECUTOR(_compute) float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; - size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - size_t out_elements[_OUTPUT_NUM] = {0}; - size_t out_bytes[_OUTPUT_NUM] = {0}; + vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_elements[_OUTPUT_NUM] = {0}; + vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; uint32_t i = 0; int32_t align_corners = 0; int32_t half_pixel_centers = 0; float width_scale = 1.0f; - uint32_t input_width = 0, output_width = 0; - uint32_t w = 0, out = 0; - uint32_t output_dims = 0; - uint32_t outer = 0; + vsi_size_t input_width = 0, output_width = 0; + vsi_size_t w = 0, out = 0; + vsi_size_t output_dims = 0; + vsi_size_t outer = 0; /* prepare data */ for (i = 0; i < _INPUT_NUM; i ++) { @@ -135,15 +135,15 @@ DEF_KERNEL_EXECUTOR(_compute) for (out = 0; out < outer; out++) { - vx_int32 input_base = out * input_width; - vx_int32 output_base = out * output_width; + vsi_ssize_t input_base = out * input_width; + vsi_ssize_t output_base = out * output_width; for (w = 0; w < output_width; w ++) { float input_w; - uint32_t in_x; - int32_t in_index; - int32_t out_index; + vsi_size_t in_x; + vsi_ssize_t in_index; + vsi_ssize_t out_index; if (half_pixel_centers) { @@ -155,11 +155,11 @@ DEF_KERNEL_EXECUTOR(_compute) } if (align_corners) { - in_x = vsi_nn_min((uint32_t)simple_round(input_w), input_width - 1); + in_x = vsi_nn_min((vsi_size_t)simple_round(input_w), input_width - 1); } else { - in_x = vsi_nn_min((uint32_t)floorf(input_w), input_width - 1); + in_x = vsi_nn_min((vsi_size_t)floorf(input_w), input_width - 1); } in_index = in_x + input_base; out_index = w + output_base; diff --git a/src/tim/vx/internal/src/kernel/cpu/resize_bilinear_cpu.c b/src/tim/vx/internal/src/kernel/cpu/resize_bilinear_cpu.c index f735695..f133568 100644 --- a/src/tim/vx/internal/src/kernel/cpu/resize_bilinear_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/resize_bilinear_cpu.c @@ -79,23 +79,23 @@ DEF_KERNEL_EXECUTOR(_compute) float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; - size_t 
out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - size_t out_elements[_OUTPUT_NUM] = {0}; - size_t out_bytes[_OUTPUT_NUM] = {0}; + vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_elements[_OUTPUT_NUM] = {0}; + vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; uint32_t i; int32_t align_corners; int32_t half_pixel_centers; float width_scale; float height_scale; - uint32_t input_width, output_width, input_height, output_height; - uint32_t b = 0, d = 0, w = 0, h = 0; - uint32_t output_depth, input_depth; - uint32_t output_batch; - uint32_t output_dims, input_dims; + vsi_size_t input_width, output_width, input_height, output_height; + vsi_size_t b = 0, d = 0, w = 0, h = 0; + vsi_size_t output_depth, input_depth; + vsi_size_t output_batch; + vsi_size_t output_dims, input_dims; float data00 = .0f, data01 = .0f, data10 = .0f, data11 = .0f, interpolation = .0f; - uint32_t input_width_orig; - uint32_t output_width_orig; - uint32_t index; + vsi_size_t input_width_orig; + vsi_size_t output_width_orig; + vsi_size_t index; /* prepare data */ for(i = 0; i < _INPUT_NUM; i ++) @@ -124,10 +124,10 @@ DEF_KERNEL_EXECUTOR(_compute) input_height = in_attr[0]->shape->data[1]; output_width = out_attr[0]->shape->data[0]; output_height = out_attr[0]->shape->data[1]; - output_dims = (uint32_t)out_attr[0]->shape->size; + output_dims = (vsi_size_t)out_attr[0]->shape->size; output_depth = output_dims > 2 ? out_attr[0]->shape->data[2] : 1; output_batch = output_dims > 3 ? out_attr[0]->shape->data[3] : 1; - input_dims = (uint32_t)in_attr[0]->shape->size; + input_dims = (vsi_size_t)in_attr[0]->shape->size; input_depth = input_dims > 2 ? in_attr[0]->shape->data[2] : 1; input_width_orig = input_width; output_width_orig = output_width; @@ -154,16 +154,16 @@ DEF_KERNEL_EXECUTOR(_compute) { for (d = 0; d < output_depth; d ++) { - vx_int32 input_base = b * input_depth * input_width_orig * input_height \ + vsi_ssize_t input_base = b * input_depth * input_width_orig * input_height \ + d * input_width_orig * input_height; - vx_int32 output_base = b * output_depth * output_width_orig * output_height \ + vsi_ssize_t output_base = b * output_depth * output_width_orig * output_height \ + d * output_width_orig * output_height; for (h = 0; h < output_height; h ++) { vx_float32 input_h = h * height_scale; - vx_uint32 h0; - vx_uint32 h1; + vsi_size_t h0; + vsi_size_t h1; if (half_pixel_centers) { @@ -173,13 +173,13 @@ DEF_KERNEL_EXECUTOR(_compute) { input_h = h * height_scale; } - h0 = (vx_int32)input_h; + h0 = (vsi_size_t)input_h; h1 = input_h < 0 ? 0 : vsi_nn_min(h0 + 1, input_height - 1); for (w = 0; w < output_width; w ++) { vx_float32 input_w; - vx_int32 w0; - vx_int32 w1; + vsi_ssize_t w0; + vsi_ssize_t w1; if (half_pixel_centers) { input_w = ((vx_float32)w + 0.5f) * width_scale - 0.5f; @@ -188,8 +188,8 @@ DEF_KERNEL_EXECUTOR(_compute) { input_w = w * width_scale; } - w0 = (vx_int32)input_w; - w1 = input_w < 0 ? 0 : vsi_nn_min(w0 + 1, (vx_int32)(input_width - 1)); + w0 = (vsi_ssize_t)input_w; + w1 = input_w < 0 ? 
0 : vsi_nn_min(w0 + 1, (vsi_ssize_t)(input_width - 1)); index = input_base + h0 * input_width_orig + w0; data00 = f32_in_buffer[0][index]; index = input_base + h0 * input_width_orig + w1; diff --git a/src/tim/vx/internal/src/kernel/cpu/resize_nearest_cpu.c b/src/tim/vx/internal/src/kernel/cpu/resize_nearest_cpu.c index 7b2aeda..ba0039c 100644 --- a/src/tim/vx/internal/src/kernel/cpu/resize_nearest_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/resize_nearest_cpu.c @@ -79,21 +79,21 @@ DEF_KERNEL_EXECUTOR(_compute) float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; - size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - size_t out_elements[_OUTPUT_NUM] = {0}; - size_t out_bytes[_OUTPUT_NUM] = {0}; + vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_elements[_OUTPUT_NUM] = {0}; + vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; uint32_t i; int32_t align_corners; int32_t half_pixel_centers; float width_scale; float height_scale; - uint32_t input_width, output_width, input_height, output_height; - uint32_t b = 0, d = 0, w = 0, h = 0; - uint32_t output_depth, input_depth; - uint32_t output_batch; - uint32_t output_dims, input_dims; - uint32_t input_width_orig; - uint32_t output_width_orig; + vsi_size_t input_width, output_width, input_height, output_height; + vsi_size_t b = 0, d = 0, w = 0, h = 0; + vsi_size_t output_depth, input_depth; + vsi_size_t output_batch; + vsi_size_t output_dims, input_dims; + vsi_size_t input_width_orig; + vsi_size_t output_width_orig; /* prepare data */ for(i = 0; i < _INPUT_NUM; i ++) @@ -122,10 +122,10 @@ DEF_KERNEL_EXECUTOR(_compute) input_height = in_attr[0]->shape->data[1]; output_width = out_attr[0]->shape->data[0]; output_height = out_attr[0]->shape->data[1]; - output_dims = (uint32_t)out_attr[0]->shape->size; + output_dims = (vsi_size_t)out_attr[0]->shape->size; output_depth = output_dims > 2 ? out_attr[0]->shape->data[2] : 1; output_batch = output_dims > 3 ? out_attr[0]->shape->data[3] : 1; - input_dims = (uint32_t)in_attr[0]->shape->size; + input_dims = (vsi_size_t)in_attr[0]->shape->size; input_depth = input_dims > 2 ? 
in_attr[0]->shape->data[2] : 1; input_width_orig = input_width; output_width_orig = output_width; @@ -152,15 +152,15 @@ DEF_KERNEL_EXECUTOR(_compute) { for (d = 0; d < output_depth; d ++) { - int32_t input_base = b * input_depth * input_width_orig * input_height \ + vsi_ssize_t input_base = b * input_depth * input_width_orig * input_height \ + d * input_width_orig * input_height; - int32_t output_base = b * output_depth * output_width_orig * output_height \ + vsi_ssize_t output_base = b * output_depth * output_width_orig * output_height \ + d * output_width_orig * output_height; for (h = 0; h < output_height; h ++) { float input_h; - uint32_t in_y; + vsi_size_t in_y; if (half_pixel_centers) { @@ -172,19 +172,19 @@ DEF_KERNEL_EXECUTOR(_compute) } if (align_corners) { - in_y = vsi_nn_min((uint32_t)simple_round(input_h), input_height - 1); + in_y = vsi_nn_min((vsi_size_t)simple_round(input_h), input_height - 1); } else { - in_y = vsi_nn_min((uint32_t)floorf(input_h), input_height - 1); + in_y = vsi_nn_min((vsi_size_t)floorf(input_h), input_height - 1); } for (w = 0; w < output_width; w ++) { float input_w; - uint32_t in_x; - int32_t in_index; - int32_t out_index; + vsi_size_t in_x; + vsi_ssize_t in_index; + vsi_ssize_t out_index; if (half_pixel_centers) { @@ -196,11 +196,11 @@ DEF_KERNEL_EXECUTOR(_compute) } if (align_corners) { - in_x = vsi_nn_min((uint32_t)simple_round(input_w), input_width - 1); + in_x = vsi_nn_min((vsi_size_t)simple_round(input_w), input_width - 1); } else { - in_x = vsi_nn_min((uint32_t)floorf(input_w), input_width - 1); + in_x = vsi_nn_min((vsi_size_t)floorf(input_w), input_width - 1); } in_index = in_x + in_y * input_width_orig + input_base; out_index = w + h * output_width_orig + output_base; diff --git a/src/tim/vx/internal/src/kernel/cpu/roi_align_cpu.c b/src/tim/vx/internal/src/kernel/cpu/roi_align_cpu.c index 2aa18cd..3ec3b56 100644 --- a/src/tim/vx/internal/src/kernel/cpu/roi_align_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/roi_align_cpu.c @@ -152,9 +152,9 @@ DEF_KERNEL_EXECUTOR(_compute) float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; - size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - size_t out_elements[_OUTPUT_NUM] = {0}; - size_t out_bytes[_OUTPUT_NUM] = {0}; + vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_elements[_OUTPUT_NUM] = {0}; + vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; uint32_t i = 0; float width_scale = 0.0f; float height_scale = 0.0f; @@ -163,12 +163,12 @@ DEF_KERNEL_EXECUTOR(_compute) int32_t width_sample_num = 0; int32_t height_sample_num = 0; uint32_t n = 0; - uint32_t num_rois = 0; - int32_t inHeight = 0; - int32_t inWidth = 0; - int32_t inDepth = 0; - int32_t outHeight = 0; - int32_t outWidth = 0; + vsi_size_t num_rois = 0; + vsi_ssize_t inHeight = 0; + vsi_ssize_t inWidth = 0; + vsi_ssize_t inDepth = 0; + vsi_ssize_t outHeight = 0; + vsi_ssize_t outWidth = 0; uint32_t kRoiDim = 4; uint32_t out_index = 0; @@ -228,7 +228,7 @@ DEF_KERNEL_EXECUTOR(_compute) float bin_size_x = roi_dims_x / outWidth; float bin_size_y = roi_dims_y / outHeight; - int32_t batch_base_index = batchId * inHeight * inWidth * inDepth; + vsi_ssize_t batch_base_index = batchId * inHeight * inWidth * inDepth; int32_t ch = 0; int32_t py = 0; int32_t px = 0; @@ -255,7 +255,7 @@ DEF_KERNEL_EXECUTOR(_compute) float out_val = 0; out_val = _roi_align_1x1( - input_ptr, inWidth, inHeight, region_start_x, bin_size_x, + 
input_ptr, (int32_t)inWidth, (int32_t)inHeight, region_start_x, bin_size_x, roi_bin_grid_x, region_end_x, region_start_y, bin_size_y, roi_bin_grid_y, region_end_y); diff --git a/src/tim/vx/internal/src/kernel/cpu/scatter_nd_cpu.c b/src/tim/vx/internal/src/kernel/cpu/scatter_nd_cpu.c index 62c7ff0..b610ac2 100644 --- a/src/tim/vx/internal/src/kernel/cpu/scatter_nd_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/scatter_nd_cpu.c @@ -95,9 +95,9 @@ DEF_KERNEL_EXECUTOR(_scatter_nd_exec) if(coord_dim <= 3) { - int32_t stride[3] = {0, 0, 0}; - int32_t new_shape[3] = {1, 1, 1}; - int32_t merge_dim = (int32_t)attr[2]->shape->size - coord_dim + 1; + vsi_ssize_t stride[3] = {0, 0, 0}; + vsi_ssize_t new_shape[3] = {1, 1, 1}; + vsi_ssize_t merge_dim = (vsi_ssize_t)attr[2]->shape->size - coord_dim + 1; for(i = 0; i < merge_dim; ++i) { @@ -115,7 +115,7 @@ DEF_KERNEL_EXECUTOR(_scatter_nd_exec) for(i = 0; i < indices_num; i++) { uint32_t in_index = i * block_size; - uint32_t out_index = 0; + vsi_size_t out_index = 0; uint32_t coord[3] = {0}; int32_t byd_flg = 0; diff --git a/src/tim/vx/internal/src/kernel/cpu/scatter_nd_update_cpu.c b/src/tim/vx/internal/src/kernel/cpu/scatter_nd_update_cpu.c index d5e7c39..3156df2 100644 --- a/src/tim/vx/internal/src/kernel/cpu/scatter_nd_update_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/scatter_nd_update_cpu.c @@ -107,9 +107,9 @@ DEF_KERNEL_EXECUTOR(_scatter_nd_update_exec) if (coord_dim <= 5) { - int32_t stride[5] = {0, 0, 0, 0, 0}; - int32_t new_shape[5] = {1, 1, 1, 1, 1}; - int32_t merge_dim = (int32_t)attr[3]->shape->size - coord_dim + 1; + vsi_ssize_t stride[5] = {0, 0, 0, 0, 0}; + vsi_ssize_t new_shape[5] = {1, 1, 1, 1, 1}; + vsi_ssize_t merge_dim = (vsi_ssize_t)attr[3]->shape->size - coord_dim + 1; for(i = 0; i < merge_dim; ++i) { @@ -127,10 +127,10 @@ DEF_KERNEL_EXECUTOR(_scatter_nd_update_exec) for(i = 0; i < indices_num; i++) { uint32_t in_index = i * block_size; - uint32_t out_index = 0; + vsi_size_t out_index = 0; uint32_t coord[5] = {0}; int32_t byd_flg = 0; - int32_t mask_idx = 0; + vsi_ssize_t mask_idx = 0; for(j = 0; j < coord_dim; j++) { diff --git a/src/tim/vx/internal/src/kernel/cpu/select_cpu.c b/src/tim/vx/internal/src/kernel/cpu/select_cpu.c index d6804bd..ca4ff58 100644 --- a/src/tim/vx/internal/src/kernel/cpu/select_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/select_cpu.c @@ -58,21 +58,21 @@ static vx_param_description_t _select_kernel_param_def[] = }; #define _SELECT_PARAM_NUM _cnt_of_array( _select_kernel_param_def ) -static int32_t _expand_offset +static vsi_ssize_t _expand_offset ( - int32_t index, - int32_t * shape, size_t rank, - size_t * strides, int32_t * out_shape + vsi_ssize_t index, + vsi_size_t * shape, vsi_size_t rank, + vsi_size_t * strides, vsi_size_t * out_shape ) { - uint32_t i; - int32_t offset = 0; + vsi_size_t i; + vsi_ssize_t offset = 0; for( i = 0; i < rank && index; i ++ ) { if( shape[i] == out_shape[i] ) { - offset += (uint32_t)strides[i] * ( index % out_shape[i] ); + offset += (vsi_ssize_t)strides[i] * ( index % out_shape[i] ); } index /= out_shape[i]; } @@ -96,10 +96,10 @@ DEF_KERNEL_EXECUTOR(_compute) float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; - size_t in_stride_size[_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - size_t out_elements[_OUTPUT_NUM] = {0}; - size_t out_bytes[_OUTPUT_NUM] = {0}; + vsi_size_t in_stride_size[_INPUT_NUM][VSI_NN_MAX_DIM_NUM] = 
{{1}}; + vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_elements[_OUTPUT_NUM] = {0}; + vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; uint32_t i; /* prepare data */ @@ -126,15 +126,15 @@ DEF_KERNEL_EXECUTOR(_compute) for (i = 0; i < out_elements[0]; i++) { - int32_t in0_offset = 0; - int32_t in1_offset = 0; - int32_t in2_offset = 0; + vsi_ssize_t in0_offset = 0; + vsi_ssize_t in1_offset = 0; + vsi_ssize_t in2_offset = 0; - in0_offset = _expand_offset( i, in_attr[0]->shape->data, in_attr[0]->shape->size, + in0_offset = _expand_offset( i, in_attr[0]->shape->data, (vsi_size_t)in_attr[0]->shape->size, in_stride_size[0], out_attr[0]->shape->data ); - in1_offset = _expand_offset( i, in_attr[1]->shape->data, in_attr[1]->shape->size, + in1_offset = _expand_offset( i, in_attr[1]->shape->data, (vsi_size_t)in_attr[1]->shape->size, in_stride_size[1], out_attr[0]->shape->data ); - in2_offset = _expand_offset( i, in_attr[2]->shape->data, in_attr[2]->shape->size, + in2_offset = _expand_offset( i, in_attr[2]->shape->data, (vsi_size_t)in_attr[2]->shape->size, in_stride_size[2], out_attr[0]->shape->data ); f32_out_buffer[0][i] = (f32_in_buffer[0][in0_offset]) ? diff --git a/src/tim/vx/internal/src/kernel/cpu/sequence_mask_cpu.c b/src/tim/vx/internal/src/kernel/cpu/sequence_mask_cpu.c index 9790537..b4b0e66 100644 --- a/src/tim/vx/internal/src/kernel/cpu/sequence_mask_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/sequence_mask_cpu.c @@ -78,14 +78,14 @@ DEF_KERNEL_EXECUTOR(_sequence_mask_exec) memset( buffer, 0, out_elements * sizeof(float) ); { - uint32_t j = 0; - uint32_t height = attr[1]->shape->data[1]; - uint32_t width = attr[1]->shape->data[0]; + vsi_size_t j = 0; + vsi_size_t height = attr[1]->shape->data[1]; + vsi_size_t width = attr[1]->shape->data[0]; for(j = 0; j < height; j++) { - uint32_t idx_in = (uint32_t)buffer_in[j]; - uint32_t out_offset = j * width; + vsi_size_t idx_in = (vsi_size_t)buffer_in[j]; + vsi_size_t out_offset = j * width; idx_in = idx_in > width ? 
width : idx_in; for(i = 0; i < idx_in; i++) { @@ -152,12 +152,12 @@ static int32_t _optimize_mask_shape vsi_nn_tensor_t ** inputs, vsi_nn_tensor_t ** outputs, int32_t max_len, - int32_t* opt_shape_in, - int32_t* opt_shape_out + vsi_size_t* opt_shape_in, + vsi_size_t* opt_shape_out ) { vsi_status status = VSI_SUCCESS; - int32_t out_size = 1; + vsi_size_t out_size = 1; uint32_t i = 0; opt_shape_in[0] = 1; opt_shape_in[1] = 1; @@ -197,7 +197,7 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t backend_params[_CPU_PARAM_NUM] = {NULL}; vsi_nn_kernel_node_t node = NULL; vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL; - int32_t new_shape[2][VSI_NN_MAX_DIM_NUM] = {{ 1, 1, 1, 1 }}; + vsi_size_t new_shape[2][VSI_NN_MAX_DIM_NUM] = {{ 1, 1, 1, 1 }}; int32_t max_len = vsi_nn_kernel_param_get_int32( params, "max_len" ); status = _optimize_mask_shape(inputs, outputs, max_len, new_shape[0], new_shape[1]); diff --git a/src/tim/vx/internal/src/kernel/cpu/signal_frame_cpu.c b/src/tim/vx/internal/src/kernel/cpu/signal_frame_cpu.c index a13aee1..b4379d1 100644 --- a/src/tim/vx/internal/src/kernel/cpu/signal_frame_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/signal_frame_cpu.c @@ -82,9 +82,9 @@ DEF_KERNEL_EXECUTOR(_compute) float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; - size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - size_t out_elements[_OUTPUT_NUM] = {0}; - size_t out_bytes[_OUTPUT_NUM] = {0}; + vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_elements[_OUTPUT_NUM] = {0}; + vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; int32_t i = 0; int32_t j = 0; int32_t k = 0; @@ -92,11 +92,11 @@ DEF_KERNEL_EXECUTOR(_compute) int32_t frame_step = 0; int32_t axis = 0; int32_t pad_end = 0; - int32_t length_samples = 0; - int32_t num_frames = 0; - int32_t inner_dim = 1; - int32_t outer_dim = 1; - int32_t inner_size = 1; + vsi_ssize_t length_samples = 0; + vsi_ssize_t num_frames = 0; + vsi_ssize_t inner_dim = 1; + vsi_ssize_t outer_dim = 1; + vsi_ssize_t inner_size = 1; float pad_val = 0; /* prepare data */ diff --git a/src/tim/vx/internal/src/kernel/cpu/slice_cpu.c b/src/tim/vx/internal/src/kernel/cpu/slice_cpu.c index 8307152..9b326dd 100644 --- a/src/tim/vx/internal/src/kernel/cpu/slice_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/slice_cpu.c @@ -77,19 +77,19 @@ DEF_KERNEL_EXECUTOR(_compute) float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; - size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - size_t out_elements[_OUTPUT_NUM] = {0}; - size_t out_bytes[_OUTPUT_NUM] = {0}; + vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_elements[_OUTPUT_NUM] = {0}; + vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; int32_t rank = 0; int32_t i = 0; - int32_t in_w = 0; - int32_t in_h = 0; - int32_t in_c = 0; - int32_t in_b = 0; - int32_t start[4] = {0}; - int32_t stop[4] = {0}; - int32_t in_size[4] = {1, 1, 1, 1}; - int32_t out_size[4] = {1, 1, 1, 1}; + vsi_ssize_t in_w = 0; + vsi_ssize_t in_h = 0; + vsi_ssize_t in_c = 0; + vsi_ssize_t in_b = 0; + vsi_ssize_t start[4] = {0}; + vsi_ssize_t stop[4] = {0}; + vsi_ssize_t in_size[4] = {1, 1, 1, 1}; + vsi_ssize_t out_size[4] = {1, 1, 1, 1}; float *input_ptr = NULL; float *output_ptr = NULL; int32_t dstIdx = 0; @@ -123,13 +123,13 @@ DEF_KERNEL_EXECUTOR(_compute) 
out_size[i] = out_attr[0]->shape->data[i]; } - start[0] = (int32_t)f32_in_buffer[1][0]; + start[0] = (vsi_ssize_t)f32_in_buffer[1][0]; stop[0] = start[0] + out_attr[0]->shape->data[0]; - start[1] = rank < 2 ? 0 : (int32_t)f32_in_buffer[1][1]; + start[1] = rank < 2 ? 0 : (vsi_ssize_t)f32_in_buffer[1][1]; stop[1] = rank < 2 ? 1 : start[1] + out_size[1]; - start[2] = rank < 3 ? 0 : (int32_t)f32_in_buffer[1][2]; + start[2] = rank < 3 ? 0 : (vsi_ssize_t)f32_in_buffer[1][2]; stop[2] = rank < 3 ? 1 : start[2] + out_size[2]; - start[3] = rank < 4 ? 0 : (int32_t)f32_in_buffer[1][3]; + start[3] = rank < 4 ? 0 : (vsi_ssize_t)f32_in_buffer[1][3]; stop[3] = rank < 4 ? 1 : start[3] + out_size[3]; input_ptr = f32_in_buffer[0]; output_ptr = f32_out_buffer[0]; @@ -142,7 +142,7 @@ DEF_KERNEL_EXECUTOR(_compute) { for (in_w = start[0]; in_w < stop[0]; ++in_w) { - int32_t srcIdx = ((in_b * in_size[2] + in_c) * in_size[1] + in_h) * in_size[0] + in_w; + vsi_ssize_t srcIdx = ((in_b * in_size[2] + in_c) * in_size[1] + in_h) * in_size[0] + in_w; output_ptr[dstIdx ++] = input_ptr[srcIdx]; } } diff --git a/src/tim/vx/internal/src/kernel/cpu/space2depth_internal_cpu.c b/src/tim/vx/internal/src/kernel/cpu/space2depth_internal_cpu.c index a9170c7..1d2770d 100644 --- a/src/tim/vx/internal/src/kernel/cpu/space2depth_internal_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/space2depth_internal_cpu.c @@ -88,20 +88,20 @@ DEF_KERNEL_EXECUTOR(_space2depth_internal_exec) memset( buffer[1], 0, out_elements * sizeof(float) ); { - uint32_t output_depth = attr[1]->shape->data[2]; - uint32_t output_height = attr[1]->shape->data[1]; - uint32_t output_width = attr[1]->shape->data[0]; - uint32_t input_batch = attr[0]->shape->size > 3 ? attr[0]->shape->data[3] : 1; - uint32_t input_depth = attr[0]->shape->data[2]; - uint32_t input_height = attr[0]->shape->data[1]; - uint32_t input_width = attr[0]->shape->data[0]; - uint32_t batch = 0, in_h = 0, in_w = 0; + vsi_size_t output_depth = attr[1]->shape->data[2]; + vsi_size_t output_height = attr[1]->shape->data[1]; + vsi_size_t output_width = attr[1]->shape->data[0]; + vsi_size_t input_batch = attr[0]->shape->size > 3 ? 
attr[0]->shape->data[3] : 1; + vsi_size_t input_depth = attr[0]->shape->data[2]; + vsi_size_t input_height = attr[0]->shape->data[1]; + vsi_size_t input_width = attr[0]->shape->data[0]; + vsi_size_t batch = 0, in_h = 0, in_w = 0; for (batch = 0; batch < input_batch; ++ batch) { - uint32_t output_batch_index = batch * output_height * output_width * output_depth; - uint32_t input_batch_index = batch * input_height * input_width * input_depth; - uint32_t in_d = 0; + vsi_size_t output_batch_index = batch * output_height * output_width * output_depth; + vsi_size_t input_batch_index = batch * input_height * input_width * input_depth; + vsi_size_t in_d = 0; for (in_d = 0; in_d < input_depth; in_d ++) { @@ -109,14 +109,14 @@ DEF_KERNEL_EXECUTOR(_space2depth_internal_exec) { for (in_w = 0; in_w < input_width; in_w ++) { - uint32_t out_w = in_w / block_size_x; - uint32_t out_h = in_h / block_size_y; - uint32_t out_d = (in_w % block_size_x) * input_depth + vsi_size_t out_w = in_w / block_size_x; + vsi_size_t out_h = in_h / block_size_y; + vsi_size_t out_d = (in_w % block_size_x) * input_depth + (in_h % block_size_y) * block_size_x * input_depth + in_d; - uint32_t in_index = in_w + in_h * input_width + vsi_size_t in_index = in_w + in_h * input_width + in_d * input_height * input_width + input_batch_index; - uint32_t out_index = out_w + out_h * output_width + vsi_size_t out_index = out_w + out_h * output_width + out_d * output_width * output_height + output_batch_index; buffer[1][out_index] = buffer[0][in_index]; diff --git a/src/tim/vx/internal/src/kernel/cpu/spatial_transformer_cpu.c b/src/tim/vx/internal/src/kernel/cpu/spatial_transformer_cpu.c index 4dd9d59..5d8ed88 100644 --- a/src/tim/vx/internal/src/kernel/cpu/spatial_transformer_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/spatial_transformer_cpu.c @@ -94,8 +94,8 @@ static float _read_pixel(float *base, vsi_nn_kernel_tensor_attr_t *attr, float x, float y, int32_t z, int32_t b) { vsi_bool out_of_bounds = (x < 0 || y < 0 || x >= attr->shape->data[0] || y >= attr->shape->data[1]); - int32_t bx, by; - int32_t offset = (b * attr->shape->data[2] + z) * attr->shape->data[0] * attr->shape->data[1]; + vsi_ssize_t bx, by; + vsi_ssize_t offset = (b * attr->shape->data[2] + z) * attr->shape->data[0] * attr->shape->data[1]; float pixel = 0; if (out_of_bounds) @@ -103,8 +103,8 @@ static float _read_pixel(float *base, vsi_nn_kernel_tensor_attr_t *attr, return 0; } // bounded x/y - bx = (int32_t)x; - by = (int32_t)y; + bx = (vsi_ssize_t)x; + by = (vsi_ssize_t)y; pixel = base[attr->shape->data[0] * by + bx + offset]; @@ -128,9 +128,9 @@ DEF_KERNEL_EXECUTOR(_compute) float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; - size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - size_t out_elements[_OUTPUT_NUM] = {0}; - size_t out_bytes[_OUTPUT_NUM] = {0}; + vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_elements[_OUTPUT_NUM] = {0}; + vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; int32_t i = 0; int32_t b = 0; int32_t c = 0; @@ -138,12 +138,12 @@ DEF_KERNEL_EXECUTOR(_compute) int32_t x = 0; int32_t y = 0; int32_t has_theta[6] = {0}; - int32_t batch = 1; - int32_t depth = 1; - int32_t height = 1; - int32_t width = 1; - int32_t input_height = 1; - int32_t input_width = 1; + vsi_ssize_t batch = 1; + vsi_ssize_t depth = 1; + vsi_ssize_t height = 1; + vsi_ssize_t width = 1; + vsi_ssize_t input_height = 1; + 
vsi_ssize_t input_width = 1; int32_t rank = 0; int32_t index = 0; int32_t align_corners = 0; diff --git a/src/tim/vx/internal/src/kernel/cpu/swish_cpu.c b/src/tim/vx/internal/src/kernel/cpu/swish_cpu.c index 5b09ff7..f17e876 100644 --- a/src/tim/vx/internal/src/kernel/cpu/swish_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/swish_cpu.c @@ -80,9 +80,9 @@ DEF_KERNEL_EXECUTOR(_swish_compute) float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; - size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{0}}; - size_t out_elements[_OUTPUT_NUM] = {0}; - size_t out_bytes[_OUTPUT_NUM] = {0}; + vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{0}}; + vsi_size_t out_elements[_OUTPUT_NUM] = {0}; + vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; float beta = 1.0f; uint32_t i; @@ -162,8 +162,8 @@ DEF_KERNEL_EXECUTOR(_hswish_compute) float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; - size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{0}}; - size_t out_elements[_OUTPUT_NUM] = {0}; + vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{0}}; + vsi_size_t out_elements[_OUTPUT_NUM] = {0}; uint32_t i; /* prepare data */ diff --git a/src/tim/vx/internal/src/kernel/cpu/tensorstackconcat_cpu.c b/src/tim/vx/internal/src/kernel/cpu/tensorstackconcat_cpu.c index 7ffc7d8..ae98921 100644 --- a/src/tim/vx/internal/src/kernel/cpu/tensorstackconcat_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/tensorstackconcat_cpu.c @@ -78,10 +78,10 @@ DEF_KERNEL_EXECUTOR(_compute) vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; size_t out_elements[_OUTPUT_NUM] = {0}; uint32_t i = 0; - uint32_t depth = 0; - uint32_t height = 1; - uint32_t width = 0; - uint32_t index = 0; + vsi_size_t depth = 0; + vsi_size_t height = 1; + vsi_size_t width = 0; + vsi_size_t index = 0; uint32_t c = 0, y = 0, x = 0; /* prepare data */ @@ -105,7 +105,7 @@ DEF_KERNEL_EXECUTOR(_compute) depth = in_attr[0]->shape->data[2]; height = in_attr[0]->shape->data[1]; width = in_attr[0]->shape->data[0]; - index = (int32_t)f32_in_buffer[1][0]; + index = (vsi_size_t)f32_in_buffer[1][0]; for (c = 0; c < depth; c++) { @@ -113,8 +113,8 @@ DEF_KERNEL_EXECUTOR(_compute) { for (x = 0; x < width; x++) { - int32_t i_idx = c * width * height + y * width + x; - int32_t o_idx = (c * out_attr[0]->shape->data[1] + index ) * out_attr[0]->shape->data[0] + x; + vsi_ssize_t i_idx = c * width * height + y * width + x; + vsi_ssize_t o_idx = (c * out_attr[0]->shape->data[1] + index ) * out_attr[0]->shape->data[0] + x; float value = f32_in_buffer[0][i_idx]; f32_out_buffer[0][o_idx] = value; diff --git a/src/tim/vx/internal/src/kernel/cpu/tile_cpu.c b/src/tim/vx/internal/src/kernel/cpu/tile_cpu.c index 90729c7..343dbe4 100644 --- a/src/tim/vx/internal/src/kernel/cpu/tile_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/tile_cpu.c @@ -43,7 +43,7 @@ __BEGIN_DECLS #define _CPU_PARAM_NUM (_CPU_ARG_NUM + _CPU_IO_NUM) #define _KERNEL_NAME CVIVANTE_NAMESPACE("tile_sw") -void copyMultipleTimes(const float* in_data, int32_t in_size, int32_t multiplier, float* out_data) +void copyMultipleTimes(const float* in_data, vsi_size_t in_size, int32_t multiplier, float* out_data) { int i = 0; @@ -54,13 +54,13 @@ void copyMultipleTimes(const float* in_data, int32_t in_size, int32_t multiplier } } -void tileOneDimension(const vsi_int_array_t* input_shape, const float* in_data, +void 
tileOneDimension(const vsi_size_array_t* input_shape, const float* in_data, const uint32_t* multipliers, float* out_data, int dimension, - int *stride_size, int *tiled_stride_size) + vsi_size_t *stride_size, vsi_size_t *tiled_stride_size) { - int i = 0; - const int dimension_size = input_shape->data[dimension]; - int total_stride_size = 0, total_tiled_stride_size = 0; + vsi_size_t i = 0; + const vsi_size_t dimension_size = input_shape->data[dimension]; + vsi_ssize_t total_stride_size = 0, total_tiled_stride_size = 0; const float* copy_from_data = in_data; float* copy_to_data = out_data; @@ -103,9 +103,9 @@ DEF_KERNEL_EXECUTOR(_tile_exec) float * buffer[_CPU_IO_NUM] = { NULL }; size_t out_elements = 0; vsi_nn_kernel_tensor_attr_t * attr[_CPU_IO_NUM] = { NULL }; - uint32_t i = 0; + vsi_size_t i = 0; uint32_t multiples[VSI_NN_MAX_DIM_NUM] = {0}; - int stride_size = 0, tiled_stride_size = 0; + vsi_size_t stride_size = 0, tiled_stride_size = 0; tensors[0] = (vsi_nn_kernel_tensor_t)param[0]; @@ -125,7 +125,7 @@ DEF_KERNEL_EXECUTOR(_tile_exec) for (i = 0; i < attr[0]->shape->size; i++) { - multiples[i] = attr[1]->shape->data[i] / attr[0]->shape->data[i]; + multiples[i] = (uint32_t)(attr[1]->shape->data[i] / attr[0]->shape->data[i]); } tileOneDimension(attr[0]->shape, buffer[0], multiples, buffer[1], diff --git a/src/tim/vx/internal/src/kernel/cpu/topk_cpu.c b/src/tim/vx/internal/src/kernel/cpu/topk_cpu.c index a2062c8..78c7752 100644 --- a/src/tim/vx/internal/src/kernel/cpu/topk_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/topk_cpu.c @@ -132,9 +132,9 @@ DEF_KERNEL_EXECUTOR(_compute) float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; - size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - size_t out_elements[_OUTPUT_NUM] = {0}; - size_t out_bytes[_OUTPUT_NUM] = {0}; + vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_elements[_OUTPUT_NUM] = {0}; + vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; uint32_t i = 0; int32_t j = 0; int32_t top_k = 0; @@ -166,8 +166,8 @@ DEF_KERNEL_EXECUTOR(_compute) status = vsi_nn_kernel_scalar_read_int32( param[3], &top_k ); CHECK_STATUS_FAIL_GOTO(status, final ); - block_num = in_attr[0]->shape->data[1]; - block_size = in_attr[0]->shape->data[0]; + block_num = (uint32_t)in_attr[0]->shape->data[1]; + block_size = (uint32_t)in_attr[0]->shape->data[0]; indices_ptr = (uint32_t*)malloc(block_size * sizeof(uint32_t)); CHECK_PTR_FAIL_GOTO( indices_ptr, "Create indices buffer fail.", final ); diff --git a/src/tim/vx/internal/src/kernel/cpu/upsample_cpu.c b/src/tim/vx/internal/src/kernel/cpu/upsample_cpu.c index a4b8801..2b745ec 100644 --- a/src/tim/vx/internal/src/kernel/cpu/upsample_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/upsample_cpu.c @@ -81,13 +81,13 @@ DEF_KERNEL_EXECUTOR(_compute) float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM] = {NULL}; vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM] = {NULL}; - size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - size_t out_elements[_OUTPUT_NUM] = {0}; - size_t out_bytes[_OUTPUT_NUM] = {0}; - int32_t i, j, b, p; - int32_t batch, depth, height, width, height_o, width_o; - int32_t input_base = 0; - int32_t output_base = 0; + vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_elements[_OUTPUT_NUM] = {0}; + vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; + vsi_ssize_t i, j, b, p; + vsi_ssize_t batch, 
depth, height, width, height_o, width_o; + vsi_ssize_t input_base = 0; + vsi_ssize_t output_base = 0; int32_t ksize_x = 0; int32_t ksize_y = 0; vsi_bool is_relative_coord = FALSE; @@ -144,14 +144,14 @@ DEF_KERNEL_EXECUTOR(_compute) { for (i = 0; i < width; i ++) { - int32_t in_index = input_base + j * width + i; + vsi_ssize_t in_index = input_base + j * width + i; float in_value = f32_in_buffer[0][in_index]; - int32_t up_index = (int32_t)f32_in_buffer[1][in_index]; - int32_t out_index = up_index; + vsi_ssize_t up_index = (vsi_ssize_t)f32_in_buffer[1][in_index]; + vsi_ssize_t out_index = up_index; if (is_relative_coord) { - int32_t relative_y = up_index / ksize_x; - int32_t relative_x = up_index % ksize_x; + vsi_ssize_t relative_y = up_index / ksize_x; + vsi_ssize_t relative_x = up_index % ksize_x; out_index = output_base + ((j * ksize_y) + relative_y) * width_o + i * ksize_x + relative_x; } f32_out_buffer[0][out_index] = in_value; diff --git a/src/tim/vx/internal/src/kernel/cpu/upsamplescale_cpu.c b/src/tim/vx/internal/src/kernel/cpu/upsamplescale_cpu.c index e8b49f9..980b7c4 100644 --- a/src/tim/vx/internal/src/kernel/cpu/upsamplescale_cpu.c +++ b/src/tim/vx/internal/src/kernel/cpu/upsamplescale_cpu.c @@ -80,19 +80,19 @@ DEF_KERNEL_EXECUTOR(_compute) float *f32_out_buffer[_OUTPUT_NUM] = {NULL}; vsi_nn_kernel_tensor_attr_t *in_attr[_INPUT_NUM]; vsi_nn_kernel_tensor_attr_t *out_attr[_OUTPUT_NUM]; - size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; - size_t out_elements[_OUTPUT_NUM] = {0}; - size_t out_bytes[_OUTPUT_NUM] = {0}; - int32_t i = 0; + vsi_size_t out_stride_size[_OUTPUT_NUM][VSI_NN_MAX_DIM_NUM] = {{1}}; + vsi_size_t out_elements[_OUTPUT_NUM] = {0}; + vsi_size_t out_bytes[_OUTPUT_NUM] = {0}; + vsi_ssize_t i = 0; int32_t stride = 0; float scale = 0.0f; - int32_t width = 0; - int32_t height = 0; - int32_t out_width = 0; - int32_t out_height = 0; - int32_t outerSize = 1; - int32_t x = 0; - int32_t y = 0; + vsi_ssize_t width = 0; + vsi_ssize_t height = 0; + vsi_ssize_t out_width = 0; + vsi_ssize_t out_height = 0; + vsi_ssize_t outerSize = 1; + vsi_ssize_t x = 0; + vsi_ssize_t y = 0; /* prepare data */ for(i = 0; i < _INPUT_NUM; i ++) @@ -120,7 +120,7 @@ DEF_KERNEL_EXECUTOR(_compute) width = in_attr[0]->shape->data[0]; height = in_attr[0]->shape->data[1]; - for (i = 2; i < (int32_t)in_attr[0]->shape->size; i++) + for (i = 2; i < (vsi_ssize_t)in_attr[0]->shape->size; i++) { outerSize *= in_attr[0]->shape->data[i]; } @@ -134,18 +134,18 @@ DEF_KERNEL_EXECUTOR(_compute) { for (x = 0; x < width; x++) { - int32_t in_idx = i * width * height + y * width + x; - int32_t base_idx = i * out_width * out_height + vsi_ssize_t in_idx = i * width * height + y * width + x; + vsi_ssize_t base_idx = i * out_width * out_height + y * stride * out_width + x * stride; - int32_t dx = 0; - int32_t dy = 0; + vsi_ssize_t dx = 0; + vsi_ssize_t dy = 0; float data = f32_in_buffer[0][in_idx] * scale; for (dy = 0; dy < stride; dy++) { for (dx = 0; dx < stride; dx++) { - int32_t idx = base_idx + dy * out_width + dx; + vsi_ssize_t idx = base_idx + dy * out_width + dx; f32_out_buffer[0][idx] = data; } diff --git a/src/tim/vx/internal/src/kernel/evis/a_times_b_plus_c_evis.c b/src/tim/vx/internal/src/kernel/evis/a_times_b_plus_c_evis.c index b634604..c241e1e 100644 --- a/src/tim/vx/internal/src/kernel/evis/a_times_b_plus_c_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/a_times_b_plus_c_evis.c @@ -125,7 +125,7 @@ DEF_KERNEL_INITIALIZER(_a_times_b_plus_c_initializer) uint32_t i = 0; 
vsi_nn_kernel_tensor_attr_t * attr[4] = { NULL, NULL, NULL, NULL }; - vsi_int_array_t *output_shape = NULL; + vsi_size_array_t *output_shape = NULL; uint32_t pack_key = 0; @@ -322,10 +322,10 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t node_params[_A_TIMES_B_PLUS_C_PARAM_NUM] = {NULL}; vsi_bool image_2d = FALSE; vsi_nn_kernel_node_t node = NULL; - int32_t* shapes_in[_INPUT_NUM]; - size_t rank_in[_INPUT_NUM]; - int32_t* shapes_ptr[_IO_NUM]; - int32_t shapes[_IO_NUM][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; + vsi_size_t* shapes_in[_INPUT_NUM]; + vsi_size_t rank_in[_INPUT_NUM]; + vsi_size_t* shapes_ptr[_IO_NUM]; + vsi_size_t shapes[_IO_NUM][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; uint32_t new_rank = 0; int32_t i = 0; vsi_bool ret = FALSE; @@ -338,13 +338,13 @@ static vsi_nn_kernel_node_t _setup for (i = 0; i < _INPUT_NUM; i++) { - shapes_in[i] = (int32_t *)inputs[i]->attr.size; - rank_in[i] = (size_t)inputs[i]->attr.dim_num; + shapes_in[i] = inputs[i]->attr.size; + rank_in[i] = (vsi_size_t)inputs[i]->attr.dim_num; } ret = vsi_nn_kernel_optimize_broadcast_shape( - (const int32_t**)shapes_in, (const size_t*)rank_in, _INPUT_NUM, - (int32_t *)outputs[0]->attr.size, outputs[0]->attr.dim_num, + (const vsi_size_t**)shapes_in, rank_in, _INPUT_NUM, + outputs[0]->attr.size, outputs[0]->attr.dim_num, shapes_ptr, shapes[_INPUT_NUM], &new_rank); if( ret ) @@ -352,13 +352,13 @@ static vsi_nn_kernel_node_t _setup for (i = 0; i < _INPUT_NUM; i++) { reshape_tensors[i] = vsi_nn_reshape_tensor( graph, - inputs[i], (uint32_t*)shapes[i], new_rank ); + inputs[i], shapes[i], new_rank ); } for (i = 0; i < _OUTPUT_NUM; i++) { reshape_tensors[i + _INPUT_NUM] = vsi_nn_reshape_tensor( graph, - outputs[i], (uint32_t*)shapes[i + _INPUT_NUM], new_rank ); + outputs[i], shapes[i + _INPUT_NUM], new_rank ); } } else @@ -366,7 +366,7 @@ static vsi_nn_kernel_node_t _setup return NULL; } - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)reshape_tensors[_INPUT_NUM]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[_INPUT_NUM]->attr.size, reshape_tensors[_INPUT_NUM]->attr.dim_num ) ) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/evis/add_mean_std_norm_evis.c b/src/tim/vx/internal/src/kernel/evis/add_mean_std_norm_evis.c index 97aa183..74dfc35 100644 --- a/src/tim/vx/internal/src/kernel/evis/add_mean_std_norm_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/add_mean_std_norm_evis.c @@ -105,7 +105,7 @@ DEF_KERNEL_INITIALIZER(_add_mean_std_norm_initializer) vsi_nn_kernel_dtype_e input_dtype = F16; vsi_nn_kernel_dtype_e output_dtype = F16; vsi_nn_kernel_tensor_attr_t *input0_attr = NULL, *input1_attr = NULL, *output_attr = NULL; - vsi_int_array_t *input_shape = NULL; + vsi_size_array_t *input_shape = NULL; float scaleIn = 1.0f; int32_t input_ZP = 0; float scaleIn1 = 1.0f; @@ -351,7 +351,7 @@ DEF_KERNEL_INITIALIZER(_add_mean_std_norm_initializer) status |= vsi_nn_kernel_gpu_add_param(node, "inScale1_i16", &inScale_dfp1); CHECK_STATUS_FAIL_GOTO(status, final ); } - width = input_shape->data[0]; + width = (int32_t)input_shape->data[0]; status = vsi_nn_kernel_gpu_add_param(node, "width", &width); status |= vsi_nn_kernel_gpu_add_param(node, "dimRatio", &dimRatio); status |= vsi_nn_kernel_gpu_add_param(node, "rsEps", &rsEps); diff --git a/src/tim/vx/internal/src/kernel/evis/argmax_evis.c b/src/tim/vx/internal/src/kernel/evis/argmax_evis.c index f7ad8f2..a8cec94 100644 --- a/src/tim/vx/internal/src/kernel/evis/argmax_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/argmax_evis.c @@ -162,8 +162,8 @@ 
DEF_KERNEL_INITIALIZER(_argmax_initializer) int32_t axis = 0; uint32_t argLenSub1 = 0; vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL, NULL }; - vsi_int_array_t * input_shape = NULL; - vsi_int_array_t * output_shape = NULL; + vsi_size_array_t * input_shape = NULL; + vsi_size_array_t * output_shape = NULL; uint32_t packedArgIdx[4] = {0}; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); @@ -179,14 +179,14 @@ DEF_KERNEL_INITIALIZER(_argmax_initializer) if (axis == 2 && input_shape->data[2] == 1) { - argLenSub1 = input_shape->data[1] - 1; + argLenSub1 = (uint32_t)(input_shape->data[1] - 1); } else { if (axis == 2) - argLenSub1 = input_shape->data[2] - 1; + argLenSub1 = (uint32_t)(input_shape->data[2] - 1); else if (axis == 1) - argLenSub1 = input_shape->data[1] - 1; + argLenSub1 = (uint32_t)(input_shape->data[1] - 1); } if (axis == 0) @@ -398,9 +398,9 @@ static vsi_nn_kernel_node_t _setup axis = vsi_nn_kernel_param_get_int32(params, "axis"); - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num ) - || !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + || !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) || axis > 2) { diff --git a/src/tim/vx/internal/src/kernel/evis/argmin_evis.c b/src/tim/vx/internal/src/kernel/evis/argmin_evis.c index ae94cfd..bce04ac 100644 --- a/src/tim/vx/internal/src/kernel/evis/argmin_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/argmin_evis.c @@ -162,8 +162,8 @@ DEF_KERNEL_INITIALIZER(_argmin_initializer) int32_t axis = 0; uint32_t argLenSub1 = 0; vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL, NULL }; - vsi_int_array_t * input_shape = NULL; - vsi_int_array_t * output_shape = NULL; + vsi_size_array_t * input_shape = NULL; + vsi_size_array_t * output_shape = NULL; uint32_t packedArgIdx[4] = {0}; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); @@ -179,14 +179,14 @@ DEF_KERNEL_INITIALIZER(_argmin_initializer) if (axis == 2 && input_shape->data[2] == 1) { - argLenSub1 = input_shape->data[1] - 1; + argLenSub1 = (uint32_t)(input_shape->data[1] - 1); } else { if (axis == 2) - argLenSub1 = input_shape->data[2] - 1; + argLenSub1 = (uint32_t)(input_shape->data[2] - 1); else if (axis == 1) - argLenSub1 = input_shape->data[1] - 1; + argLenSub1 = (uint32_t)(input_shape->data[1] - 1); } if (axis == 0) @@ -399,9 +399,9 @@ static vsi_nn_kernel_node_t _setup axis = vsi_nn_kernel_param_get_int32(params, "axis"); - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num ) - || !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + || !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) || axis > 2) { diff --git a/src/tim/vx/internal/src/kernel/evis/batchnorm_single_evis.c b/src/tim/vx/internal/src/kernel/evis/batchnorm_single_evis.c index 13b2a6a..a794ee5 100644 --- a/src/tim/vx/internal/src/kernel/evis/batchnorm_single_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/batchnorm_single_evis.c @@ -201,7 +201,7 @@ DEF_KERNEL_INITIALIZER(_batch_norm_initializer) vx_tensor output = (vx_tensor)param[BATCHNORM_INPUT_CNT]; vsi_nn_kernel_tensor_attr_t *input_attr = NULL; vsi_nn_kernel_tensor_attr_t *output_attr = NULL; - vsi_int_array_t *output_shape = NULL; + vsi_size_array_t *output_shape = NULL; float input_scale = 1.0f; float input_tail = 0; float 
output_scale = 1.0f; diff --git a/src/tim/vx/internal/src/kernel/evis/cast_evis.c b/src/tim/vx/internal/src/kernel/evis/cast_evis.c index 4f201e9..f36e100 100644 --- a/src/tim/vx/internal/src/kernel/evis/cast_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/cast_evis.c @@ -148,7 +148,7 @@ DEF_KERNEL_INITIALIZER(_cast_initializer) }; vsi_nn_kernel_tensor_attr_t * output_attr = NULL; vsi_nn_kernel_tensor_attr_t * input_attr = NULL; - vsi_int_array_t * out_shape = NULL; + vsi_size_array_t * out_shape = NULL; input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( input_attr, "Create tensor attr buffer fail.", final ); @@ -289,7 +289,7 @@ static vsi_nn_kernel_node_t _setup vsi_bool image_2d = FALSE; vsi_nn_kernel_node_t node = NULL; - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num ) ) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/evis/clip_evis.c b/src/tim/vx/internal/src/kernel/evis/clip_evis.c index f0c673a..219190a 100644 --- a/src/tim/vx/internal/src/kernel/evis/clip_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/clip_evis.c @@ -131,7 +131,7 @@ DEF_KERNEL_INITIALIZER(_clip_initializer) }; vsi_nn_kernel_tensor_attr_t * output_attr = NULL; vsi_nn_kernel_tensor_attr_t * input_attr = NULL; - vsi_int_array_t * out_shape = NULL; + vsi_size_array_t * out_shape = NULL; vsi_nn_kernel_dtype_e input_dtype = F16; vsi_nn_kernel_dtype_e output_dtype = F16; float minVal = 1.0f; @@ -579,7 +579,7 @@ static vsi_nn_kernel_node_t _setup float min_value = vsi_nn_kernel_param_get_float32( params, "min_value" ); float max_value = vsi_nn_kernel_param_get_float32( params, "max_value" ); - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num ) ) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/evis/comparisons_evis.c b/src/tim/vx/internal/src/kernel/evis/comparisons_evis.c index 3c1ac2f..ee5a622 100644 --- a/src/tim/vx/internal/src/kernel/evis/comparisons_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/comparisons_evis.c @@ -301,7 +301,7 @@ DEF_KERNEL_INITIALIZER(_comparisons_initializer) {0, 0, 0} }; vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL, NULL, NULL }; - vsi_int_array_t * out_shape = NULL; + vsi_size_array_t * out_shape = NULL; float input0Scale = 1.0f; float input0Tail = 0; float input1Scale = 1.0f; @@ -498,7 +498,7 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t node = NULL; int32_t operation = 0; - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/evis/conv1d_ovxlib_evis.c b/src/tim/vx/internal/src/kernel/evis/conv1d_ovxlib_evis.c index 923328e..a52e76a 100644 --- a/src/tim/vx/internal/src/kernel/evis/conv1d_ovxlib_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/conv1d_ovxlib_evis.c @@ -119,9 +119,9 @@ DEF_KERNEL_INITIALIZER(_conv1d_ovxlib_initializer) vsi_nn_kernel_tensor_attr_t * input_attr = NULL; vsi_nn_kernel_tensor_attr_t * weights_attr = NULL; vsi_nn_kernel_tensor_attr_t * output_attr = NULL; - vsi_int_array_t * in_shape = NULL; - vsi_int_array_t * out_shape = NULL; - vsi_int_array_t * weight_shape = NULL; + vsi_size_array_t * in_shape = NULL; + vsi_size_array_t * out_shape = NULL; + vsi_size_array_t * weight_shape = NULL; float scaleIn = 1.0f; 
float scaleOut = 1.0f; float scaleWights = 1.0f; @@ -169,9 +169,9 @@ DEF_KERNEL_INITIALIZER(_conv1d_ovxlib_initializer) } scaleOut = (scaleIn * scaleWights) / scaleOut; - input_height = in_shape->data[1]; - input_width = in_shape->data[0]; - output_width = out_shape->data[0]; + input_height = (int32_t)(in_shape->data[1]); + input_width = (int32_t)(in_shape->data[0]); + output_width = (int32_t)(out_shape->data[0]); if ((U8 == input_attr->dtype) && (U8 == weights_attr->dtype) && (U8 == output_attr->dtype)) { @@ -389,7 +389,7 @@ DEF_KERNEL_INITIALIZER(_conv1d_ovxlib_initializer) 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 // Constant }, GPU_DP_TYPE_16}; - int32_t kernel_cnt_x16 = (weight_shape->data[0] + 15) / 16; + int32_t kernel_cnt_x16 = (int32_t)((weight_shape->data[0] + 15) / 16); status = vsi_nn_kernel_gpu_add_param( node, "kernel_cnt_x16", &kernel_cnt_x16 ); status |= vsi_nn_kernel_gpu_add_param( node, diff --git a/src/tim/vx/internal/src/kernel/evis/depth2space_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/depth2space_internal_evis.c index 7d3dc68..732f949 100644 --- a/src/tim/vx/internal/src/kernel/evis/depth2space_internal_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/depth2space_internal_evis.c @@ -206,9 +206,9 @@ DEF_KERNEL_INITIALIZER(_depth2space_crd_initializer) } output_dims = (uint32_t)attr[1]->shape->size; - output_width = attr[1]->shape->data[0]; - output_height = attr[1]->shape->data[1]; - output_chn = output_dims > 2 ? attr[1]->shape->data[2] : 1; + output_width = (int32_t)(attr[1]->shape->data[0]); + output_height = (int32_t)(attr[1]->shape->data[1]); + output_chn = (int32_t)(output_dims > 2 ? attr[1]->shape->data[2] : 1); shaderParam.global_scale[0] = 1; shaderParam.global_scale[1] = 1; @@ -411,7 +411,7 @@ static vsi_nn_kernel_node_t _setup int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); int32_t blk_flg = block_size == 2 ? 
1 : 0; - if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c b/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c index 32b57b4..8888e15 100644 --- a/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/depthwise_conv1d_evis.c @@ -121,7 +121,7 @@ static vx_param_description_t _depthwise_conv1d_kernel_param_def[] = #define _DEPTHWISE_CONV1D_PARAM_NUM _cnt_of_array( _depthwise_conv1d_kernel_param_def ) -static _internal_kernel_size_e get_kernel_size(uint32_t k_size, uint32_t dilation, +static _internal_kernel_size_e get_kernel_size(vsi_size_t k_size, uint32_t dilation, uint32_t stride, uint32_t evis_version) { #define _PACK_SELECT_KEY( kernel_size, dilation, stride, evis_version ) \ @@ -185,7 +185,7 @@ DEF_KERNEL_INITIALIZER(_depthwise_conv1d_initializer) vsi_nn_kernel_tensor_attr_t *input_attr = NULL; vsi_nn_kernel_tensor_attr_t *output_attr = NULL; vsi_nn_kernel_tensor_attr_t *weight_attr = NULL; - vsi_int_array_t *output_shape = NULL; + vsi_size_array_t *output_shape = NULL; int32_t weightZP = 0; float outputScale = 1.0f; float outputZP = 0; @@ -213,7 +213,7 @@ DEF_KERNEL_INITIALIZER(_depthwise_conv1d_initializer) CHECK_STATUS_FAIL_GOTO(status, final ); status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &dilation); CHECK_STATUS_FAIL_GOTO(status, final ); - kernel_size = weight_attr->shape->data[0]; + kernel_size = (uint32_t)(weight_attr->shape->data[0]); if(hw_param.evis1 == TRUE && hw_param.evis2 == FALSE) { @@ -704,20 +704,20 @@ static vsi_nn_kernel_node_t _setup vsi_status status = VSI_FAILURE; vsi_nn_kernel_node_param_t node_params[_DEPTHWISE_CONV1D_PARAM_NUM] = {NULL}; vsi_nn_kernel_node_t node = NULL; - int32_t weight_pad_front[VSI_NN_MAX_DIM_NUM] = {0}; - int32_t weight_pad_end[VSI_NN_MAX_DIM_NUM] = {0}; + vsi_size_t weight_pad_front[VSI_NN_MAX_DIM_NUM] = {0}; + vsi_size_t weight_pad_end[VSI_NN_MAX_DIM_NUM] = {0}; vsi_nn_tensor_t * weights = NULL; vsi_nn_tensor_t * biases = NULL; vsi_nn_tensor_t *temp_tensor[3] = {NULL}; vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; - int32_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; int32_t new_rank = 2; uint32_t i = 0; int32_t stride = vsi_nn_kernel_param_get_int32( params, "stride" ); int32_t pad_front = vsi_nn_kernel_param_get_int32( params, "pad_front" ); int32_t pad_end = vsi_nn_kernel_param_get_int32( params, "pad_end" ); int32_t dilation = vsi_nn_kernel_param_get_int32( params, "dilation" ); - int32_t batch = inputs[0]->attr.size[2]; + vsi_size_t batch = inputs[0]->attr.size[2]; _internal_kernel_size_e ks = KN; if ( (!((VSI_NN_TYPE_UINT8 == inputs[0]->attr.dtype.vx_type) @@ -739,7 +739,7 @@ static vsi_nn_kernel_node_t _setup shape[1] *= inputs[1]->attr.size[i]; } reshape_tensors[1] = vsi_nn_reshape_tensor( graph, - inputs[1], (uint32_t*)shape, new_rank ); + inputs[1], shape, new_rank ); } else { @@ -752,7 +752,7 @@ static vsi_nn_kernel_node_t _setup shape[1] = 1; new_rank = 2; reshape_tensors[2] = vsi_nn_reshape_tensor( graph, - inputs[2], (uint32_t*)shape, new_rank ); + inputs[2], shape, new_rank ); } weight_pad_end[0] = gpu_align_np2_safe(reshape_tensors[1]->attr.size[0], 8) - reshape_tensors[1]->attr.size[0]; diff --git a/src/tim/vx/internal/src/kernel/evis/detect_post_box_evis.c 
b/src/tim/vx/internal/src/kernel/evis/detect_post_box_evis.c index 10143c6..ee5faf1 100644 --- a/src/tim/vx/internal/src/kernel/evis/detect_post_box_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/detect_post_box_evis.c @@ -115,7 +115,7 @@ DEF_KERNEL_INITIALIZER(_detect_post_box_initializer) }; vsi_nn_kernel_tensor_attr_t * input_attr = NULL; vsi_nn_kernel_tensor_attr_t * input1_attr = NULL; - vsi_int_array_t * in_shape = NULL; + vsi_size_array_t * in_shape = NULL; float logE = (float)(log10(exp(1.0f)) / log10(2.0f)); float scaleIn0 = 1.0f; float scaleIn1 = 1.0f; diff --git a/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c b/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c index 1294344..e78d9a9 100644 --- a/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/eltwise_unary_evis.c @@ -362,7 +362,7 @@ DEF_KERNEL_INITIALIZER(_eltwise_unary_initializer) }; int32_t type = 0; vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL, NULL }; - vsi_int_array_t * out_shape = NULL; + vsi_size_array_t * out_shape = NULL; float inputScale = 1.0f; float inputTail = 0; float outputScale = 1.0f; @@ -634,23 +634,23 @@ static vsi_nn_kernel_node_t _setup vsi_bool image_2d = FALSE; vsi_nn_kernel_node_t node = NULL; vsi_nn_tensor_t* rs_tensors[2] = { NULL }; - int32_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; - int32_t new_rank = 0; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t new_rank = 0; vsi_bool ret = FALSE; float alpha = vsi_nn_kernel_param_get_float32( params, "alpha" ); ret = vsi_nn_kernel_optimize_element_shape( - (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, + inputs[0]->attr.size, inputs[0]->attr.dim_num, shape, &new_rank ); if( ret ) { rs_tensors[0] = vsi_nn_reshape_tensor( graph, - inputs[0], (uint32_t*)shape, new_rank ); + inputs[0], shape, new_rank ); rs_tensors[1] = vsi_nn_reshape_tensor( graph, - outputs[0], (uint32_t*)shape, new_rank ); + outputs[0], shape, new_rank ); } - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)rs_tensors[0]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( rs_tensors[0]->attr.size, rs_tensors[0]->attr.dim_num ) ) { goto OnError; diff --git a/src/tim/vx/internal/src/kernel/evis/erf_evis.c b/src/tim/vx/internal/src/kernel/evis/erf_evis.c index 7753349..a420316 100644 --- a/src/tim/vx/internal/src/kernel/evis/erf_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/erf_evis.c @@ -129,7 +129,7 @@ DEF_KERNEL_INITIALIZER(_erf_initializer) {0, 0, 0} }; vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL, NULL }; - vsi_int_array_t * out_shape = NULL; + vsi_size_array_t * out_shape = NULL; float inputScale = 1.0f; float inputTail = 0; float outputScale = 1.0f; @@ -371,23 +371,23 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t node_params[_ERF_PARAM_NUM] = {NULL}; vsi_nn_kernel_node_t node = NULL; vsi_nn_tensor_t* rs_tensors[2] = { NULL }; - int32_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; - int32_t new_rank = 0; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t new_rank = 0; vsi_bool image_2d = FALSE; vsi_bool ret = FALSE; ret = vsi_nn_kernel_optimize_element_shape( - (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, + inputs[0]->attr.size, inputs[0]->attr.dim_num, shape, &new_rank ); if ( ret ) { rs_tensors[0] = vsi_nn_reshape_tensor( graph, - inputs[0], (uint32_t*)shape, new_rank ); + inputs[0], shape, new_rank ); rs_tensors[1] = vsi_nn_reshape_tensor( graph, - outputs[0], (uint32_t*)shape, new_rank ); + outputs[0], shape, new_rank ); } - if ( !vsi_nn_kernel_gpu_check_shape( 
(int32_t*)rs_tensors[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( rs_tensors[0]->attr.size, rs_tensors[0]->attr.dim_num ) ) { goto OnError; diff --git a/src/tim/vx/internal/src/kernel/evis/extra_ending_evis.c b/src/tim/vx/internal/src/kernel/evis/extra_ending_evis.c index e241d94..627e48b 100644 --- a/src/tim/vx/internal/src/kernel/evis/extra_ending_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/extra_ending_evis.c @@ -95,7 +95,7 @@ DEF_KERNEL_INITIALIZER(_extra_ending_initializer) {0, 0, 0} }; vsi_nn_kernel_tensor_attr_t * attr = NULL; - vsi_int_array_t * out_shape = NULL; + vsi_size_array_t * out_shape = NULL; attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); out_shape = attr->shape; @@ -182,26 +182,26 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t node_params[_EXTRA_ENDING_PARAM_NUM] = {NULL}; vsi_nn_kernel_node_t node = NULL; uint32_t rank[3] = {0}; - int32_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; + vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; int32_t i = 0; - vsi_nn_kernel_optimize_1d_tensor_shape( (const int32_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num, + vsi_nn_kernel_optimize_1d_tensor_shape( (const vsi_size_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num, shapes[0], &rank[0]); - vsi_nn_kernel_optimize_1d_tensor_shape( (const int32_t*)inputs[1]->attr.size, inputs[1]->attr.dim_num, + vsi_nn_kernel_optimize_1d_tensor_shape( (const vsi_size_t*)inputs[1]->attr.size, inputs[1]->attr.dim_num, shapes[1], &rank[1]); - vsi_nn_kernel_optimize_1d_tensor_shape( (const int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num, + vsi_nn_kernel_optimize_1d_tensor_shape( (const vsi_size_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num, shapes[2], &rank[2]); for (i = 0; i < 2; i++) { reshape_tensors[i] = vsi_nn_reshape_tensor( graph, - inputs[i], (uint32_t*)shapes[i], rank[i] ); + inputs[i], shapes[i], rank[i] ); } reshape_tensors[2] = vsi_nn_reshape_tensor( graph, - outputs[0], (uint32_t*)shapes[2], rank[2] ); + outputs[0], shapes[2], rank[2] ); - if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)reshape_tensors[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[0]->attr.size, inputs[0]->attr.dim_num ) ) { goto final; diff --git a/src/tim/vx/internal/src/kernel/evis/floordiv_evis.c b/src/tim/vx/internal/src/kernel/evis/floordiv_evis.c index 733b71c..0c7f277 100644 --- a/src/tim/vx/internal/src/kernel/evis/floordiv_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/floordiv_evis.c @@ -127,7 +127,7 @@ DEF_KERNEL_INITIALIZER(_floordiv_initializer) vsi_nn_kernel_tensor_attr_t *input0_attr = NULL; vsi_nn_kernel_tensor_attr_t *input1_attr = NULL; vsi_nn_kernel_tensor_attr_t *output_attr = NULL; - vsi_int_array_t *output_shape = NULL; + vsi_size_array_t *output_shape = NULL; vsi_nn_kernel_dtype_e input0_dtype = F16; int32_t input0_fl = 0; int32_t input1_fl = 0; @@ -411,7 +411,7 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t node = NULL; vsi_bool image_2d = FALSE; - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/evis/gather_evis.c b/src/tim/vx/internal/src/kernel/evis/gather_evis.c index ae35694..d49d92d 100644 --- a/src/tim/vx/internal/src/kernel/evis/gather_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/gather_evis.c @@ -156,17 +156,17 @@ static vx_param_description_t 
_gather_kernel_param_def[] = static vsi_status get_gather_tensor_reshape_size ( vsi_nn_tensor_t ** inputs, - int32_t sizes[VSI_NN_MAX_DIM_NUM], - uint32_t block_size, + vsi_size_t sizes[VSI_NN_MAX_DIM_NUM], + vsi_size_t block_size, uint32_t idxFlg, int32_t* arrayFlg ) { vsi_status status = VSI_FAILURE; uint32_t dims_num = inputs[0]->attr.dim_num; - uint32_t *input_size = inputs[0]->attr.size; + vsi_size_t *input_size = inputs[0]->attr.size; uint32_t i = 0; - uint32_t elementCnt = 1; + vsi_size_t elementCnt = 1; #define VSI_NN_MAX_IMAGE_WIDTH (65536) for(i = 0; i < dims_num; ++i) @@ -224,7 +224,7 @@ DEF_KERNEL_INITIALIZER(_gather_initializer) uint32_t input_dims1 = 0; vx_uint32 i = 0; vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL}; - vsi_int_array_t * input1_shape = NULL; + vsi_size_array_t * input1_shape = NULL; int32_t src0ZP = 0; float src0Scale = 0; int32_t dstZP = 0; @@ -285,7 +285,7 @@ DEF_KERNEL_INITIALIZER(_gather_initializer) input_dims1 = (uint32_t)input1_shape->size; for (i = 0; i < input_dims1; i++) { - indices_num *= input1_shape->data[i]; + indices_num *= (int32_t)(input1_shape->data[i]); } shaderParam.global_scale[0] = 16; @@ -417,7 +417,7 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer) uint32_t input_dims1 = 0; vx_uint32 i = 0; vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL}; - vsi_int_array_t * input1_shape = NULL; + vsi_size_array_t * input1_shape = NULL; int32_t src0ZP = 0; float src0Scale = 0; int32_t dstZP = 0; @@ -476,7 +476,7 @@ DEF_KERNEL_INITIALIZER(_gather_axis0_initializer) input_dims1 = (uint32_t)input1_shape->size; for (i = 0; i < input_dims1; i++) { - indices_num *= input1_shape->data[i]; + indices_num *= (int32_t)(input1_shape->data[i]); } shaderParam.global_scale[0] = 4; @@ -686,7 +686,7 @@ static vsi_nn_kernel_node_t _setup vsi_status status = VSI_FAILURE; vsi_nn_kernel_node_param_t tmp_params[_GATHER_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_t node = NULL; - int32_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}}; + vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}}; int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); int32_t block_num = vsi_nn_kernel_param_get_int32( params, "block_num" ); int32_t axis_num = vsi_nn_kernel_param_get_int32( params, "axis_num" ); @@ -714,7 +714,7 @@ static vsi_nn_kernel_node_t _setup return NULL; } - if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c b/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c index 8595e5a..c206930 100644 --- a/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c @@ -125,17 +125,17 @@ static vx_param_description_t _gather_nd_kernel_param_def[] = static vsi_status get_gather_nd_tensor_reshape_size ( vsi_nn_tensor_t ** inputs, - int32_t sizes[VSI_NN_MAX_DIM_NUM], - uint32_t block_size, + vsi_size_t sizes[VSI_NN_MAX_DIM_NUM], + vsi_size_t block_size, uint32_t coordDim, int32_t* newDim ) { vsi_status status = VSI_FAILURE; uint32_t dims_num = inputs[0]->attr.dim_num; - uint32_t *input_size = inputs[0]->attr.size; + vsi_size_t *input_size = inputs[0]->attr.size; uint32_t i = 0; - uint32_t elementCnt = 1; + vsi_size_t elementCnt = 1; #define VSI_NN_MAX_IMAGE_WIDTH (65536) newDim[0] = 0; @@ -263,7 +263,7 @@ DEF_KERNEL_INITIALIZER(_gather_nd_initializer) dstScale = 1; } - indices_num = attr[1]->shape->data[1]; + indices_num = 
(int32_t)(attr[1]->shape->data[1]); gpu_param.global_scale[0] = 1; gpu_param.global_scale[1] = 1; @@ -450,7 +450,7 @@ static vsi_nn_kernel_node_t _setup vsi_status status = VSI_FAILURE; vsi_nn_kernel_node_param_t tmp_params[_GATHER_ND_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_t node = NULL; - int32_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}}; + vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}}; int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); int32_t coord_dim = vsi_nn_kernel_param_get_int32( params, "coord_dim" ); int32_t rs_in_dim = 0, rs_idx_dim = 0, rs_out_dim = 0; @@ -463,7 +463,7 @@ static vsi_nn_kernel_node_t _setup return NULL; } - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c b/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c index 89f0c4c..f70df19 100644 --- a/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/group_normalization_evis.c @@ -216,7 +216,7 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sum_sqr_initializer) {0, 0, 0}}; // globalWorkSize: image size in thread vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL}; - vsi_int_array_t * input_shape = NULL; + vsi_size_array_t * input_shape = NULL; float scaleIn = 1; int32_t input_zp = 0; vx_uint32 iter = 0; @@ -259,9 +259,9 @@ DEF_KERNEL_INITIALIZER(_groupnorm_sum_sqr_initializer) inFlScale_s2 = in_scale_fl * in_scale_fl; } - width = input_shape->data[0]; - height = input_shape->data[1]; - chn = attr[1]->shape->data[1]; + width = (int32_t)(input_shape->data[0]); + height = (int32_t)(input_shape->data[1]); + chn = (int32_t)(attr[1]->shape->data[1]); if (is2D) { height = 1; @@ -426,8 +426,8 @@ DEF_KERNEL_INITIALIZER(_groupnorm_mean_vari_initializer) attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); - chn = attr[0]->shape->data[1]; - group_stride = attr[0]->shape->data[0]; + chn = (int32_t)(attr[0]->shape->data[1]); + group_stride = (int32_t)(attr[0]->shape->data[0]); shaderParam.global_scale[0] = 4; shaderParam.global_scale[1] = 1; @@ -484,7 +484,7 @@ DEF_KERNEL_INITIALIZER(_groupnorm_initializer) {0, 0, 0}}; // globalWorkSize: image size in thread vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL}; - vsi_int_array_t * input_shape = NULL; + vsi_size_array_t * input_shape = NULL; float scaleIn = 1.0f; float scaleOut = 1.0f; float reScaleOut_u8 = 1.0f; @@ -550,9 +550,9 @@ DEF_KERNEL_INITIALIZER(_groupnorm_initializer) inOut_fl_scale = in_scale_fl * out_scale_fl; } - width = input_shape->data[0]; - height = input_shape->data[1]; - chn = attr[1]->shape->data[1]; + width = (int32_t)(input_shape->data[0]); + height = (int32_t)(input_shape->data[1]); + chn = (int32_t)(attr[1]->shape->data[1]); if (is2D) { height = 1; @@ -945,15 +945,15 @@ static vsi_status _query_kernel static int32_t _optimize_gn_shape ( vsi_nn_tensor_t ** inputs, - int32_t group_size, + vsi_size_t group_size, int32_t group_num, - int32_t* opt_shape, + vsi_size_t* opt_shape, int32_t* is2D_flg ) { vsi_status status = VSI_SUCCESS; - int32_t group_shape[VSI_NN_MAX_DIM_NUM] = {0}; - int32_t new_rank = 0; + vsi_size_t group_shape[VSI_NN_MAX_DIM_NUM] = {0}; + vsi_size_t new_rank = 0; group_shape[0] = inputs[0]->attr.size[0]; group_shape[1] = inputs[0]->attr.size[1]; group_shape[2] = 
group_size; @@ -1006,7 +1006,7 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_t * ikernels[INTERNAL_KERNEL_SIZE] = { NULL }; vsi_nn_tensor_t * tensors[INTERNAL_KERNEL_SIZE] = { NULL }; vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL; - int32_t new_shape[VSI_NN_MAX_DIM_NUM] = { 1, 1, 1, 1 }; + vsi_size_t new_shape[VSI_NN_MAX_DIM_NUM] = { 1, 1, 1, 1 }; int32_t is2D_flg = 0; uint32_t hashkeys[INTERNAL_KERNEL_SIZE] = { 0 }; uint32_t hashkey = 0; @@ -1014,12 +1014,12 @@ static vsi_nn_kernel_node_t _setup float rSpaceOrg = 1.0f / (inputs[0]->attr.size[0] * inputs[0]->attr.size[1]); float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); int32_t group_num = vsi_nn_kernel_param_get_int32( params, "group_num" ); - int32_t group_size = inputs[0]->attr.size[2] / group_num; + vsi_size_t group_size = inputs[0]->attr.size[2] / group_num; float group_ratio = 1.0f / (inputs[0]->attr.size[0] * inputs[0]->attr.size[1] * group_size); // Check if gpu can support the size if ( !vsi_nn_kernel_gpu_check_shape( - (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) + outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; } @@ -1149,7 +1149,7 @@ static vsi_nn_kernel_node_t _setup int32_t pStride = 0; if (!is2D_flg) { - pStride = inputs[1]->attr.size[0] / new_shape[1]; + pStride = (int32_t)(inputs[1]->attr.size[0] / new_shape[1]); rSpaceOrg = 1.0f / (new_shape[0] / pStride); } node_params[index++] = rs_input; diff --git a/src/tim/vx/internal/src/kernel/evis/grucell_activation_evis.c b/src/tim/vx/internal/src/kernel/evis/grucell_activation_evis.c index d83d149..9b5a2c1 100644 --- a/src/tim/vx/internal/src/kernel/evis/grucell_activation_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/grucell_activation_evis.c @@ -224,7 +224,7 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_initializer) float tensorZP[4] = {0.0f, 0.0f, 0.0f, 0.0f}; uint32_t i = 0; uint32_t pack_key = 0; - vsi_int_array_t * output_shape = NULL; + vsi_size_array_t * output_shape = NULL; vsi_nn_kernel_tensor_attr_t * attr[4] = { NULL, NULL, NULL, NULL }; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); @@ -375,7 +375,7 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_cdnn_initializer) float input_c_tail = 0; float recur_c_scale = 1.0f; float recur_c_tail = 0; - vsi_int_array_t * output_shape = NULL; + vsi_size_array_t * output_shape = NULL; vsi_nn_kernel_tensor_attr_t * attr[8] = { NULL }; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); @@ -473,13 +473,13 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_cdnn_initializer) if (layer_out == 1 || layer_out == 2) { - input_size = attr[1]->shape->data[0]; - batch = attr[1]->shape->data[1]; + input_size = (int32_t)(attr[1]->shape->data[0]); + batch = (int32_t)(attr[1]->shape->data[1]); } else { - input_size = output_shape->data[0]; - batch = output_shape->data[1]; + input_size = (int32_t)(output_shape->data[0]); + batch = (int32_t)(output_shape->data[1]); } gpu_param.global_scale[0] = 4; @@ -754,8 +754,8 @@ static vsi_nn_kernel_node_t _setup int32_t i = 0; int32_t j = 0; int32_t k = 0; - int32_t input_size = inputs[0]->attr.size[0]; - int32_t batch = inputs[0]->attr.size[1]; + vsi_size_t input_size = inputs[0]->attr.size[0]; + vsi_size_t batch = inputs[0]->attr.size[1]; int32_t param_count = 0; int32_t input_count = 0; int32_t output_count = 0; diff --git a/src/tim/vx/internal/src/kernel/evis/grucell_activation_sma_evis.c b/src/tim/vx/internal/src/kernel/evis/grucell_activation_sma_evis.c index 65b1767..75b6136 
100644 --- a/src/tim/vx/internal/src/kernel/evis/grucell_activation_sma_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/grucell_activation_sma_evis.c @@ -126,7 +126,7 @@ DEF_KERNEL_INITIALIZER(_grucell_activation_sma_initializer) uint32_t i = 0; vsi_nn_kernel_tensor_attr_t * attr[4] = { NULL, NULL, NULL, NULL }; - vsi_int_array_t *output_shape = NULL; + vsi_size_array_t *output_shape = NULL; uint32_t pack_key = 0; @@ -293,10 +293,10 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t node_params[_A_GRUCELL_ACTIVATION_SMA_PARAM_NUM] = {NULL}; vsi_bool image_2d = FALSE; vsi_nn_kernel_node_t node = NULL; - int32_t* shapes_in[_INPUT_NUM]; - size_t rank_in[_INPUT_NUM]; - int32_t* shapes_ptr[_IO_NUM]; - int32_t shapes[_IO_NUM][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; + vsi_size_t* shapes_in[_INPUT_NUM]; + vsi_size_t rank_in[_INPUT_NUM]; + vsi_size_t* shapes_ptr[_IO_NUM]; + vsi_size_t shapes[_IO_NUM][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; uint32_t new_rank = 0; int32_t i = 0; vsi_bool ret = FALSE; @@ -309,13 +309,13 @@ static vsi_nn_kernel_node_t _setup for (i = 0; i < _INPUT_NUM; i++) { - shapes_in[i] = (int32_t *)inputs[i]->attr.size; - rank_in[i] = (size_t)inputs[i]->attr.dim_num; + shapes_in[i] = inputs[i]->attr.size; + rank_in[i] = (vsi_size_t)inputs[i]->attr.dim_num; } ret = vsi_nn_kernel_optimize_broadcast_shape( - (const int32_t**)shapes_in, (const size_t*)rank_in, _INPUT_NUM, - (int32_t *)outputs[0]->attr.size, outputs[0]->attr.dim_num, + (const vsi_size_t **)shapes_in, rank_in, _INPUT_NUM, + outputs[0]->attr.size, outputs[0]->attr.dim_num, shapes_ptr, shapes[_INPUT_NUM], &new_rank); if( ret ) @@ -323,13 +323,13 @@ static vsi_nn_kernel_node_t _setup for (i = 0; i < _INPUT_NUM; i++) { reshape_tensors[i] = vsi_nn_reshape_tensor( graph, - inputs[i], (uint32_t*)shapes[i], new_rank ); + inputs[i], shapes[i], new_rank ); } for (i = 0; i < _OUTPUT_NUM; i++) { reshape_tensors[i + _INPUT_NUM] = vsi_nn_reshape_tensor( graph, - outputs[i], (uint32_t*)shapes[_INPUT_NUM], new_rank ); + outputs[i], shapes[_INPUT_NUM], new_rank ); } } else @@ -337,7 +337,7 @@ static vsi_nn_kernel_node_t _setup return NULL; } - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)reshape_tensors[_INPUT_NUM]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[_INPUT_NUM]->attr.size, reshape_tensors[_INPUT_NUM]->attr.dim_num ) ) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c b/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c index ecb7014..a01c9f4 100644 --- a/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/instance_normalization_evis.c @@ -219,7 +219,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer) {0, 0, 0}}; // globalWorkSize: image size in thread vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL}; - vsi_int_array_t * input_shape = NULL; + vsi_size_array_t * input_shape = NULL; float scaleIn = 1; int32_t input_zp = 0; vx_uint32 iter = 0; @@ -262,9 +262,9 @@ DEF_KERNEL_INITIALIZER(_instancenorm_mean_vari_initializer) inFlScale_s2 = in_scale_fl * in_scale_fl; } - width = input_shape->data[0]; - height = input_shape->data[1]; - chn = attr[1]->shape->data[1]; + width = (int32_t)(input_shape->data[0]); + height = (int32_t)(input_shape->data[1]); + chn = (int32_t)(attr[1]->shape->data[1]); if (rsFlg) { height = height / chn; @@ -449,7 +449,7 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) {0, 0, 0}}; // globalWorkSize: image size in thread vsi_nn_kernel_tensor_attr_t* attr[4] = 
{NULL, NULL}; - vsi_int_array_t * input_shape = NULL; + vsi_size_array_t * input_shape = NULL; float scaleIn = 1.0f; float scaleOut = 1.0f; float scale_inOut = 1.0f; @@ -513,9 +513,9 @@ DEF_KERNEL_INITIALIZER(_instancenorm_initializer) scale_inOut = scaleIn * scaleOut; - width = input_shape->data[0]; - height = input_shape->data[1]; - chn = attr[2]->shape->data[1]; + width = (int32_t)(input_shape->data[0]); + height = (int32_t)(input_shape->data[1]); + chn = (int32_t)(attr[2]->shape->data[1]); if (rsFlg) { height = height / chn; @@ -1000,7 +1000,7 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_t * ikernels[INTERNAL_KERNEL_SIZE] = { NULL }; vsi_nn_tensor_t * tensors[INTERNAL_KERNEL_SIZE] = { NULL }; vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL, rs_gamma = NULL, rs_beta = NULL; - int32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {0}; uint32_t hashkeys[INTERNAL_KERNEL_SIZE] = { 0 }; uint32_t hashkey = 0; int32_t i = 0; @@ -1009,7 +1009,7 @@ static vsi_nn_kernel_node_t _setup // Check if gpu can support the size if ( !vsi_nn_kernel_gpu_check_shape( - (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) + outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; } diff --git a/src/tim/vx/internal/src/kernel/evis/l2normalizescale_evis.c b/src/tim/vx/internal/src/kernel/evis/l2normalizescale_evis.c index 095edb1..2943617 100644 --- a/src/tim/vx/internal/src/kernel/evis/l2normalizescale_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/l2normalizescale_evis.c @@ -22,7 +22,6 @@ * *****************************************************************************/ - #include #include #include @@ -75,7 +74,6 @@ static const _kernel_map_type _l2normalizescale_kernel_map[] = HASH_L2NORMALIZESCALE_KERNELS_2D( 1, I16, F16, F16 ) }; - /* * Kernel params */ @@ -111,7 +109,7 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer) int32_t axis = 0; vsi_nn_kernel_tensor_attr_t *input_attr = NULL; vsi_nn_kernel_tensor_attr_t *output_attr = NULL; - vsi_int_array_t * output_shape = NULL; + vsi_size_array_t * output_shape = NULL; vsi_nn_kernel_dtype_e input_dtype = F16; vsi_nn_kernel_dtype_e output_dtype = F16; int32_t input_fl = 0; @@ -170,7 +168,6 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer) outputScale = 1.0f / output_attr->asymm.scale; } - r_inputScale = 1.0f / inputScale; if (1 == axis) @@ -303,7 +300,7 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer) if (1 == axis) { - int32_t L2NorS_depth = output_shape->data[1]; + int32_t L2NorS_depth = (int32_t)(output_shape->data[1]); status = vsi_nn_kernel_gpu_add_param( node, "L2NorS_depth", &L2NorS_depth); if(F16 == input_dtype) { @@ -325,7 +322,6 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer) else if(U8 == input_dtype) { status |= vsi_nn_kernel_gpu_add_param( node, "r_inputScale", &r_inputScale); - status |= vsi_nn_kernel_gpu_add_param( node, "inputZP", &inputZP); status |= vsi_nn_kernel_gpu_add_param( node, "uniUInt8SquareLo_4x4", &uniUInt8SquareLo_4x4); status |= vsi_nn_kernel_gpu_add_param( node, "uniUInt8SquareHi_4x4", &uniUInt8SquareHi_4x4); } @@ -334,9 +330,9 @@ DEF_KERNEL_INITIALIZER(_l2normalizescale_initializer) else if (0 == axis) { int32_t inputWidth, inputWidthCount, inputWidthRemain256; - inputWidth = output_shape->data[0]; - inputWidthRemain256 = output_shape->data[0] % 256; - inputWidthCount = output_shape->data[0] / 256; + inputWidth = (int32_t)(output_shape->data[0]); + inputWidthRemain256 = (int32_t)(output_shape->data[0] % 256); + inputWidthCount = 
(int32_t)(output_shape->data[0] / 256); vsi_nn_kernel_gpu_add_param( node, "inputWidth", &inputWidth); vsi_nn_kernel_gpu_add_param( node, "inputWidthRemain256", &inputWidthRemain256); vsi_nn_kernel_gpu_add_param( node, "inputWidthCount", &inputWidthCount); @@ -448,11 +444,8 @@ final: if (output_attr) vsi_nn_kernel_tensor_attr_release( &output_attr ); return status; - } /* _l2normalizescale_initializer() */ - - /* * Query kernel */ @@ -509,7 +502,6 @@ static vsi_status _query_kernel return status; } /* _query_kernel() */ - static vsi_nn_kernel_node_t _setup ( vsi_nn_graph_t * graph, @@ -529,9 +521,9 @@ static vsi_nn_kernel_node_t _setup axis = vsi_nn_kernel_param_get_int32(params, "axis"); - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num ) - || !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + || !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) || axis > 2) { @@ -573,4 +565,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_EVIS( l2normalizescale, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c b/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c index d6c4b8a..1de96db 100644 --- a/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c @@ -22,7 +22,6 @@ * *****************************************************************************/ - #include #include #include @@ -64,7 +63,6 @@ __BEGIN_DECLS #define KERNEL_SOURCE_9 "layer_normalization_scale_f32_2d" #define KERNEL_SOURCE_10 "layer_normalization_scale_f32_bf16" - #define HASH_LAYERNORM_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE) \ CVIVANTE_NAMESPACE("evis.layer_norm_"#SRC0_TYPE"to"#DST_TYPE) @@ -81,13 +79,13 @@ __BEGIN_DECLS #define HASH_LAYERNORM_KEY(_input0_type, _input2_type, _output_type, _reshape_flag) \ ((_input0_type << 24) | (_input2_type << 16) | (_output_type << 8) | _reshape_flag) -#define TENSOR_LAYERNORM_KERNELS(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_LAYERNORM_KEY(IN0_TYPE, F16, OUT_TYPE, LAYERNORM_KERNEL), \ +#define TENSOR_LAYERNORM_KERNELS(IN0_TYPE, SCALE_TYPE, OUT_TYPE, SOURCE) \ + { HASH_LAYERNORM_KEY(IN0_TYPE, SCALE_TYPE, OUT_TYPE, LAYERNORM_KERNEL), \ HASH_LAYERNORM_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE), \ SOURCE }, -#define TENSOR_LAYERNORM_KERNELS_2D(IN0_TYPE, OUT_TYPE, SOURCE) \ - { HASH_LAYERNORM_KEY(IN0_TYPE, F16, OUT_TYPE, LAYERNORM_2D_KERNEL), \ +#define TENSOR_LAYERNORM_KERNELS_2D(IN0_TYPE, SCALE_TYPE, OUT_TYPE, SOURCE) \ + { HASH_LAYERNORM_KEY(IN0_TYPE, SCALE_TYPE, OUT_TYPE, LAYERNORM_2D_KERNEL), \ HASH_LAYERNORM_SH_KERNEL_2D_NAME(IN0_TYPE, OUT_TYPE), \ SOURCE }, @@ -144,17 +142,19 @@ typedef struct static const _kernel_map_type _layernorm_kernel_map[] = { // Register kernel here - TENSOR_LAYERNORM_KERNELS( U8, U8, KERNEL_SOURCE_1 ) - TENSOR_LAYERNORM_KERNELS_2D( U8, U8, KERNEL_SOURCE_2 ) - TENSOR_LAYERNORM_KERNELS( U8, F16, KERNEL_SOURCE_3 ) - TENSOR_LAYERNORM_KERNELS_2D( U8, F16, KERNEL_SOURCE_3 ) + TENSOR_LAYERNORM_KERNELS( U8, F16, U8, KERNEL_SOURCE_1 ) + TENSOR_LAYERNORM_KERNELS_2D( U8, F16, U8, KERNEL_SOURCE_2 ) + TENSOR_LAYERNORM_KERNELS( U8, F16, F16, KERNEL_SOURCE_3 ) + TENSOR_LAYERNORM_KERNELS_2D( U8, F16, F16, KERNEL_SOURCE_3 ) + TENSOR_LAYERNORM_KERNELS( U8, F32, F16, KERNEL_SOURCE_3 ) + TENSOR_LAYERNORM_KERNELS_2D( U8, F32, F16, KERNEL_SOURCE_3 ) - TENSOR_LAYERNORM_KERNELS( F16, F16, KERNEL_SOURCE_1 ) - 
TENSOR_LAYERNORM_KERNELS_2D( F16, F16, KERNEL_SOURCE_2 ) - TENSOR_LAYERNORM_KERNELS( F16, U8, KERNEL_SOURCE_1 ) - TENSOR_LAYERNORM_KERNELS_2D( F16, U8, KERNEL_SOURCE_2 ) - TENSOR_LAYERNORM_KERNELS( I16, I16, KERNEL_SOURCE_6 ) - TENSOR_LAYERNORM_KERNELS_2D( I16, I16, KERNEL_SOURCE_6 ) + TENSOR_LAYERNORM_KERNELS( F16, F16, F16, KERNEL_SOURCE_1 ) + TENSOR_LAYERNORM_KERNELS_2D( F16, F16, F16, KERNEL_SOURCE_2 ) + TENSOR_LAYERNORM_KERNELS( F16, F16, U8, KERNEL_SOURCE_1 ) + TENSOR_LAYERNORM_KERNELS_2D( F16, F16, U8, KERNEL_SOURCE_2 ) + TENSOR_LAYERNORM_KERNELS( I16, F16, I16, KERNEL_SOURCE_6 ) + TENSOR_LAYERNORM_KERNELS_2D( I16, F16, I16, KERNEL_SOURCE_6 ) TENSOR_LAYERNORM_SCALE_KERNELS( U8, U8, KERNEL_SOURCE_8 ) TENSOR_LAYERNORM_SCALE_KERNELS_2D( U8, U8, KERNEL_SOURCE_9 ) @@ -244,7 +244,7 @@ DEF_KERNEL_INITIALIZER(_layernorm_initializer) {0, 0, 0}}; // globalWorkSize: image size in thread vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL}; - vsi_int_array_t * input_shape = NULL; + vsi_size_array_t * input_shape = NULL; float scaleIn = 1; float scaleOut = 1; float output_zp = 0; @@ -311,9 +311,9 @@ DEF_KERNEL_INITIALIZER(_layernorm_initializer) output_zp = 0.0f; } - width = input_shape->data[0]; - height = input_shape->data[1]; - chn = (input_shape->size <= 2) ? 1 : input_shape->data[2]; + width = (int32_t)(input_shape->data[0]); + height = (int32_t)(input_shape->data[1]); + chn = (int32_t)((input_shape->size <= 2) ? 1 : input_shape->data[2]); iter = ((width + 15) / 16) * 16; sumInZp = input_zp * iter * (-1); @@ -497,6 +497,7 @@ DEF_KERNEL_INITIALIZER(_layernorm_initializer) switch( pack_key ) { case _PACK_SELECT_KEY( U8, F16, F16 ): + case _PACK_SELECT_KEY( U8, F32, F16 ): { status = vsi_nn_kernel_gpu_add_param(node, "UniPackFP16even_2x8", &UniPackFP16even_2x8); @@ -510,10 +511,6 @@ DEF_KERNEL_INITIALIZER(_layernorm_initializer) &uniConvert3rdUint8SubZpToFp32_4x4); status |= vsi_nn_kernel_gpu_add_param(node, "uniConvert4thUint8SubZpToFp32_4x4", &uniConvert4thUint8SubZpToFp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "uniConvertSecFp16Fp32_4x4", - &uniConvertSecFp16Fp32_4x4); - status |= vsi_nn_kernel_gpu_add_param(node, "UniFP16toFP32Lo4_dp4x4", - &UniFP16toFP32Lo4_dp4x4); status |= vsi_nn_kernel_gpu_add_param(node, "e2InScale", &e2InScale); status |= vsi_nn_kernel_gpu_add_param(node, "inputZP", &input_zp); status |= vsi_nn_kernel_gpu_add_param(node, "input_scale", &scaleIn); @@ -688,7 +685,7 @@ DEF_KERNEL_INITIALIZER(_sumsqr_initializer) {0, 0, 0}}; // globalWorkSize: image size in thread vsi_nn_kernel_tensor_attr_t* attr[2] = {NULL, NULL}; - vsi_int_array_t * input_shape = NULL; + vsi_size_array_t * input_shape = NULL; float scaleIn = 1.0f; int32_t input_zp = 0; vx_uint32 iter = 0; @@ -726,9 +723,9 @@ DEF_KERNEL_INITIALIZER(_sumsqr_initializer) input_zp = 0; } - width = input_shape->data[0]; - height = input_shape->data[1]; - chn = attr[1]->shape->data[1]; + width = (int32_t)(input_shape->data[0]); + height = (int32_t)(input_shape->data[1]); + chn = (int32_t)(attr[1]->shape->data[1]); iter = height * 16; e2InScale = scaleIn * scaleIn; @@ -856,7 +853,7 @@ DEF_KERNEL_INITIALIZER(_layernorm_wh_initializer) {0, 0, 0}}; // globalWorkSize: image size in thread vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL}; - vsi_int_array_t * input_shape = NULL; + vsi_size_array_t * input_shape = NULL; float scaleIn = 1.0f; float scaleOut = 1.0f; float output_zp = 0; @@ -910,10 +907,10 @@ DEF_KERNEL_INITIALIZER(_layernorm_wh_initializer) output_zp = 0; } - width = input_shape->data[0]; - height = 
input_shape->data[1]; - chn = attr[1]->shape->data[1]; - height_chn_org = (input_shape->size > 2 ? input_shape->data[2] : 1) / chn; + width = (int32_t)(input_shape->data[0]); + height = (int32_t)(input_shape->data[1]); + chn = (int32_t)(attr[1]->shape->data[1]); + height_chn_org = (int32_t)((input_shape->size > 2 ? input_shape->data[2] : 1) / chn); dimRatio = (float)(1.0 / (width * height)); @@ -1169,7 +1166,6 @@ static vsi_status _query_kernel_wh _sumsqr_kernel_map[i].source_name ); } - key = HASH_LAYERNORM_KEY( input0_dtype, input2_dtype, output_dtype, is2D_wh ); for( i = 0; i < _cnt_of_array(_sumsqr_kernel_map); i ++ ) @@ -1223,10 +1219,10 @@ static vsi_nn_kernel_node_t _setup_wh int32_t axis[VSI_NN_MAX_DIM_NUM] = {0}; int32_t axis_num = 1; int32_t new_axis[VSI_NN_MAX_DIM_NUM] = {0}; - int32_t new_shape[2][VSI_NN_MAX_DIM_NUM] = {{ 1, 1, 1, 1 }}; + vsi_size_t new_shape[2][VSI_NN_MAX_DIM_NUM] = {{ 1, 1, 1, 1 }}; uint32_t axis_size = 0; uint32_t rank_in = 0, rank_para = 0; - uint32_t outer_size = 1; + vsi_size_t outer_size = 1; uint32_t i = 0; for(i = 1; i < inputs[0]->attr.dim_num; i++) @@ -1235,7 +1231,7 @@ static vsi_nn_kernel_node_t _setup_wh } status = vsi_nn_kernel_optimize_tensor_shape( - (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, + inputs[0]->attr.size, inputs[0]->attr.dim_num, axis, axis_num, new_shape[0], &rank_in, new_axis, &axis_size); if ( status == FALSE || axis_size > 2) { @@ -1243,7 +1239,7 @@ static vsi_nn_kernel_node_t _setup_wh } status = vsi_nn_kernel_optimize_tensor_shape( - (int32_t *)inputs[1]->attr.size, inputs[1]->attr.dim_num, + inputs[1]->attr.size, inputs[1]->attr.dim_num, axis, axis_num, new_shape[1], &rank_para, new_axis, &axis_size); if ( status == FALSE || axis_size > 2) { @@ -1379,7 +1375,6 @@ final: return node; } - static vsi_nn_kernel_node_t _setup ( vsi_nn_graph_t * graph, @@ -1396,7 +1391,7 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t node = NULL; vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL, rs_gamma = NULL, rs_beta = NULL; float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); - uint32_t *input_size = inputs[0]->attr.size; + vsi_size_t *input_size = inputs[0]->attr.size; uint32_t dims_num = inputs[0]->attr.dim_num; int32_t rs_flg = 0; int32_t optFlg = 0; @@ -1422,7 +1417,7 @@ static vsi_nn_kernel_node_t _setup if (rs_flg) { - int32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {0}; shape[0] = inputs[0]->attr.size[0]; shape[1] = inputs[0]->attr.size[1] * inputs[0]->attr.size[2]; shape[2] = 1; @@ -1437,7 +1432,7 @@ static vsi_nn_kernel_node_t _setup } if (inputs[1]->attr.dim_num < 2) { - int32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {0}; shape[0] = inputs[1]->attr.size[0]; shape[1] = 1; shape[2] = 1; @@ -1446,7 +1441,7 @@ static vsi_nn_kernel_node_t _setup } if (inputs[2]->attr.dim_num < 2) { - int32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {0}; shape[0] = inputs[2]->attr.size[0]; shape[1] = 1; shape[2] = 1; @@ -1533,4 +1528,3 @@ final: __END_DECLS REGISTER_BACKEND_EVIS( layer_norm, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/evis/log_softmax_evis.c b/src/tim/vx/internal/src/kernel/evis/log_softmax_evis.c index bfe4a96..3ee3028 100644 --- a/src/tim/vx/internal/src/kernel/evis/log_softmax_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/log_softmax_evis.c @@ -161,7 +161,7 @@ DEF_KERNEL_INITIALIZER(_log_softmax_initializer) uint32_t inputWidth = 0; uint32_t inputWidthRemain4 = 0; 
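The recurring change across these hunks is the shape-type migration: shape arrays move from vsi_int_array_t (int32_t elements) to vsi_size_array_t (vsi_size_t elements), calls such as vsi_nn_kernel_gpu_check_shape and vsi_nn_reshape_tensor now take attr.size without a cast, and anything forwarded to a 32-bit GPU kernel parameter is narrowed with an explicit cast. A minimal sketch of that convention, with hypothetical my_* names standing in for the real types (not code from this patch):

#include <stddef.h>
#include <stdint.h>

typedef size_t my_size_t;  /* stand-in for vsi_size_t; assumed unsigned and pointer-sized */
typedef struct { my_size_t data[6]; size_t size; } my_size_array_t;

/* Shape data is my_size_t, but GPU kernel parameters stay 32-bit, so
 * values are narrowed explicitly at the boundary, as the hunks above do. */
static void my_fill_gpu_params(const my_size_array_t *shape,
                               int32_t *width, int32_t *height, int32_t *chn)
{
    *width  = (int32_t)shape->data[0];
    *height = (int32_t)(shape->size > 1 ? shape->data[1] : 1);
    *chn    = (int32_t)(shape->size > 2 ? shape->data[2] : 1);
}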
vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL, NULL }; - vsi_int_array_t * output_shape = NULL; + vsi_size_array_t * output_shape = NULL; float logE = (float)(log10(exp(1.0f)) / log10(2.0f)); float rlogE = (float)(log10(2.0f) / log10(exp(1.0f))); float scaleLogE = 0; @@ -324,8 +324,8 @@ DEF_KERNEL_INITIALIZER(_log_softmax_initializer) { case 0: { - inputWidth = output_shape->data[axis] / 4 * 4; - inputWidthRemain4 = output_shape->data[axis] % 4; + inputWidth = (uint32_t)(output_shape->data[axis] / 4 * 4); + inputWidthRemain4 = (uint32_t)(output_shape->data[axis] % 4); status = vsi_nn_kernel_gpu_add_param( node, "inputWidth", &inputWidth ); @@ -532,7 +532,7 @@ static vsi_nn_kernel_node_t _setup axis = vsi_nn_kernel_param_get_int32(params, "axis"); beta = vsi_nn_kernel_param_get_float32(params, "beta"); - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num ) || axis > 2) { diff --git a/src/tim/vx/internal/src/kernel/evis/logical_not_evis.c b/src/tim/vx/internal/src/kernel/evis/logical_not_evis.c index f2f915f..69c0434 100644 --- a/src/tim/vx/internal/src/kernel/evis/logical_not_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/logical_not_evis.c @@ -110,7 +110,7 @@ DEF_KERNEL_INITIALIZER(_logical_not_initializer) vx_tensor output = (vx_tensor)param[1]; vsi_nn_kernel_tensor_attr_t *output_attr = NULL; - vsi_int_array_t *output_shape = NULL; + vsi_size_array_t *output_shape = NULL; output_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)output); CHECK_PTR_FAIL_GOTO( output_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); @@ -222,7 +222,7 @@ static vsi_nn_kernel_node_t _setup vsi_bool image_2d = FALSE; vsi_nn_kernel_node_t node = NULL; - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/evis/logical_ops_evis.c b/src/tim/vx/internal/src/kernel/evis/logical_ops_evis.c index 38e9df3..ed9561c 100644 --- a/src/tim/vx/internal/src/kernel/evis/logical_ops_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/logical_ops_evis.c @@ -117,7 +117,7 @@ DEF_KERNEL_INITIALIZER(_logical_ops_initializer) vx_tensor output = (vx_tensor)param[2]; vsi_nn_kernel_dtype_e input_dtype = F16; vsi_nn_kernel_tensor_attr_t *input_attr = NULL, *output_attr = NULL; - vsi_int_array_t *output_shape = NULL; + vsi_size_array_t *output_shape = NULL; input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input); CHECK_PTR_FAIL_GOTO( input_attr, "vsi_nn_kernel_tensor_attr_create fail.", final ); @@ -262,7 +262,7 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t node = NULL; uint32_t ops_type = vsi_nn_kernel_param_get_int32( params, "ops_type" ); - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c b/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c index 53fce38..3a1eb37 100644 --- a/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/matrixmul_evis.c @@ -298,9 +298,9 @@ DEF_KERNEL_INITIALIZER(_matrix_mul_initializer) ac2zero = 1; } - width = attr[2]->shape->data[0]; - height = attr[2]->shape->data[1]; - chn = attr[2]->shape->size > 2 ? 
attr[2]->shape->data[2] : 1; + width = (int32_t)(attr[2]->shape->data[0]); + height = (int32_t)(attr[2]->shape->data[1]); + chn = (int32_t)(attr[2]->shape->size > 2 ? attr[2]->shape->data[2] : 1); gpu_param.global_scale[0] = 4; gpu_param.global_scale[1] = 4; @@ -1048,10 +1048,10 @@ static vsi_nn_kernel_node_t _setup int32_t transposeB = vsi_nn_kernel_param_get_int32( params, "transposeB" ); int32_t adjointA = vsi_nn_kernel_param_get_int32( params, "adjointA" ); int32_t adjointB = vsi_nn_kernel_param_get_int32( params, "adjointB" ); - uint32_t M = inputs[0]->attr.size[1]; - uint32_t K = inputs[0]->attr.size[0]; - uint32_t N = inputs[1]->attr.size[0]; - uint32_t depthA = 1, depthB = 1; + vsi_size_t M = inputs[0]->attr.size[1]; + vsi_size_t K = inputs[0]->attr.size[0]; + vsi_size_t N = inputs[1]->attr.size[0]; + vsi_size_t depthA = 1, depthB = 1; if ((inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 && inputs[1]->attr.dtype.vx_type == VSI_NN_TYPE_FLOAT32 @@ -1061,7 +1061,7 @@ static vsi_nn_kernel_node_t _setup return NULL; } - if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; @@ -1081,7 +1081,7 @@ static vsi_nn_kernel_node_t _setup depthB = inputs[1]->attr.dim_num > 2 ? inputs[1]->attr.size[2] : 1; if (M == 1 && depthB == 1 && depthA > 1) { - int32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {0}; shape[0] = inputs[0]->attr.size[0]; shape[1] = inputs[0]->attr.size[2]; shape[2] = 1; @@ -1125,11 +1125,11 @@ static vsi_nn_kernel_node_t _setup CHECK_STATUS(status); vsi_nn_kernel_scalar_release( &tmp_params[3] ); vsi_nn_kernel_scalar_release( &tmp_params[4] ); - vsi_nn_kernel_tensor_release( &tmp_params[5] ); - vsi_nn_kernel_tensor_release( &tmp_params[6] ); - vsi_nn_kernel_tensor_release( &tmp_params[7] ); - vsi_nn_kernel_tensor_release( &tmp_params[8] ); - vsi_nn_kernel_tensor_release( &tmp_params[9] ); + vsi_nn_kernel_scalar_release( &tmp_params[5] ); + vsi_nn_kernel_scalar_release( &tmp_params[6] ); + vsi_nn_kernel_scalar_release( &tmp_params[7] ); + vsi_nn_kernel_scalar_release( &tmp_params[8] ); + vsi_nn_kernel_scalar_release( &tmp_params[9] ); { // Set default border mode. 
vx_border_t border; @@ -1166,4 +1166,3 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_EVIS( matrixmul, _setup ) - diff --git a/src/tim/vx/internal/src/kernel/evis/maximum_evis.c b/src/tim/vx/internal/src/kernel/evis/maximum_evis.c index 68dc6e8..3c76c65 100644 --- a/src/tim/vx/internal/src/kernel/evis/maximum_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/maximum_evis.c @@ -177,7 +177,7 @@ DEF_KERNEL_INITIALIZER(_maximum_initializer) int32_t shift1 = 0; vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; - vsi_int_array_t * out_shape = NULL; + vsi_size_array_t * out_shape = NULL; uint32_t pack_key; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); @@ -723,7 +723,7 @@ static vsi_nn_kernel_node_t _setup vsi_nn_type_e dtype1 = inputs[0]->attr.dtype.vx_type; vsi_nn_type_e dtype2 = inputs[1]->attr.dtype.vx_type; - if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/evis/minimum_evis.c b/src/tim/vx/internal/src/kernel/evis/minimum_evis.c index 02d7523..16be973 100644 --- a/src/tim/vx/internal/src/kernel/evis/minimum_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/minimum_evis.c @@ -177,7 +177,7 @@ DEF_KERNEL_INITIALIZER(_minimum_initializer) int32_t shift1 = 0; vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; - vsi_int_array_t * out_shape = NULL; + vsi_size_array_t * out_shape = NULL; uint32_t pack_key; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); @@ -723,7 +723,7 @@ static vsi_nn_kernel_node_t _setup vsi_nn_type_e dtype1 = inputs[0]->attr.dtype.vx_type; vsi_nn_type_e dtype2 = inputs[1]->attr.dtype.vx_type; - if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/evis/moments_evis.c b/src/tim/vx/internal/src/kernel/evis/moments_evis.c index 01998f3..2379574 100644 --- a/src/tim/vx/internal/src/kernel/evis/moments_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/moments_evis.c @@ -190,7 +190,7 @@ DEF_KERNEL_INITIALIZER(_moments_initializer) {0, 0, 0}}; // globalWorkSize: image size in thread vsi_nn_kernel_tensor_attr_t* attr[3] = {NULL, NULL, NULL}; - vsi_int_array_t * input_shape = NULL; + vsi_size_array_t * input_shape = NULL; float scaleIn = 0; int32_t input_zp = 0; vx_uint32 iter = 0; @@ -313,9 +313,9 @@ DEF_KERNEL_INITIALIZER(_moments_initializer) iterSize = 8; } - width = input_shape->data[0]; - height = input_shape->size > 1 ? input_shape->data[1] : 1; - chn = input_shape->size > 2 ? input_shape->data[2] : 1; + width = (int32_t)(input_shape->data[0]); + height = (int32_t)(input_shape->size > 1 ? input_shape->data[1] : 1); + chn = (int32_t)(input_shape->size > 2 ? 
input_shape->data[2] : 1); shaderParam.global_scale[0] = 1; shaderParam.global_scale[1] = 1; @@ -753,7 +753,7 @@ static vsi_nn_kernel_node_t _setup size_t axis_num_temp = 0; int32_t* axis = (int32_t *) vsi_nn_kernel_param_get_buffer( params, "axis", &axis_num_temp); int32_t axis_first = axis[0]; - int32_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 1, 1, 1, 1 } }; + vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 1, 1, 1, 1 } }; vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; int32_t new_axis[VSI_NN_MAX_DIM_NUM] = {0}; @@ -782,9 +782,9 @@ static vsi_nn_kernel_node_t _setup } ret = vsi_nn_kernel_optimize_reduce_shape( - (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, + inputs[0]->attr.size, inputs[0]->attr.dim_num, axis, axis_num, - (int32_t *)outputs[0]->attr.size, outputs[0]->attr.dim_num, + outputs[0]->attr.size, outputs[0]->attr.dim_num, shapes[0], &rank_in, shapes[1], &rank_out, new_axis, &axis_size); @@ -794,13 +794,13 @@ static vsi_nn_kernel_node_t _setup } reshape_tensors[0] = vsi_nn_reshape_tensor( graph, - inputs[0], (uint32_t*)shapes[0], rank_in ); + inputs[0], shapes[0], rank_in ); reshape_tensors[1] = vsi_nn_reshape_tensor( graph, - outputs[0], (uint32_t*)shapes[1], rank_out ); + outputs[0], shapes[1], rank_out ); reshape_tensors[2] = vsi_nn_reshape_tensor( graph, - outputs[1], (uint32_t*)shapes[1], rank_out ); + outputs[1], shapes[1], rank_out ); - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)reshape_tensors[1]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[1]->attr.size, reshape_tensors[1]->attr.dim_num ) ) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/evis/one_hot_evis.c b/src/tim/vx/internal/src/kernel/evis/one_hot_evis.c index f1798c2..60de16a 100644 --- a/src/tim/vx/internal/src/kernel/evis/one_hot_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/one_hot_evis.c @@ -128,7 +128,7 @@ DEF_KERNEL_INITIALIZER(_one_hot_initializer) vsi_status status = VSI_FAILURE; vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; - vsi_int_array_t * in_shape = NULL; + vsi_size_array_t * in_shape = NULL; int32_t suffix_size = 0; int32_t depth = 0; int32_t input_zp = 0; @@ -144,7 +144,7 @@ DEF_KERNEL_INITIALIZER(_one_hot_initializer) vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[SCALAR_INPUT_SUFFIX_SIZE], &(suffix_size)); in_shape = attr[0]->shape; - depth = attr[1]->shape->data[1]; + depth = (int32_t)(attr[1]->shape->data[1]); input_dtype = attr[0]->dtype; if (VSI_NN_KERNEL_QUANT_DFP == attr[0]->quant) @@ -162,7 +162,7 @@ DEF_KERNEL_INITIALIZER(_one_hot_initializer) gpu_param.global_scale[0] = 4; gpu_param.global_scale[1] = 1; - depth = attr[1]->shape->data[0]; + depth = (int32_t)(attr[1]->shape->data[0]); } else { @@ -358,12 +358,12 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t node_params[_ONE_HOT_PARAM_NUM] = {NULL}; vsi_nn_kernel_node_t node = NULL; vsi_nn_tensor_t* rs_tensors[2] = { NULL }; - int32_t shape[2][VSI_NN_MAX_DIM_NUM] = {{ 0 }}; + vsi_size_t shape[2][VSI_NN_MAX_DIM_NUM] = {{ 0 }}; int32_t i = 0; vsi_bool image_2d = FALSE; - int32_t num_elements = vsi_nn_vxGetTensorElementNum(&inputs[0]->attr); - int32_t prefix_dim_size = 1; - int32_t suffix_dim_size = 0; + vsi_size_t num_elements = vsi_nn_vxGetTensorElementNum(&inputs[0]->attr); + vsi_size_t prefix_dim_size = 1; + vsi_size_t suffix_dim_size = 0; int32_t depth = vsi_nn_kernel_param_get_int32( params, "depth" ); uint32_t data_u32[2] = {0}; float on_value = vsi_nn_kernel_param_get_float32( params, "on_value" ); @@ -379,7 +379,7 @@ static vsi_nn_kernel_node_t _setup 
prefix_dim_size *= inputs[0]->attr.size[i]; } - suffix_dim_size = num_elements / prefix_dim_size; + suffix_dim_size = (int32_t)(num_elements / prefix_dim_size); if (suffix_dim_size == 1) { @@ -399,11 +399,11 @@ static vsi_nn_kernel_node_t _setup } rs_tensors[0] = vsi_nn_reshape_tensor( graph, - inputs[0], (uint32_t*)shape[0], 2 ); + inputs[0], shape[0], 2 ); rs_tensors[1] = vsi_nn_reshape_tensor( graph, - outputs[0], (uint32_t*)shape[1], 3 ); + outputs[0], shape[1], 3 ); - if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)rs_tensors[1]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( rs_tensors[1]->attr.size, rs_tensors[1]->attr.dim_num ) ) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/evis/poolwithargmax_evis.c b/src/tim/vx/internal/src/kernel/evis/poolwithargmax_evis.c index 603be47..a625d97 100644 --- a/src/tim/vx/internal/src/kernel/evis/poolwithargmax_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/poolwithargmax_evis.c @@ -133,7 +133,7 @@ DEF_KERNEL_INITIALIZER(_poolwithargmax_initializer) }; vsi_nn_kernel_tensor_attr_t *input_attr = NULL; vsi_nn_kernel_tensor_attr_t *output_attr = NULL; - vsi_int_array_t * input_shape = NULL; + vsi_size_array_t * input_shape = NULL; vsi_nn_kernel_dtype_e src_dtype = F16; vsi_nn_kernel_dtype_e dst_dtype = F16; int32_t input_fl = 0; @@ -622,11 +622,11 @@ static vsi_nn_kernel_node_t _setup return NULL; } - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num ) - || !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + || !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) - || !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[1]->attr.size, + || !vsi_nn_kernel_gpu_check_shape( outputs[1]->attr.size, outputs[1]->attr.dim_num )) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/evis/pow_evis.c b/src/tim/vx/internal/src/kernel/evis/pow_evis.c index 35b25a0..0ffd627 100644 --- a/src/tim/vx/internal/src/kernel/evis/pow_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pow_evis.c @@ -203,11 +203,11 @@ DEF_KERNEL_INITIALIZER(_pow_initializer) uint16_t M0 = 0; uint16_t M1 = 0; - uint32_t zAx = 1; + vsi_size_t zAx = 1; uint32_t pack_key = 0; // dim number ??? 
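These initializers pick a GPU uniform set by packing the input, scale and output dtypes (plus a 2D flag) into one 32-bit key, as HASH_LAYERNORM_KEY and _PACK_SELECT_KEY do above; the new U8/F32/F16 layer-norm entries simply add another key that falls into the same switch case. A short sketch of the idea, with illustrative macro and enum values rather than the project's definitions:

#include <stdint.h>

enum { MY_U8 = 1, MY_F16 = 2, MY_F32 = 3 };  /* placeholder dtype codes */

/* Pack (input, scale, output, flag) into one selection key, mirroring
 * the ((in0 << 24) | (in2 << 16) | (out << 8) | flag) layout above. */
#define MY_PACK_KEY(in0, in2, out, flag) \
    (((uint32_t)(in0) << 24) | ((uint32_t)(in2) << 16) | \
     ((uint32_t)(out) << 8) | (uint32_t)(flag))

static const char * my_pick_kernel(uint32_t key)
{
    switch (key)
    {
        case MY_PACK_KEY(MY_U8, MY_F16, MY_F16, 0):
        case MY_PACK_KEY(MY_U8, MY_F32, MY_F16, 0):  /* new scale-type variant */
            return "layer_norm_U8toF16";
        default:
            return "unsupported";
    }
}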
vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; - vsi_int_array_t * out_shape = NULL; + vsi_size_array_t * out_shape = NULL; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); @@ -629,7 +629,7 @@ static vsi_nn_kernel_node_t _setup vsi_bool image_2d = FALSE; vsi_nn_kernel_node_t node = NULL; - if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_bgra_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_bgra_evis.c index 42ff180..c543f96 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_bgra_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_bgra_evis.c @@ -113,7 +113,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_bgra_initializer) int32_t enable_copy= 0; vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; - vsi_int_array_t * out_shape = NULL; + vsi_size_array_t * out_shape = NULL; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); @@ -128,8 +128,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_bgra_initializer) out_shape = attr[0]->shape; dstZP = attr[0]->asymm.zero_point; outputScale = attr[0]->asymm.scale; - width = out_shape->data[0]; - height = out_shape->data[1]; + width = (uint32_t)(out_shape->data[0]); + height = (uint32_t)(out_shape->data[1]); if (reorder != 0) { @@ -446,11 +446,11 @@ static vsi_nn_kernel_node_t _setup vsi_status status = VSI_FAILURE; vsi_nn_kernel_node_param_t tmp_params[_EVIS_PRE_PROCESS_BGRA_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_t node = NULL; - int32_t shapes[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1}; + vsi_size_t shapes[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1}; vsi_nn_tensor_t* reshape_tensors[1] = {NULL}; int32_t trans = 0; - if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; @@ -480,7 +480,7 @@ static vsi_nn_kernel_node_t _setup shapes[1] = outputs[0]->attr.size[2]; reshape_tensors[0] = vsi_nn_reshape_tensor( graph, - outputs[0], (uint32_t*)shapes, outputs[0]->attr.dim_num); + outputs[0], shapes, outputs[0]->attr.dim_num); vsi_nn_kernel_node_pack_io( tmp_params, _EVIS_PRE_PROCESS_BGRA_PARAM_NUM, inputs, 1, &reshape_tensors[0], 1 ); diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_gray_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_gray_evis.c index f54396a..dc478f9 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_gray_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_gray_evis.c @@ -114,7 +114,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_gray_copy_initializer) uint32_t height = 0; vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; - vsi_int_array_t * out_shape = NULL; + vsi_size_array_t * out_shape = NULL; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); @@ -122,8 +122,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_gray_copy_initializer) out_shape = attr[0]->shape; dstZP = (float)attr[0]->asymm.zero_point; outputScale = attr[0]->asymm.scale; - width = out_shape->data[0]; - height = out_shape->data[1]; + width = (uint32_t)(out_shape->data[0]); + height = (uint32_t)(out_shape->data[1]); if( attr[0]->quant == 
VSI_NN_KERNEL_QUANT_DFP ) { @@ -215,7 +215,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_gray_initializer) uint32_t height = 0; vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; - vsi_int_array_t * out_shape = NULL; + vsi_size_array_t * out_shape = NULL; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); @@ -223,8 +223,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_gray_initializer) out_shape = attr[0]->shape; dstZP = (float)attr[0]->asymm.zero_point; outputScale = attr[0]->asymm.scale; - width = out_shape->data[0]; - height = out_shape->data[1]; + width = (uint32_t)(out_shape->data[0]); + height = (uint32_t)(out_shape->data[1]); if( attr[0]->quant == VSI_NN_KERNEL_QUANT_DFP ) { @@ -436,7 +436,7 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t tmp_params[_EVIS_PRE_PROCESS_GRAY_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_t node = NULL; - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_evis.c index 8ce0467..23ae619 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_nv12_evis.c @@ -131,7 +131,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer) float bMeanScaleVarZp = 0.0f, gMeanScaleVarZp = 0.0f, rMeanScaleVarZp = 0.0f; vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; - vsi_int_array_t * out_shape = NULL; + vsi_size_array_t * out_shape = NULL; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); @@ -150,8 +150,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_copy_initializer) out_shape = attr[0]->shape; dstZP = attr[0]->asymm.zero_point; dstScale = attr[0]->asymm.scale; - width = out_shape->data[0]; - height = out_shape->data[1]; + width = (uint32_t)(out_shape->data[0]); + height = (uint32_t)(out_shape->data[1]); if (reorder != 0) { @@ -295,15 +295,19 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer) int32_t order1 = 2; uint32_t width = 0; uint32_t height = 0; + uint32_t roi_width = 0; + uint32_t roi_height = 0; uint32_t xrIntFloat_16 = 0; uint32_t yrIntFloat_16 = 0; + int32_t xRatio = 0; + int32_t yRatio = 0; float bMean = 0.0f, gMean= 0.0f, rMean = 0.0f, var = 0.0f; float outputScaleVar = 0.0f; float bMeanScaleVarZp = 0.0f, gMeanScaleVarZp = 0.0f, rMeanScaleVarZp = 0.0f; float resize = 0.0f; vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; - vsi_int_array_t * out_shape = NULL; + vsi_size_array_t * out_shape = NULL; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); @@ -311,6 +315,10 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer) attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[2] ); CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &xRatio); + CHECK_STATUS_FAIL_GOTO(status, OnError ); + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[4], &yRatio); + CHECK_STATUS_FAIL_GOTO(status, OnError ); status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[7], &rMean); 
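The nv12 pre-process hunks above start reading scale_x and scale_y (xRatio/yRatio) as Q15 fixed-point ratios and derive the ROI size and the 16.16 sampling step from them rather than from the full input width. A small sketch of that arithmetic, using hypothetical names and assuming only the Q15 convention implied by the >> 15 shifts:

#include <stdint.h>

/* roi = out * ratio with ratio in Q15; the per-pixel step is 16.16
 * fixed point, matching the ((roi << 16) / out + 1) form in the patch. */
static void my_nv12_scale(int32_t x_ratio_q15, uint32_t out_w,
                          uint32_t *roi_w, uint32_t *step_16_16)
{
    *roi_w      = (uint32_t)(((int64_t)x_ratio_q15 * out_w) >> 15);
    *step_16_16 = (uint32_t)((((uint64_t)*roi_w) << 16) / out_w + 1);
}

/* Example: out_w = 224 with x_ratio_q15 = 65536 (2.0 in Q15) gives
 * roi_w = 448 and a step of 2.0 in 16.16 format (plus the +1 bias). */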
CHECK_STATUS_FAIL_GOTO(status, OnError ); status = vsi_nn_kernel_scalar_read_float32((vsi_nn_kernel_scalar_t)param[8], &gMean); @@ -325,8 +333,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer) out_shape = attr[1]->shape; dstZP = attr[1]->asymm.zero_point; dstScale = attr[1]->asymm.scale; - width = out_shape->data[0]; - height = out_shape->data[1]; + width = (uint32_t)(out_shape->data[0]); + height = (uint32_t)(out_shape->data[1]); if (reorder != 0) { @@ -334,9 +342,11 @@ DEF_KERNEL_INITIALIZER(_pre_process_nv12_initializer) order1 = 0; } - resize = (float)width / attr[0]->shape->data[0]; - xrIntFloat_16 = (attr[0]->shape->data[0] << 16) / width + 1; - yrIntFloat_16 = (attr[0]->shape->data[1] << 16) / height + 1; + roi_width = (xRatio * width) >> 15; + roi_height = (yRatio * height) >> 15; + resize = (float)width / roi_width; + xrIntFloat_16 = (uint32_t)((roi_width << 16) / width + 1); + yrIntFloat_16 = (uint32_t)((roi_height << 16) / height + 1); if (attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM) { @@ -524,7 +534,8 @@ static vsi_status _query_kernel vsi_nn_tensor_t* const* const inputs, vsi_nn_tensor_t* const* const outputs, vsi_nn_kernel_t* kernel, - const vsi_nn_kernel_param_t * params + const vsi_nn_kernel_param_t * params, + int32_t scale_x ) { vsi_nn_kernel_dtype_e input0_dtype = U8; @@ -534,9 +545,8 @@ static vsi_status _query_kernel uint32_t key = 0; int i = 0; vsi_bool enable_copy = vsi_nn_kernel_param_get_int32( params, "enable_copy" ); - uint32_t srcWidth = inputs[0]->attr.size[0]; - uint32_t dstWidth = outputs[0]->attr.size[0]; - float scaleVal = (float)dstWidth / srcWidth; + vsi_size_t dstWidth = outputs[0]->attr.size[0]; + float scaleVal = (float)dstWidth / ((scale_x * dstWidth) >> 15); uint32_t optFlg = 0; input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); @@ -605,21 +615,21 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t node = NULL; vsi_nn_tensor_t* reshape_tensors[1] = {NULL}; int32_t trans = 0; + int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" ); - if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; } - status = _query_kernel( inputs, outputs, kernel, params ); + status = _query_kernel( inputs, outputs, kernel, params, scale_x ); if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); if ( node ) { uint32_t index = 3; - int32_t scale_x = vsi_nn_kernel_param_get_int32( params, "scale_x" ); int32_t scale_y = vsi_nn_kernel_param_get_int32( params, "scale_y" ); int32_t left = vsi_nn_kernel_param_get_int32( params, "left" ); int32_t top = vsi_nn_kernel_param_get_int32( params, "top" ); diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb_evis.c index a7a6cb1..4181414 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb_evis.c @@ -137,7 +137,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer) uint32_t pack_key = 0; vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; - vsi_int_array_t * out_shape = NULL; + vsi_size_array_t * out_shape = NULL; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); @@ -152,8 +152,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_rgb_initializer) out_shape = attr[0]->shape; outputZP = 
(float)attr[0]->asymm.zero_point; outputScale = attr[0]->asymm.scale; - width = out_shape->data[0]; - height = out_shape->data[1]; + width = (uint32_t)(out_shape->data[0]); + height = (uint32_t)(out_shape->data[1]); if (reorder != 0) { @@ -581,7 +581,7 @@ static vsi_nn_kernel_node_t _setup vsi_nn_tensor_t* reshape_tensors[1] = {NULL}; int32_t trans = 0; - if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv420_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv420_evis.c index 7ab900b..7a5c50c 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv420_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv420_evis.c @@ -124,7 +124,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_copy_initializer) uint32_t height = 0; vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; - vsi_int_array_t * out_shape = NULL; + vsi_size_array_t * out_shape = NULL; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); @@ -137,8 +137,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_copy_initializer) out_shape = attr[0]->shape; dstZP = attr[0]->asymm.zero_point; dstScale = attr[0]->asymm.scale; - width = out_shape->data[0]; - height = out_shape->data[1]; + width = (uint32_t)(out_shape->data[0]); + height = (uint32_t)(out_shape->data[1]); if (reorder != 0) { @@ -494,7 +494,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_initializer) uint32_t height = 0; vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; - vsi_int_array_t * out_shape = NULL; + vsi_size_array_t * out_shape = NULL; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); @@ -505,8 +505,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv420_initializer) out_shape = attr[0]->shape; dstZP = attr[0]->asymm.zero_point; dstScale = attr[0]->asymm.scale; - width = out_shape->data[0]; - height = out_shape->data[1]; + width = (uint32_t)(out_shape->data[0]); + height = (uint32_t)(out_shape->data[1]); if (reorder != 0) { @@ -924,7 +924,7 @@ static vsi_nn_kernel_node_t _setup vsi_nn_tensor_t* reshape_tensors[1] = {NULL}; int32_t trans = 0; - if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv444_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv444_evis.c index 262aa5d..d96e81d 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_yuv444_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_yuv444_evis.c @@ -121,7 +121,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_copy_initializer) uint32_t height = 0; vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; - vsi_int_array_t * out_shape = NULL; + vsi_size_array_t * out_shape = NULL; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); @@ -132,8 +132,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_copy_initializer) out_shape = attr[0]->shape; dstZP = attr[0]->asymm.zero_point; dstScale = attr[0]->asymm.scale; - width = out_shape->data[0]; - height = out_shape->data[1]; + width = (uint32_t)(out_shape->data[0]); + 
height = (uint32_t)(out_shape->data[1]); if (reorder != 0) { @@ -475,7 +475,7 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_initializer) uint32_t height = 0; vsi_nn_kernel_tensor_attr_t * attr[1] = { NULL }; - vsi_int_array_t * out_shape = NULL; + vsi_size_array_t * out_shape = NULL; attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[3] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); @@ -486,8 +486,8 @@ DEF_KERNEL_INITIALIZER(_pre_process_yuv444_initializer) out_shape = attr[0]->shape; dstZP = attr[0]->asymm.zero_point; dstScale = attr[0]->asymm.scale; - width = out_shape->data[0]; - height = out_shape->data[1]; + width = (uint32_t)(out_shape->data[0]); + height = (uint32_t)(out_shape->data[1]); if (reorder != 0) { @@ -899,7 +899,7 @@ static vsi_nn_kernel_node_t _setup vsi_nn_tensor_t* reshape_tensors[1] = {NULL}; int32_t trans = 0; - if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/evis/prelu_evis.c b/src/tim/vx/internal/src/kernel/evis/prelu_evis.c index e74fb72..2bed1e4 100644 --- a/src/tim/vx/internal/src/kernel/evis/prelu_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/prelu_evis.c @@ -138,7 +138,7 @@ DEF_KERNEL_INITIALIZER(_prelu_initializer) uint32_t evis_version = 0; vsi_nn_kernel_tensor_attr_t * attr[3] = { NULL }; - vsi_int_array_t * out_shape = NULL; + vsi_size_array_t * out_shape = NULL; uint32_t pack_key; vx_context ctx = vxGetContext((vx_reference)node); vx_hardware_caps_params_t hw_param; @@ -578,8 +578,8 @@ static vsi_nn_kernel_node_t _setup vsi_bool image_2d = FALSE; vsi_nn_kernel_node_t node = NULL; vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; - int32_t shapes[3][VSI_NN_MAX_DIM_NUM] = { { 0 } }; - uint32_t new_rank = 0; + vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = { { 0 } }; + vsi_size_t new_rank = 0; vsi_bool ret; int32_t is_per_channel_alpha = 0; @@ -591,26 +591,26 @@ static vsi_nn_kernel_node_t _setup } ret = vsi_nn_kernel_optimize_eltwise_shape( - (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, - (int32_t *)inputs[1]->attr.size, inputs[1]->attr.dim_num, - (int32_t *)outputs[0]->attr.size, outputs[0]->attr.dim_num, + inputs[0]->attr.size, inputs[0]->attr.dim_num, + inputs[1]->attr.size, inputs[1]->attr.dim_num, + outputs[0]->attr.size, outputs[0]->attr.dim_num, shapes[0], shapes[1], shapes[2], &new_rank ); if (ret) { reshape_tensors[0] = vsi_nn_reshape_tensor( graph, - inputs[0], (uint32_t*)shapes[0], new_rank ); + inputs[0], shapes[0], (uint32_t)new_rank ); reshape_tensors[1] = vsi_nn_reshape_tensor( graph, - inputs[1], (uint32_t*)shapes[1], new_rank ); + inputs[1], shapes[1], (uint32_t)new_rank ); reshape_tensors[2] = vsi_nn_reshape_tensor( graph, - outputs[0], (uint32_t*)shapes[2], new_rank ); + outputs[0], shapes[2], (uint32_t)new_rank ); } else { return NULL; } - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)reshape_tensors[2]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[2]->attr.size, reshape_tensors[2]->attr.dim_num ) ) { goto final; diff --git a/src/tim/vx/internal/src/kernel/evis/random_multinomial_evis.c b/src/tim/vx/internal/src/kernel/evis/random_multinomial_evis.c index 721b835..daa4060 100644 --- a/src/tim/vx/internal/src/kernel/evis/random_multinomial_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/random_multinomial_evis.c @@ -149,7 +149,7 @@ DEF_KERNEL_INITIALIZER(_multinomial_initializer) {0, 0, 
0} }; vsi_nn_kernel_tensor_attr_t * attr = NULL; - vsi_int_array_t * in_shape = NULL; + vsi_size_array_t * in_shape = NULL; attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr, "Create tensor attr buffer fail.", final ); @@ -191,7 +191,7 @@ DEF_KERNEL_INITIALIZER(_cdf_initializer) {0, 0, 0} }; vsi_nn_kernel_tensor_attr_t * attr = NULL; - vsi_int_array_t * in_shape = NULL; + vsi_size_array_t * in_shape = NULL; uint32_t class_max_iter = 0; uint32_t class_size = 0; uint32_t batch = 0; @@ -201,8 +201,8 @@ DEF_KERNEL_INITIALIZER(_cdf_initializer) in_shape = attr->shape; - class_size = in_shape->data[0]; - batch = in_shape->data[1]; + class_size = (uint32_t)(in_shape->data[0]); + batch = (uint32_t)(in_shape->data[1]); if (attr->dtype == F32) { class_max_iter = (class_size + 3) >> 2; @@ -286,7 +286,7 @@ DEF_KERNEL_INITIALIZER(_seed_initializer) {0, 0, 0} }; vsi_nn_kernel_tensor_attr_t * attr = NULL; - vsi_int_array_t * out_shape = NULL; + vsi_size_array_t * out_shape = NULL; uint32_t stride = 0; uint32_t iter = 8; float rand_max = (float)(pow(2.0,32)); @@ -296,7 +296,7 @@ DEF_KERNEL_INITIALIZER(_seed_initializer) CHECK_PTR_FAIL_GOTO( attr, "Create tensor attr buffer fail.", final ); out_shape = attr->shape; - iter = (out_shape->data[0] + 3) / 4; + iter = (uint32_t)((out_shape->data[0] + 3) / 4); stride = iter * 4; @@ -420,14 +420,14 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_t * ikernels[INTERNAL_KERNEL_SIZE] = { NULL }; vsi_nn_tensor_t * tensors[INTERNAL_KERNEL_SIZE] = { NULL }; int32_t class_max_stride = 0; - int32_t class_size = 0; + vsi_size_t class_size = 0; uint32_t hashkeys[INTERNAL_KERNEL_SIZE] = { 0 }; uint32_t hashkey = 0; int32_t i; // Check if gpu can support the size if( !vsi_nn_kernel_gpu_check_shape( - (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) + outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; } @@ -463,7 +463,7 @@ static vsi_nn_kernel_node_t _setup attr.size[1] = 1; attr.dim_num = 2; tensors[SEEDS_INDEX] = vsi_nn_reshape_tensor( graph, - inputs[1], (uint32_t*)attr.size, attr.dim_num ); + inputs[1], attr.size, attr.dim_num ); in0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); in1_dtype = vsi_nn_kernel_map_dtype( inputs[1]->attr.dtype.vx_type ); diff --git a/src/tim/vx/internal/src/kernel/evis/reduceall_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/reduceall_internal_evis.c index 845a692..caf40b9 100644 --- a/src/tim/vx/internal/src/kernel/evis/reduceall_internal_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/reduceall_internal_evis.c @@ -107,8 +107,8 @@ DEF_KERNEL_INITIALIZER(_reduceall_internal_initializer) int32_t axis = 0; vsi_nn_kernel_tensor_attr_t *input_attr = NULL; vsi_nn_kernel_tensor_attr_t *output_attr = NULL; - vsi_int_array_t * input_shape = NULL; - vsi_int_array_t * output_shape = NULL; + vsi_size_array_t * input_shape = NULL; + vsi_size_array_t * output_shape = NULL; int32_t axisSize = 0; input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); @@ -143,7 +143,7 @@ DEF_KERNEL_INITIALIZER(_reduceall_internal_initializer) (output_shape->data[1] + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1]); gpu_param.global_size[2] = 1; - axisSize = input_shape->data[axis]; + axisSize = (int32_t)(input_shape->data[axis]); { gpu_dp_inst_t uniS8AddAll_16x1 = {{ @@ -260,9 +260,9 @@ static vsi_nn_kernel_node_t _setup axis = vsi_nn_kernel_param_get_int32(params, "axis"); - if( !vsi_nn_kernel_gpu_check_shape( 
(int32_t*)inputs[0]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num ) - || !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + || !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) || axis > 2) { diff --git a/src/tim/vx/internal/src/kernel/evis/reduceany_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/reduceany_internal_evis.c index 4773ee5..df45307 100644 --- a/src/tim/vx/internal/src/kernel/evis/reduceany_internal_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/reduceany_internal_evis.c @@ -107,8 +107,8 @@ DEF_KERNEL_INITIALIZER(_reduceany_internal_initializer) int32_t axis = 0; vsi_nn_kernel_tensor_attr_t *input_attr = NULL; vsi_nn_kernel_tensor_attr_t *output_attr = NULL; - vsi_int_array_t * input_shape = NULL; - vsi_int_array_t * output_shape = NULL; + vsi_size_array_t * input_shape = NULL; + vsi_size_array_t * output_shape = NULL; int32_t axisSize = 0; input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); @@ -143,7 +143,7 @@ DEF_KERNEL_INITIALIZER(_reduceany_internal_initializer) (output_shape->data[1] + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1]); gpu_param.global_size[2] = 1; - axisSize = input_shape->data[axis]; + axisSize = (int32_t)(input_shape->data[axis]); { gpu_dp_inst_t uniS8AddAll_16x1 = {{ @@ -260,9 +260,9 @@ static vsi_nn_kernel_node_t _setup axis = vsi_nn_kernel_param_get_int32(params, "axis"); - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num ) - || !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + || !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) || axis > 2) { diff --git a/src/tim/vx/internal/src/kernel/evis/reducemax_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/reducemax_internal_evis.c index 604bacc..39b9649 100644 --- a/src/tim/vx/internal/src/kernel/evis/reducemax_internal_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/reducemax_internal_evis.c @@ -150,8 +150,8 @@ DEF_KERNEL_INITIALIZER(_reducemax_internal_initializer) int32_t axis = 0; vsi_nn_kernel_tensor_attr_t *input_attr = NULL; vsi_nn_kernel_tensor_attr_t *output_attr = NULL; - vsi_int_array_t * input_shape = NULL; - vsi_int_array_t * output_shape = NULL; + vsi_size_array_t * input_shape = NULL; + vsi_size_array_t * output_shape = NULL; int32_t input_fl = 0, output_fl = 0; int32_t axisSize = 0; float inputScale = 1.0f; @@ -191,7 +191,7 @@ DEF_KERNEL_INITIALIZER(_reducemax_internal_initializer) (output_shape->data[1] + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1]); gpu_param.global_size[2] = 1; - axisSize = input_shape->data[axis]; + axisSize = (int32_t)(input_shape->data[axis]); { gpu_dp_inst_t uniPackMaxData_2x8 = {{ @@ -391,9 +391,9 @@ static vsi_nn_kernel_node_t _setup axis = vsi_nn_kernel_param_get_int32(params, "axis"); - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num ) - || !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + || !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) || axis > 2) { diff --git a/src/tim/vx/internal/src/kernel/evis/reducemin_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/reducemin_internal_evis.c index e5eedc4..7ec74d5 100644 --- a/src/tim/vx/internal/src/kernel/evis/reducemin_internal_evis.c +++ 
b/src/tim/vx/internal/src/kernel/evis/reducemin_internal_evis.c @@ -152,8 +152,8 @@ DEF_KERNEL_INITIALIZER(_reducemin_internal_initializer) int32_t axis = 0; vsi_nn_kernel_tensor_attr_t *input_attr = NULL; vsi_nn_kernel_tensor_attr_t *output_attr = NULL; - vsi_int_array_t * input_shape = NULL; - vsi_int_array_t * output_shape = NULL; + vsi_size_array_t * input_shape = NULL; + vsi_size_array_t * output_shape = NULL; int32_t input_fl = 0, output_fl = 0; int32_t axisSize = 0; float inputScale = 1.0f; @@ -193,7 +193,7 @@ DEF_KERNEL_INITIALIZER(_reducemin_internal_initializer) (output_shape->data[1] + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1]); gpu_param.global_size[2] = 1; - axisSize = input_shape->data[axis]; + axisSize = (uint32_t)(input_shape->data[axis]); { gpu_dp_inst_t uniPackMaxData_2x8 = {{ @@ -394,9 +394,9 @@ static vsi_nn_kernel_node_t _setup axis = vsi_nn_kernel_param_get_int32(params, "axis"); - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num ) - || !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + || !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) || axis > 2) { diff --git a/src/tim/vx/internal/src/kernel/evis/reduceprod_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/reduceprod_internal_evis.c index d665ac7..bbdf29e 100644 --- a/src/tim/vx/internal/src/kernel/evis/reduceprod_internal_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/reduceprod_internal_evis.c @@ -156,8 +156,8 @@ DEF_KERNEL_INITIALIZER(_reduceprod_internal_initializer) int32_t axis = 0; vsi_nn_kernel_tensor_attr_t *input_attr = NULL; vsi_nn_kernel_tensor_attr_t *output_attr = NULL; - vsi_int_array_t * input_shape = NULL; - vsi_int_array_t * output_shape = NULL; + vsi_size_array_t * input_shape = NULL; + vsi_size_array_t * output_shape = NULL; vsi_nn_kernel_dtype_e src_dtype = F16; vsi_nn_kernel_dtype_e dst_dtype = F16; int32_t input_fl = 0, output_fl = 0; @@ -201,7 +201,7 @@ DEF_KERNEL_INITIALIZER(_reduceprod_internal_initializer) (output_shape->data[1] + gpu_param.global_scale[1] - 1) / gpu_param.global_scale[1]); gpu_param.global_size[2] = 1; - axisSize = input_shape->data[axis]; + axisSize = (int32_t)(input_shape->data[axis]); { gpu_dp_inst_t uniGetLoData_4x4 = {{ @@ -476,9 +476,9 @@ static vsi_nn_kernel_node_t _setup axis = vsi_nn_kernel_param_get_int32(params, "axis"); - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num ) - || !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + || !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) || axis > 2) { diff --git a/src/tim/vx/internal/src/kernel/evis/relu_keras_evis.c b/src/tim/vx/internal/src/kernel/evis/relu_keras_evis.c index e454e80..d7cb58d 100644 --- a/src/tim/vx/internal/src/kernel/evis/relu_keras_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/relu_keras_evis.c @@ -127,7 +127,7 @@ DEF_KERNEL_INITIALIZER(_relu_keras_initializer) }; vsi_nn_kernel_tensor_attr_t * output_attr = NULL; vsi_nn_kernel_tensor_attr_t * input_attr = NULL; - vsi_int_array_t * out_shape = NULL; + vsi_size_array_t * out_shape = NULL; vsi_nn_kernel_dtype_e input_dtype = F16; vsi_nn_kernel_dtype_e output_dtype = F16; float alpha = 0.0f; @@ -437,7 +437,7 @@ static vsi_nn_kernel_node_t _setup float max_value = vsi_nn_kernel_param_get_float32( params, "max_value" ); float 
threshold = vsi_nn_kernel_param_get_float32( params, "threshold" ); - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num ) ) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/evis/repeat_evis.c b/src/tim/vx/internal/src/kernel/evis/repeat_evis.c index 421caca..35d2b63 100644 --- a/src/tim/vx/internal/src/kernel/evis/repeat_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/repeat_evis.c @@ -151,7 +151,7 @@ DEF_KERNEL_INITIALIZER(_preprocess_initializer) attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); - width = attr[0]->shape->data[0]; + width = (int32_t)(attr[0]->shape->data[0]); shaderParam.global_scale[0] = 16; shaderParam.global_scale[1] = 1; @@ -208,7 +208,7 @@ DEF_KERNEL_INITIALIZER(_repeat_initializer) {0, 0, 0}}; // globalWorkSize: image size in thread vsi_nn_kernel_tensor_attr_t* attr[1] = {NULL}; - vsi_int_array_t * input_shape = NULL; + vsi_size_array_t * input_shape = NULL; int32_t height = 0, width = 0, chn = 0; int32_t is1d = 0; int32_t axis = 0; @@ -220,13 +220,13 @@ DEF_KERNEL_INITIALIZER(_repeat_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); input_shape = attr[0]->shape; - width = input_shape->data[0]; - height = input_shape->data[1]; + width = (int32_t)(input_shape->data[0]); + height = (int32_t)(input_shape->data[1]); if (height == 1 && input_shape->size == 2) { is1d = 1; } - chn = input_shape->size > 2 ? input_shape->data[2] : 1; + chn = (int32_t)(input_shape->size > 2 ? input_shape->data[2] : 1); if ((axis == 0 && is1d == 0) || axis == 2) { @@ -383,9 +383,9 @@ static int32_t _optimize_repeat_shape vsi_nn_tensor_t ** inputs, vsi_nn_tensor_t ** outputs, int32_t* axis, - int32_t* opt_shape_in, - int32_t* opt_shape_out, - int32_t* new_rank + vsi_size_t* opt_shape_in, + vsi_size_t* opt_shape_out, + vsi_size_t* new_rank ) { vsi_status status = VSI_SUCCESS; @@ -401,7 +401,7 @@ static int32_t _optimize_repeat_shape } else if (axis[0] == 3) { - vsi_nn_kernel_optimize_element_shape( (int32_t*)inputs[0]->attr.size, 3, opt_shape_in, new_rank ); + vsi_nn_kernel_optimize_element_shape( inputs[0]->attr.size, 3, opt_shape_in, new_rank ); if (opt_shape_in[1] == 1) { opt_shape_in[1] = inputs[0]->attr.size[3]; @@ -450,13 +450,13 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_t * kernel_preprocess = NULL; vsi_nn_tensor_t * tensor_preprocess = NULL; vsi_nn_kernel_tensor_t rs_input = NULL, rs_input1 = NULL, rs_output = NULL; - int32_t new_shape[2][VSI_NN_MAX_DIM_NUM] = {{ 1, 1, 1, 1 }, { 1, 1, 1, 1 }}; - int32_t new_rank[2] = {0, 0}; + vsi_size_t new_shape[2][VSI_NN_MAX_DIM_NUM] = {{ 1, 1, 1, 1 }, { 1, 1, 1, 1 }}; + vsi_size_t new_rank[2] = {0, 0}; int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); // Check if gpu can support the size if ( !vsi_nn_kernel_gpu_check_shape( - (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) + outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; } diff --git a/src/tim/vx/internal/src/kernel/evis/resize_1d_bilinear_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_1d_bilinear_evis.c index f1166dd..f893fea 100644 --- a/src/tim/vx/internal/src/kernel/evis/resize_1d_bilinear_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/resize_1d_bilinear_evis.c @@ -190,8 +190,8 @@ DEF_KERNEL_INITIALIZER(_resize_1d_bilinear_initializer) }; vsi_nn_kernel_tensor_attr_t * output_attr = NULL; 
vsi_nn_kernel_tensor_attr_t * input_attr = NULL; - vsi_int_array_t * out_shape = NULL; - vsi_int_array_t * in_shape = NULL; + vsi_size_array_t * out_shape = NULL; + vsi_size_array_t * in_shape = NULL; vsi_nn_kernel_dtype_e input_dtype = F16; vsi_nn_kernel_dtype_e output_dtype = F16; int32_t align_corners = 0; @@ -237,10 +237,10 @@ DEF_KERNEL_INITIALIZER(_resize_1d_bilinear_initializer) input_dtype = input_attr->dtype; output_dtype = output_attr->dtype; - in_width = in_shape->data[0]; - depth = in_shape->data[2]; - out_width = out_shape->data[0]; - out_height = out_shape->data[1]; + in_width = (uint32_t)(in_shape->data[0]); + depth = (uint32_t)(in_shape->data[2]); + out_width = (uint32_t)(out_shape->data[0]); + out_height = (uint32_t)(out_shape->data[1]); if (align_corners && out_width > 1) { @@ -1185,13 +1185,13 @@ static vsi_nn_tensor_t* _create_scale_tensor vsi_nn_tensor_attr_t attr; vsi_nn_tensor_t* scale = NULL; uint32_t dims = output->attr.dim_num; - uint32_t batch = dims > 3 ? output->attr.size[3] : 1; - uint32_t width = output->attr.size[0]; - uint32_t sizes[4] = {width * 2, 1, 1, batch}; - uint32_t item_count = width * 2 * batch; - uint32_t input_width = input->attr.size[0]; - uint32_t x = 0; - uint32_t b = 0; + vsi_size_t batch = dims > 3 ? output->attr.size[3] : 1; + vsi_size_t width = output->attr.size[0]; + vsi_size_t sizes[4] = {width * 2, 1, 1, batch}; + vsi_size_t item_count = width * 2 * batch; + vsi_size_t input_width = input->attr.size[0]; + vsi_size_t x = 0; + vsi_size_t b = 0; float width_scale = 1.0f; uint16_t *scale_data_ptr = NULL; @@ -1217,7 +1217,7 @@ static vsi_nn_tensor_t* _create_scale_tensor { float input_w = 0.0f; int32_t w0 = 0; - uint32_t idx = b * width * 2 + x * 2; + size_t idx = b * width * 2 + x * 2; float tl = 0.0f; float tr = 0.0f; if (half_pixel_centers) diff --git a/src/tim/vx/internal/src/kernel/evis/resize_1d_nearest_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_1d_nearest_evis.c index 9fc49ee..7a3eeed 100644 --- a/src/tim/vx/internal/src/kernel/evis/resize_1d_nearest_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/resize_1d_nearest_evis.c @@ -124,8 +124,8 @@ DEF_KERNEL_INITIALIZER(_resize_1d_nearest_initializer) }; vsi_nn_kernel_tensor_attr_t * output_attr = NULL; vsi_nn_kernel_tensor_attr_t * input_attr = NULL; - vsi_int_array_t * out_shape = NULL; - vsi_int_array_t * in_shape = NULL; + vsi_size_array_t * out_shape = NULL; + vsi_size_array_t * in_shape = NULL; vsi_nn_kernel_dtype_e input_dtype = F16; vsi_nn_kernel_dtype_e output_dtype = F16; int32_t align_corners = 0; @@ -158,10 +158,10 @@ DEF_KERNEL_INITIALIZER(_resize_1d_nearest_initializer) input_dtype = input_attr->dtype; output_dtype = output_attr->dtype; - in_width = in_shape->data[0]; - depth = in_shape->data[2]; - out_width = out_shape->data[0]; - out_height = out_shape->data[1]; + in_width = (uint32_t)(in_shape->data[0]); + depth = (uint32_t)(in_shape->data[2]); + out_width = (uint32_t)(out_shape->data[0]); + out_height = (uint32_t)(out_shape->data[1]); if (BF16 == input_dtype && output_dtype == BF16) { @@ -427,8 +427,8 @@ static vsi_status _query_kernel uint32_t key = 0; uint32_t i = 0; - uint32_t inputWidth = inputs[0]->attr.size[0]; - uint32_t outputWidth = outputs[0]->attr.size[0]; + vsi_size_t inputWidth = inputs[0]->attr.size[0]; + vsi_size_t outputWidth = outputs[0]->attr.size[0]; float scale_factor; _internal_nearest_e resize_mode = LARGE; diff --git a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c 
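/*
 * The EVIS initializers above all read tensor extents from vsi_size_array_t,
 * whose elements are vsi_size_t (wider than 32 bits when 40-bit virtual
 * addressing is enabled), so every value handed to a 32-bit GPU parameter is
 * narrowed with an explicit cast. A minimal sketch of that idiom, assuming a
 * hypothetical helper named example_read_extents() and a rank-3 input:
 */
static void example_read_extents
    (
    const vsi_nn_kernel_tensor_attr_t * attr,
    uint32_t * out_width,
    uint32_t * out_height,
    uint32_t * out_depth
    )
{
    vsi_size_array_t * shape = attr->shape;
    /* shape->data[] holds vsi_size_t values; narrow explicitly, as the patch does. */
    *out_width  = (uint32_t)shape->data[0];
    *out_height = (uint32_t)shape->data[1];
    *out_depth  = (uint32_t)(shape->size > 2 ? shape->data[2] : 1);
}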
index 194fb3b..a4e4fa9 100644 --- a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c @@ -199,8 +199,8 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) }; vsi_nn_kernel_tensor_attr_t * output_attr = NULL; vsi_nn_kernel_tensor_attr_t * input_attr = NULL; - vsi_int_array_t * out_shape = NULL; - vsi_int_array_t * in_shape = NULL; + vsi_size_array_t * out_shape = NULL; + vsi_size_array_t * in_shape = NULL; vsi_nn_kernel_dtype_e input_dtype = F16; vsi_nn_kernel_dtype_e output_dtype = F16; int32_t align_corners; @@ -240,11 +240,11 @@ DEF_KERNEL_INITIALIZER(_resize_bilinear_initializer) input_dtype = input_attr->dtype; output_dtype = output_attr->dtype; - in_width = in_shape->data[0]; - in_height = in_shape->data[1]; - depth = in_shape->data[2]; - out_width = out_shape->data[0]; - out_height = out_shape->data[1]; + in_width = (uint32_t)(in_shape->data[0]); + in_height = (uint32_t)(in_shape->data[1]); + depth = (uint32_t)(in_shape->data[2]); + out_width = (uint32_t)(out_shape->data[0]); + out_height = (uint32_t)(out_shape->data[1]); if (align_corners && out_width > 1) { @@ -1138,16 +1138,16 @@ static vsi_nn_tensor_t* _create_scale_tensor vsi_nn_tensor_attr_t attr; vsi_nn_tensor_t* scale = NULL; uint32_t dims = output->attr.dim_num; - uint32_t width = output->attr.size[0]; - uint32_t height = output->attr.size[1]; - uint32_t batch = dims > 3 ? output->attr.size[3] : 1; - uint32_t sizes[4] = {width * 4, height, 1, batch}; - uint32_t item_count = width * 4 * height * batch; - uint32_t input_width = input->attr.size[0]; - uint32_t input_height = input->attr.size[1]; - uint32_t x = 0; - uint32_t y = 0; - uint32_t b = 0; + vsi_size_t width = output->attr.size[0]; + vsi_size_t height = output->attr.size[1]; + vsi_size_t batch = dims > 3 ? 
output->attr.size[3] : 1; + vsi_size_t sizes[4] = {width * 4, height, 1, batch}; + vsi_size_t item_count = width * 4 * height * batch; + vsi_size_t input_width = input->attr.size[0]; + vsi_size_t input_height = input->attr.size[1]; + vsi_size_t x = 0; + vsi_size_t y = 0; + vsi_size_t b = 0; float width_scale = 1.0f; float height_scale = 1.0f; uint16_t *scale_data_ptr = NULL; @@ -1198,7 +1198,7 @@ static vsi_nn_tensor_t* _create_scale_tensor { float input_w = 0.0f; int32_t w0 = 0; - uint32_t idx = b * width * 4 * height + y * width * 4 + x * 4; + vsi_size_t idx = b * width * 4 * height + y * width * 4 + x * 4; float tl = 0.0f; float tr = 0.0f; float bl = 0.0f; diff --git a/src/tim/vx/internal/src/kernel/evis/resize_nearest_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_nearest_evis.c index 9b485a0..1b6d094 100644 --- a/src/tim/vx/internal/src/kernel/evis/resize_nearest_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/resize_nearest_evis.c @@ -124,8 +124,8 @@ DEF_KERNEL_INITIALIZER(_resize_nearest_initializer) }; vsi_nn_kernel_tensor_attr_t * output_attr = NULL; vsi_nn_kernel_tensor_attr_t * input_attr = NULL; - vsi_int_array_t * out_shape = NULL; - vsi_int_array_t * in_shape = NULL; + vsi_size_array_t * out_shape = NULL; + vsi_size_array_t * in_shape = NULL; vsi_nn_kernel_dtype_e input_dtype = F16; vsi_nn_kernel_dtype_e output_dtype = F16; int32_t align_corners; @@ -159,11 +159,11 @@ DEF_KERNEL_INITIALIZER(_resize_nearest_initializer) input_dtype = input_attr->dtype; output_dtype = output_attr->dtype; - in_width = in_shape->data[0]; - in_height = in_shape->data[1]; - depth = in_shape->data[2]; - out_width = out_shape->data[0]; - out_height = out_shape->data[1]; + in_width = (uint32_t)(in_shape->data[0]); + in_height = (uint32_t)(in_shape->data[1]); + depth = (uint32_t)(in_shape->data[2]); + out_width = (uint32_t)(out_shape->data[0]); + out_height = (uint32_t)(out_shape->data[1]); if (BF16 == input_dtype && output_dtype == BF16) { @@ -436,8 +436,8 @@ static vsi_status _query_kernel vx_kernel_initialize_f initializer = _resize_nearest_initializer; uint32_t key; uint32_t i; - uint32_t inputWidth = inputs[0]->attr.size[0]; - uint32_t outputWidth = outputs[0]->attr.size[0]; + vsi_size_t inputWidth = inputs[0]->attr.size[0]; + vsi_size_t outputWidth = outputs[0]->attr.size[0]; float scale_factor; _internal_nearest_e resize_mode = LARGE; diff --git a/src/tim/vx/internal/src/kernel/evis/scatter_nd_evis.c b/src/tim/vx/internal/src/kernel/evis/scatter_nd_evis.c index 287fc73..fe8a9d7 100644 --- a/src/tim/vx/internal/src/kernel/evis/scatter_nd_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/scatter_nd_evis.c @@ -98,20 +98,20 @@ static vx_param_description_t _scatter_nd_kernel_param_def[] = static vsi_status get_scatter_nd_tensor_reshape_size ( vsi_nn_tensor_t ** inputs, - int32_t sizes[VSI_NN_MAX_DIM_NUM], + vsi_size_t sizes[VSI_NN_MAX_DIM_NUM], uint32_t block_size, uint32_t coordDim, - uint32_t* width, - uint32_t* area, + vsi_size_t* width, + vsi_size_t* area, int32_t* newDim, int32_t* isBig ) { vsi_status status = VSI_FAILURE; uint32_t dims_num = inputs[0]->attr.dim_num; - uint32_t *input_size = inputs[0]->attr.size; + vsi_size_t *input_size = inputs[0]->attr.size; uint32_t i = 0; - uint32_t elementCnt = 1; + vsi_size_t elementCnt = 1; if(coordDim != 0 && (width == NULL || area == NULL)) { @@ -202,10 +202,10 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_initializer) status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &coord_dim); CHECK_STATUS_FAIL_GOTO(status, OnError ); - 
block_size = attr[2]->shape->data[0]; - height = attr[2]->shape->data[1]; - index_num = attr[0]->shape->data[1]; - output_zp = attr[2]->asymm.zero_point; + block_size = (int32_t)(attr[2]->shape->data[0]); + height = (int32_t)(attr[2]->shape->data[1]); + index_num = (int32_t)(attr[0]->shape->data[1]); + output_zp = (int32_t)(attr[2]->asymm.zero_point); if(coord_dim == 3) { @@ -318,10 +318,10 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_big_initializer) status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &coord_dim); CHECK_STATUS_FAIL_GOTO(status, OnError ); - block_size = attr[2]->shape->data[0]; - height = attr[2]->shape->data[1]; - index_num = attr[0]->shape->data[1]; - output_zp = attr[2]->asymm.zero_point; + block_size = (int32_t)(attr[2]->shape->data[0]); + height = (int32_t)(attr[2]->shape->data[1]); + index_num = (int32_t)(attr[0]->shape->data[1]); + output_zp = (int32_t)(attr[2]->asymm.zero_point); if(coord_dim == 3) { @@ -465,11 +465,11 @@ static vsi_nn_kernel_node_t _setup vsi_status status = VSI_FAILURE; vsi_nn_kernel_node_param_t tmp_params[_SCATTER_ND_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_t node = NULL; - int32_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}}; + vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}}; int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); int32_t coord_dim = vsi_nn_kernel_param_get_int32( params, "coord_dim" ); int32_t rs_in_dim = 0, rs_idx_dim = 0, rs_out_dim = 0; - uint32_t width = 0, area = 0; + vsi_size_t width = 0, area = 0; int32_t big_flg = 0; status = get_scatter_nd_tensor_reshape_size(&inputs[0], shapes[0], coord_dim, 0, diff --git a/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c b/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c index 2cb231c..e4a497a 100644 --- a/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/scatter_nd_update_evis.c @@ -185,21 +185,21 @@ static vx_param_description_t _scatter_nd_update_post_kernel_param_def[] = static vsi_status get_scatter_nd_update_tensor_reshape_size ( vsi_nn_tensor_t ** inputs, - int32_t sizes[VSI_NN_MAX_DIM_NUM], + vsi_size_t sizes[VSI_NN_MAX_DIM_NUM], uint32_t block_size, uint32_t coordDim, - uint32_t* width, - uint32_t* area, - uint32_t* vol, + vsi_size_t* width, + vsi_size_t* area, + vsi_size_t* vol, int32_t* newDim, int32_t* isBig ) { vsi_status status = VSI_FAILURE; uint32_t dims_num = inputs[0]->attr.dim_num; - uint32_t *input_size = inputs[0]->attr.size; + vsi_size_t *input_size = inputs[0]->attr.size; uint32_t i = 0; - uint32_t elementCnt = 1; + vsi_size_t elementCnt = 1; if (coordDim != 0 && (width == NULL || area == NULL)) { @@ -311,9 +311,9 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_initializer) status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &coord_dim); CHECK_STATUS_FAIL_GOTO(status, OnError ); - block_size = attr[3]->shape->data[0]; - height = attr[3]->shape->data[1]; - index_num = attr[1]->shape->data[1]; + block_size = (int32_t)(attr[3]->shape->data[0]); + height = (int32_t)(attr[3]->shape->data[1]); + index_num = (int32_t)(attr[1]->shape->data[1]); if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) { @@ -602,9 +602,9 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_big_initializer) status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &coord_dim); CHECK_STATUS_FAIL_GOTO(status, OnError ); - block_size = attr[3]->shape->data[0]; - height = attr[3]->shape->data[1]; - index_num = attr[1]->shape->data[1]; + block_size = 
(int32_t)(attr[3]->shape->data[0]); + height = (int32_t)(attr[3]->shape->data[1]); + index_num = (int32_t)(attr[1]->shape->data[1]); if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) { @@ -852,9 +852,9 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_pre_initializer) status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &coord_dim); CHECK_STATUS_FAIL_GOTO(status, OnError ); - block_size = attr[2]->shape->data[0]; - update_width = attr[1]->shape->data[0]; - index_num = attr[0]->shape->data[1]; + block_size = (int32_t)(attr[2]->shape->data[0]); + update_width = (int32_t)(attr[1]->shape->data[0]); + index_num = (int32_t)(attr[0]->shape->data[1]); if (attr[1]->quant == VSI_NN_KERNEL_QUANT_ASYMM) { @@ -1009,8 +1009,8 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_post_initializer) status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[9], &coord_dim); CHECK_STATUS_FAIL_GOTO(status, OnError ); - block_size = attr[2]->shape->data[0]; - height = attr[2]->shape->data[1]; + block_size = (int32_t)(attr[2]->shape->data[0]); + height = (int32_t)(attr[2]->shape->data[1]); if (attr[0]->quant == VSI_NN_KERNEL_QUANT_ASYMM) { @@ -1213,10 +1213,10 @@ DEF_KERNEL_INITIALIZER(_scatter_nd_update_reset_initializer) attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", OnError ); - block_size = attr[0]->shape->data[0]; - height = attr[0]->shape->data[1]; - width = block_size * height; - count_width = (height + 3) / 4; + block_size = (int32_t)(attr[0]->shape->data[0]); + height = (int32_t)(attr[0]->shape->data[1]); + width = (int32_t)(block_size * height); + count_width = (int32_t)((height + 3) / 4); gpu_param.global_scale[0] = 1; gpu_param.global_scale[1] = 1; @@ -1416,11 +1416,11 @@ static vsi_nn_kernel_node_t _setup vsi_status status = VSI_FAILURE; vsi_nn_kernel_node_param_t tmp_params[_SCATTER_ND_UPDATE_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_t node = NULL; - int32_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}}; + vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{0}}; int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); int32_t coord_dim = vsi_nn_kernel_param_get_int32( params, "coord_dim" ); int32_t rs_in_dim = 0, rs_idx_dim = 0, rs_out_dim = 0; - uint32_t width = 0, area = 0, vol = 0; + vsi_size_t width = 0, area = 0, vol = 0; int32_t big_flg = 0; vsi_nn_kernel_dtype_e update_dtype = vsi_nn_kernel_map_dtype(inputs[2]->attr.dtype.vx_type); int32_t i = 0; diff --git a/src/tim/vx/internal/src/kernel/evis/select_evis.c b/src/tim/vx/internal/src/kernel/evis/select_evis.c index e0975d3..f5571ab 100644 --- a/src/tim/vx/internal/src/kernel/evis/select_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/select_evis.c @@ -122,7 +122,7 @@ DEF_KERNEL_INITIALIZER(_select_initializer) vsi_nn_kernel_tensor_attr_t *input0_attr = NULL; vsi_nn_kernel_tensor_attr_t *input1_attr = NULL; vsi_nn_kernel_tensor_attr_t *output_attr = NULL; - vsi_int_array_t *output_shape = NULL; + vsi_size_array_t *output_shape = NULL; int32_t input0_fl = 0, input1_fl = 0, output_fl = 0; float input0Scale = 1.0f; int32_t input0Zp = 0; @@ -474,7 +474,7 @@ static vsi_nn_kernel_node_t _setup vsi_bool image_2d = FALSE; vsi_nn_kernel_node_t node = NULL; - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/evis/sequence_mask_evis.c 
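/*
 * With vsi_nn_tensor_attr_t::size now an array of vsi_size_t, the (int32_t*)
 * casts in front of vsi_nn_kernel_gpu_check_shape() are dropped at every call
 * site. A minimal sketch of the updated shape guard, assuming a hypothetical
 * helper named example_shape_supported() around the usual inputs/outputs
 * arrays:
 */
static vsi_bool example_shape_supported
    (
    vsi_nn_tensor_t ** inputs,
    vsi_nn_tensor_t ** outputs
    )
{
    /* Both the shape pointer and the rank are vsi_size_t based; no cast needed. */
    if ( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size,
                inputs[0]->attr.dim_num )
      || !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size,
                outputs[0]->attr.dim_num ) )
    {
        return FALSE;
    }
    return TRUE;
}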
b/src/tim/vx/internal/src/kernel/evis/sequence_mask_evis.c index 07c7266..5d7e2d6 100644 --- a/src/tim/vx/internal/src/kernel/evis/sequence_mask_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/sequence_mask_evis.c @@ -116,7 +116,7 @@ DEF_KERNEL_INITIALIZER(_sequence_mask_initializer) }; vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL, NULL }; - vsi_int_array_t * out_shape = NULL; + vsi_size_array_t * out_shape = NULL; float scaleIn = 1.0f; float scaleOut = 1.0f; float outputVal1 = 1.0f; @@ -291,14 +291,14 @@ static int32_t _optimize_mask_shape vsi_nn_tensor_t ** inputs, vsi_nn_tensor_t ** outputs, int32_t max_len, - int32_t* opt_shape_in, - int32_t* opt_shape_out, + vsi_size_t* opt_shape_in, + vsi_size_t* opt_shape_out, int32_t* is2Dflg ) { vsi_status status = VSI_SUCCESS; - int32_t in_shape[VSI_NN_MAX_DIM_NUM] = {0}; - int32_t new_rank = 0; + vsi_size_t in_shape[VSI_NN_MAX_DIM_NUM] = {0}; + vsi_size_t new_rank = 0; uint32_t i = 0; for(i = 0; i < inputs[0]->attr.dim_num; i++) @@ -340,11 +340,11 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_param_t tmp_params[_EVIS_PARAM_NUM] = {NULL}; vsi_nn_kernel_node_t node = NULL; vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL; - int32_t new_shape[2][VSI_NN_MAX_DIM_NUM] = {{ 1, 1, 1, 1 }, { 1, 1, 1, 1 }}; + vsi_size_t new_shape[2][VSI_NN_MAX_DIM_NUM] = {{ 1, 1, 1, 1 }, { 1, 1, 1, 1 }}; int32_t max_len = vsi_nn_kernel_param_get_int32( params, "max_len" ); int32_t is2Dflg = 0; - if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/evis/signal_frame_evis.c b/src/tim/vx/internal/src/kernel/evis/signal_frame_evis.c index 0fdf6a8..bcfe0d0 100644 --- a/src/tim/vx/internal/src/kernel/evis/signal_frame_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/signal_frame_evis.c @@ -93,7 +93,7 @@ DEF_KERNEL_INITIALIZER(_signal_frame_initializer) {0, 0, 0} }; vsi_nn_kernel_tensor_attr_t * attr = NULL; - vsi_int_array_t * out_shape = NULL; + vsi_size_array_t * out_shape = NULL; attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); out_shape = attr->shape; @@ -187,14 +187,14 @@ static vsi_nn_kernel_node_t _setup int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); int32_t pad_end = vsi_nn_kernel_param_get_int32( params, "pad_end" ); float pad_value = vsi_nn_kernel_param_get_float32( params, "pad_val" ); - int32_t num_frames = outputs[0]->attr.size[axis + 1]; + vsi_size_t num_frames = outputs[0]->attr.size[axis + 1]; int32_t rank = inputs[0]->attr.dim_num; - int32_t inner = 1; - int32_t outer = 1; - int32_t length_samples = inputs[0]->attr.size[axis]; + vsi_size_t inner = 1; + vsi_size_t outer = 1; + vsi_size_t length_samples = inputs[0]->attr.size[axis]; int32_t i = 0; vsi_nn_tensor_t* rs_tensors[2] = { NULL }; - int32_t shape[2][VSI_NN_MAX_DIM_NUM] = {{ 0 }}; + vsi_size_t shape[2][VSI_NN_MAX_DIM_NUM] = {{ 0 }}; for (i = 0; i < axis; i++) { @@ -217,11 +217,11 @@ static vsi_nn_kernel_node_t _setup shape[1][3] = outer; rs_tensors[0] = vsi_nn_reshape_tensor( graph, - inputs[0], (uint32_t*)shape[0], 4 ); + inputs[0], shape[0], 4 ); rs_tensors[1] = vsi_nn_reshape_tensor( graph, - outputs[0], (uint32_t*)shape[1], 4 ); + outputs[0], shape[1], 4 ); - if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)rs_tensors[1]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( rs_tensors[1]->attr.size, rs_tensors[1]->attr.dim_num ) ) { return NULL; diff --git 
a/src/tim/vx/internal/src/kernel/evis/slice_evis.c b/src/tim/vx/internal/src/kernel/evis/slice_evis.c index 69bfb4b..b9f570b 100644 --- a/src/tim/vx/internal/src/kernel/evis/slice_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/slice_evis.c @@ -139,7 +139,7 @@ DEF_KERNEL_INITIALIZER(_slice_initializer) }; vsi_nn_kernel_tensor_attr_t * output_attr = NULL; vsi_nn_kernel_tensor_attr_t * input_attr = NULL; - vsi_int_array_t * out_shape = NULL; + vsi_size_array_t * out_shape = NULL; vsi_nn_kernel_dtype_e input_dtype = F16; vsi_nn_kernel_dtype_e output_dtype = F16; float scaleIn = 1.0f; @@ -393,29 +393,29 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_node_t node = NULL; vsi_bool image_2d = FALSE; uint32_t rank[_IO_NUM] = {0}; - int32_t shapes[_IO_NUM][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; + vsi_size_t shapes[_IO_NUM][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; vsi_nn_tensor_t* reshape_tensors[_IO_NUM] = { NULL }; int32_t i = 0; - int32_t input_batch = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1; - int32_t output_batch = outputs[0]->attr.dim_num > 3 ? outputs[0]->attr.size[3] : 1; + vsi_size_t input_batch = inputs[0]->attr.dim_num > 3 ? inputs[0]->attr.size[3] : 1; + vsi_size_t output_batch = outputs[0]->attr.dim_num > 3 ? outputs[0]->attr.size[3] : 1; vsi_bool is_same_quant = FALSE; - vsi_nn_kernel_optimize_1d_tensor_shape( (const int32_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num, + vsi_nn_kernel_optimize_1d_tensor_shape( (const vsi_size_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num, shapes[0], &rank[0]); - vsi_nn_kernel_optimize_1d_tensor_shape( (const int32_t*)inputs[1]->attr.size, inputs[1]->attr.dim_num, + vsi_nn_kernel_optimize_1d_tensor_shape( (const vsi_size_t*)inputs[1]->attr.size, inputs[1]->attr.dim_num, shapes[1], &rank[1]); - vsi_nn_kernel_optimize_1d_tensor_shape( (const int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num, + vsi_nn_kernel_optimize_1d_tensor_shape( (const vsi_size_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num, shapes[2], &rank[2]); for (i = 0; i < _INPUT_NUM; i++) { reshape_tensors[i] = vsi_nn_reshape_tensor( graph, - inputs[i], (uint32_t*)shapes[i], rank[i] ); + inputs[i], shapes[i], rank[i] ); } reshape_tensors[_INPUT_NUM] = vsi_nn_reshape_tensor( graph, - outputs[0], (uint32_t*)shapes[_INPUT_NUM], rank[_INPUT_NUM] ); + outputs[0], shapes[_INPUT_NUM], rank[_INPUT_NUM] ); - if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)reshape_tensors[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[0]->attr.size, reshape_tensors[0]->attr.dim_num ) || input_batch != output_batch ) { goto final; diff --git a/src/tim/vx/internal/src/kernel/evis/space2depth_internal_evis.c b/src/tim/vx/internal/src/kernel/evis/space2depth_internal_evis.c index cc4c5f6..2b79fd8 100644 --- a/src/tim/vx/internal/src/kernel/evis/space2depth_internal_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/space2depth_internal_evis.c @@ -136,9 +136,9 @@ DEF_KERNEL_INITIALIZER(_space2depth_internal_initializer) CHECK_STATUS_FAIL_GOTO(status, OnError ); input_dims = (uint32_t)attr[0]->shape->size; - input_width = attr[0]->shape->data[0]; - input_height = attr[0]->shape->data[1]; - input_depth = input_dims > 2 ? attr[0]->shape->data[2] : 1; + input_width = (int32_t)(attr[0]->shape->data[0]); + input_height = (int32_t)(attr[0]->shape->data[1]); + input_depth = (int32_t)(input_dims > 2 ? 
attr[0]->shape->data[2] : 1); shaderParam.global_scale[0] = 1; shaderParam.global_scale[1] = 1; @@ -323,7 +323,7 @@ static vsi_nn_kernel_node_t _setup int32_t block_size_y = vsi_nn_kernel_param_get_int32( params, "block_size_y" ); int32_t opt_flg = (block_size_x == 2 && block_size_y == 1) ? 1 : 0; - if ( !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/evis/spatial_transformer_evis.c b/src/tim/vx/internal/src/kernel/evis/spatial_transformer_evis.c index 01b6155..5a101e0 100644 --- a/src/tim/vx/internal/src/kernel/evis/spatial_transformer_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/spatial_transformer_evis.c @@ -156,7 +156,7 @@ DEF_KERNEL_INITIALIZER(_get_matrix_initializer) {0, 0, 0} }; vsi_nn_kernel_tensor_attr_t * attr = NULL; - vsi_int_array_t * in_shape = NULL; + vsi_size_array_t * in_shape = NULL; float theta[8] = {0}; float input_scale = 1.0f; float input_tail = 0; @@ -250,7 +250,7 @@ DEF_KERNEL_INITIALIZER(_warp_affine_initializer) {0, 0, 0} }; vsi_nn_kernel_tensor_attr_t * attr[2] = {NULL}; - vsi_int_array_t * out_shape = NULL; + vsi_size_array_t * out_shape = NULL; float input_scale = 1.0f; float input_tail = 0; float output_scale = 1.0f; @@ -512,7 +512,7 @@ static vsi_nn_kernel_node_t _setup // Check if gpu can support the size if( !vsi_nn_kernel_gpu_check_shape( - (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) + outputs[0]->attr.size, outputs[0]->attr.dim_num ) ) { return NULL; } @@ -537,7 +537,7 @@ static vsi_nn_kernel_node_t _setup attr.size[2] = attr.size[1] = 1; attr.dim_num = inputs[0]->attr.dim_num; tensors[1] = vsi_nn_reshape_tensor( graph, - tensors[0], (uint32_t*)attr.size, attr.dim_num ); + tensors[0], attr.size, attr.dim_num ); warp_affine_tensors[0] = inputs[0]; warp_affine_tensors[1] = tensors[1]; diff --git a/src/tim/vx/internal/src/kernel/evis/swish_evis.c b/src/tim/vx/internal/src/kernel/evis/swish_evis.c index 7b1dbb5..7240375 100644 --- a/src/tim/vx/internal/src/kernel/evis/swish_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/swish_evis.c @@ -37,7 +37,7 @@ #include "utils/vsi_nn_dtype_util.h" #include "utils/vsi_nn_math.h" #include "kernel/vsi_nn_kernel.h" -#include "libnnext/vx_lib_nnext.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" __BEGIN_DECLS @@ -174,7 +174,7 @@ DEF_KERNEL_INITIALIZER(_swish_initializer) vx_float32 outputScale = 1.0f; vx_float32 logE = (vx_float32)(log10(exp(1.0f)) / log10(2.0f)); vsi_nn_kernel_tensor_attr_t *input_attr = NULL, *output_attr = NULL; - vsi_int_array_t *out_shape = NULL; + vsi_size_array_t *out_shape = NULL; uint32_t pack_key = 0; input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input); @@ -384,7 +384,7 @@ DEF_KERNEL_INITIALIZER(_hswish_initializer) vx_float32 outputZP = 0; vx_float32 outputScale = 1.0f; vsi_nn_kernel_tensor_attr_t *input_attr = NULL, *output_attr = NULL; - vsi_int_array_t *out_shape = NULL; + vsi_size_array_t *out_shape = NULL; uint32_t pack_key = 0; input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)input); @@ -642,22 +642,30 @@ static vsi_nn_kernel_node_t _setup { vsi_status status = VSI_FAILURE; vsi_nn_kernel_node_param_t node_params[_SWISH_PARAM_NUM] = {NULL}; - int32_t shapes[2][VSI_NN_MAX_DIM_NUM] = {{0}}; - uint32_t new_rank = 0; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] ={0}; + vsi_size_t new_rank = 0; vsi_bool image_2d = FALSE; vsi_nn_kernel_node_t node = NULL; 
int32_t swish_type = vsi_nn_kernel_param_get_int32( params, "type" ); float beta = 1.0f; + vsi_bool ret = FALSE; #if (VX_ACTIVATION_EXT_SUPPORT) if (VSI_NN_HW_EVIS_2 == graph->ctx->config.evis.ver) { return NULL; } #endif - vsi_nn_OptimizedEltOPShape(inputs[0], (uint32_t *)(shapes[0]), &new_rank); - vsi_nn_OptimizedEltOPShape(outputs[0], (uint32_t *)(shapes[1]), &new_rank); + ret = vsi_nn_kernel_optimize_element_shape( + inputs[0]->attr.size, inputs[0]->attr.dim_num, + shape, &new_rank ); - if( !vsi_nn_kernel_gpu_check_shape( shapes[0], new_rank ) ) + if( ret ) + { + node_params[0] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shape, new_rank ); + node_params[1] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shape, new_rank ); + } + + if( !vsi_nn_kernel_gpu_check_shape( shape, new_rank ) ) { return NULL; } @@ -679,18 +687,24 @@ static vsi_nn_kernel_node_t _setup node = vsi_nn_kernel_create_node( graph, kernel ); if( node ) { - node_params[0] = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[0], new_rank ); - node_params[1] = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[1], new_rank ); node_params[2] = vsi_nn_kernel_scalar_create( graph, F32, &beta ); /* Pass parameters to node. */ status = vsi_nn_kernel_node_pass_param( node, node_params, _SWISH_PARAM_NUM ); VSI_ASSERT( status == VSI_SUCCESS ); - vsi_nn_kernel_tensor_release( &node_params[0] ); - vsi_nn_kernel_tensor_release( &node_params[1] ); - vsi_nn_kernel_scalar_release( &node_params[2] ); - } } + if(node_params[0]) + { + vsi_nn_kernel_tensor_release( &node_params[0] ); + } + if(node_params[1]) + { + vsi_nn_kernel_tensor_release( &node_params[1] ); + } + if(node_params[2]) + { + vsi_nn_kernel_scalar_release( &node_params[2] ); + } return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/evis/tensorstackconcat_evis.c b/src/tim/vx/internal/src/kernel/evis/tensorstackconcat_evis.c index cb97d0b..1585452 100644 --- a/src/tim/vx/internal/src/kernel/evis/tensorstackconcat_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/tensorstackconcat_evis.c @@ -120,7 +120,7 @@ DEF_KERNEL_INITIALIZER(_tensorstackconcat_initializer) {0, 0, 0} }; vsi_nn_kernel_tensor_attr_t * input_attr = NULL; - vsi_int_array_t * in_shape = NULL; + vsi_size_array_t * in_shape = NULL; // Add initializer input_attr = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); diff --git a/src/tim/vx/internal/src/kernel/evis/tile_evis.c b/src/tim/vx/internal/src/kernel/evis/tile_evis.c index 5f3465b..b9e46cd 100644 --- a/src/tim/vx/internal/src/kernel/evis/tile_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/tile_evis.c @@ -264,7 +264,7 @@ DEF_KERNEL_INITIALIZER(_tile_initializer) {0, 0, 0} }; vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL, NULL }; - vsi_int_array_t * in_shape = NULL; + vsi_size_array_t * in_shape = NULL; uint32_t pack_key; int32_t lastWorkItem = 0; float scaleIn = 1.0f; @@ -449,9 +449,9 @@ static vsi_status _query_kernel return status; } /* _query_kernel() */ -static vsi_bool _is_supported_axis(int32_t* multiples, uint32_t multiples_num) +static vsi_bool _is_supported_axis(vsi_size_t* multiples, vsi_size_t multiples_num) { - uint32_t i = 0; + vsi_size_t i = 0; if ( multiples_num < 4) { @@ -489,13 +489,13 @@ static vsi_nn_kernel_node_t _setup vsi_bool image_2d = FALSE; vsi_nn_kernel_node_t node = NULL; vx_uint32 remainder = inputs[0]->attr.size[0] % 8; - int32_t shapes[3][VSI_NN_MAX_DIM_NUM] = { { 0 } }; + vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = { { 0 } }; vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; uint32_t i = 
0; - uint32_t new_rank = 0; + vsi_size_t new_rank = 0; vsi_bool ret = FALSE; uint32_t dim = inputs[0]->attr.dim_num; - int32_t multiples[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t multiples[VSI_NN_MAX_DIM_NUM] = { 0 }; for ( i = 0; i < dim; i++) { @@ -503,9 +503,9 @@ static vsi_nn_kernel_node_t _setup } ret = vsi_nn_kernel_optimize_tile_shape( - (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, - (int32_t *)multiples, inputs[0]->attr.dim_num, - (int32_t *)outputs[0]->attr.size, outputs[0]->attr.dim_num, + inputs[0]->attr.size, inputs[0]->attr.dim_num, + multiples, inputs[0]->attr.dim_num, + outputs[0]->attr.size, outputs[0]->attr.dim_num, shapes[0], shapes[1], shapes[2], &new_rank ); if (ret) @@ -516,16 +516,16 @@ static vsi_nn_kernel_node_t _setup } reshape_tensors[0] = vsi_nn_reshape_tensor( graph, - inputs[0], (uint32_t*)shapes[0], new_rank ); + inputs[0], shapes[0], new_rank ); reshape_tensors[1] = vsi_nn_reshape_tensor( graph, - outputs[0], (uint32_t*)shapes[2], new_rank ); + outputs[0], shapes[2], new_rank ); } else { return NULL; } - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)reshape_tensors[1]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( reshape_tensors[1]->attr.size, outputs[0]->attr.dim_num )) { goto final; @@ -540,9 +540,9 @@ static vsi_nn_kernel_node_t _setup if( node ) { /* Pass parameters to node. */ - uint32_t depthIn = new_rank > 2 ? reshape_tensors[0]->attr.size[2] : 1; - uint32_t depthOut = new_rank > 2 ? reshape_tensors[1]->attr.size[2] : 1; - uint32_t batchIn = new_rank > 3 ? reshape_tensors[0]->attr.size[3] : 1; + vsi_size_t depthIn = new_rank > 2 ? reshape_tensors[0]->attr.size[2] : 1; + vsi_size_t depthOut = new_rank > 2 ? reshape_tensors[1]->attr.size[2] : 1; + vsi_size_t batchIn = new_rank > 3 ? reshape_tensors[0]->attr.size[3] : 1; vsi_nn_kernel_node_pack_io( node_params, _EVIS_PARAM_NUM, diff --git a/src/tim/vx/internal/src/kernel/evis/upsample_evis.c b/src/tim/vx/internal/src/kernel/evis/upsample_evis.c index df549fd..0ac1b6d 100644 --- a/src/tim/vx/internal/src/kernel/evis/upsample_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/upsample_evis.c @@ -147,7 +147,7 @@ DEF_KERNEL_INITIALIZER(_upsample_initializer) vsi_nn_kernel_tensor_attr_t *input_attr = NULL; vsi_nn_kernel_tensor_attr_t *output_attr = NULL; vsi_nn_kernel_tensor_attr_t *axis_attr = NULL; - vsi_int_array_t * input_shape = NULL; + vsi_size_array_t * input_shape = NULL; vsi_nn_kernel_dtype_e src_dtype = F16; vsi_nn_kernel_dtype_e dst_dtype = F16; vsi_nn_kernel_dtype_e axis_dtype = F16; @@ -871,11 +871,11 @@ static vsi_nn_kernel_node_t _setup return NULL; } - if( !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[0]->attr.size, + if( !vsi_nn_kernel_gpu_check_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num ) - || !vsi_nn_kernel_gpu_check_shape( (int32_t*)inputs[1]->attr.size, + || !vsi_nn_kernel_gpu_check_shape( inputs[1]->attr.size, inputs[1]->attr.dim_num ) - || !vsi_nn_kernel_gpu_check_shape( (int32_t*)outputs[0]->attr.size, + || !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, outputs[0]->attr.dim_num )) { return NULL; diff --git a/src/tim/vx/internal/src/kernel/evis/upsamplescale_evis.c b/src/tim/vx/internal/src/kernel/evis/upsamplescale_evis.c index 5d89b18..27a478b 100644 --- a/src/tim/vx/internal/src/kernel/evis/upsamplescale_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/upsamplescale_evis.c @@ -138,7 +138,7 @@ DEF_KERNEL_INITIALIZER(_upsamplescale_initializer) }; vsi_nn_kernel_tensor_attr_t * output_attr = NULL; vsi_nn_kernel_tensor_attr_t * input_attr = NULL; - 
vsi_int_array_t * in_shape = NULL; + vsi_size_array_t * in_shape = NULL; vsi_nn_kernel_dtype_e input_dtype = F16; vsi_nn_kernel_dtype_e output_dtype = F16; int32_t stride = 0; diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c index 5eefb9c..d954dc0 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c @@ -705,11 +705,15 @@ vsi_status vsi_nn_kernel_node_pass_param vsi_nn_kernel_tensor_t vsi_nn_kernel_tensor_reshape ( vsi_nn_kernel_tensor_t tensor, - int32_t* shape, - uint32_t rank + vsi_size_t* shape, + vsi_size_t rank ) { +#ifdef VSI_40BIT_VA_SUPPORT return (vsi_nn_kernel_tensor_t)vxReshapeTensor((vx_tensor)tensor, shape, rank); +#else + return (vsi_nn_kernel_tensor_t)vxReshapeTensor((vx_tensor)tensor, (vx_int32*)shape, (vx_uint32)rank); +#endif } /* vsi_nn_kernel_tensor_reshape() */ void vsi_nn_kernel_tensor_release @@ -982,11 +986,11 @@ vsi_nn_kernel_node_t vsi_nn_kernel_selector } /* vsi_nn_kernel_selector() */ vsi_bool vsi_nn_kernel_gpu_check_shape - ( const int32_t * shape, size_t rank ) + ( const vsi_size_t * shape, vsi_size_t rank ) { - size_t i; + vsi_size_t i; vsi_bool ret = TRUE; - const size_t channel_dim = 2; + const vsi_size_t channel_dim = 2; for( i = 0; i < vsi_nn_min(rank, channel_dim); i++ ) { if( shape[i] == 0 @@ -1090,7 +1094,7 @@ vsi_nn_kernel_tensor_attr_t * vsi_nn_kernel_tensor_attr_create { vsi_nn_kernel_tensor_attr_t * attr; vsi_status status; - uint32_t dim_num; + vsi_size_t dim_num; vsi_nn_type_e dtype = VSI_NN_TYPE_FLOAT16; vsi_nn_qnt_type_e quant_type = VSI_NN_QNT_TYPE_NONE; attr = (vsi_nn_kernel_tensor_attr_t *)malloc( @@ -1103,15 +1107,15 @@ vsi_nn_kernel_tensor_attr_t * vsi_nn_kernel_tensor_attr_create memset( attr, 0, sizeof(vsi_nn_kernel_tensor_attr_t) ); status = vxQueryTensor( (vx_tensor)tensor, VX_TENSOR_NUM_OF_DIMS, - &dim_num, sizeof(uint32_t)); + &dim_num, sizeof(dim_num)); CHECK_STATUS( status ); if( status == VSI_SUCCESS ) { - vsi_int_array_t * shape = vsi_int_array_create( dim_num ); + vsi_size_array_t * shape = vsi_size_array_create( dim_num ); if( shape ) { status = vxQueryTensor( (vx_tensor)tensor, VX_TENSOR_DIMS, - shape->data, sizeof(int32_t) * dim_num); + shape->data, sizeof(shape->data[0]) * dim_num); attr->shape = shape; CHECK_STATUS( status ); } @@ -1165,7 +1169,7 @@ void vsi_nn_kernel_tensor_attr_release if( p_attr && *p_attr ) { vsi_nn_kernel_tensor_attr_t * attr = *p_attr; - vsi_int_array_release( &attr->shape ); + vsi_size_array_release( &attr->shape ); if( attr->quant == VSI_NN_KERNEL_QUANT_ASYMM_PERCHANNEL ) { vsi_float_array_release( &attr->asymm_v.scale ); diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_eltwise.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_eltwise.c index 53597b9..ecbdccf 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_eltwise.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_eltwise.c @@ -42,28 +42,20 @@ static size_t vsi_nn_compute_element_num ( const int32_t* shape, const size_t rank); #endif -static size_t eltwise_fill_dim +static vsi_size_t eltwise_fill_dim ( - int32_t* shape_x, int32_t* shape_y, - int32_t* shape_output, size_t rank, - size_t max_rank, int32_t size_x, int32_t size_y, - int32_t size_output + vsi_size_t* shape_x, vsi_size_t* shape_y, + vsi_size_t* shape_output, vsi_size_t rank, + vsi_size_t max_rank, vsi_size_t size_x, vsi_size_t size_y, + vsi_size_t size_output ); static vsi_bool compute_gpu_divisor ( - const int32_t input_value, - const int32_t limit, 
+ const vsi_size_t input_value, + const vsi_size_t limit, const int32_t gcd, - int32_t* divisor - ); - -static size_t eltwise_fill_dim - ( - int32_t* shape_x, int32_t* shape_y, - int32_t* shape_output, size_t rank, - size_t max_rank, int32_t size_x, int32_t size_y, - int32_t size_output + vsi_size_t* divisor ); #if 0 @@ -82,13 +74,13 @@ static size_t vsi_nn_compute_element_num static vsi_bool compute_gpu_divisor ( - const int32_t input_value, - const int32_t limit, + const vsi_size_t input_value, + const vsi_size_t limit, const int32_t gcd, - int32_t* divisor + vsi_size_t* divisor ) { - int32_t i = 0; + vsi_size_t i = 0; for( i = vsi_nn_min( input_value, limit - 1 ); i > 0; i -- ) { if( ( i % gcd == 0 ) && ( input_value % i == 0 ) ) @@ -100,17 +92,16 @@ static vsi_bool compute_gpu_divisor return FALSE; } /* compute_gpu_divisor */ -static size_t eltwise_fill_dim +static vsi_size_t eltwise_fill_dim ( - int32_t* shape_x, int32_t* shape_y, - int32_t* shape_output, size_t rank, - size_t max_rank, int32_t size_x, int32_t size_y, - int32_t size_output + vsi_size_t* shape_x, vsi_size_t* shape_y, + vsi_size_t* shape_output, vsi_size_t rank, + vsi_size_t max_rank, vsi_size_t size_x, vsi_size_t size_y, + vsi_size_t size_output ) { - size_t cost_size = 1; + vsi_size_t cost_size = 1; VSI_ASSERT( rank <= max_rank ); - VSI_ASSERT( size_output >= (int32_t)((int64_t)(0xFFFFFFFF) - 1) ); if( size_output < GPU_TENSOR_MAX_WIDTH ) { shape_x[rank] = size_x; @@ -119,8 +110,8 @@ static size_t eltwise_fill_dim } else { - int32_t divisor = 0; - int32_t remainder = 0; + vsi_size_t divisor = 0; + vsi_size_t remainder = 0; compute_gpu_divisor( size_output, GPU_TENSOR_MAX_WIDTH, 1, &divisor ); remainder = size_output / divisor; if( remainder > GPU_TENSOR_MAX_WIDTH || rank >= max_rank ) @@ -166,22 +157,22 @@ static size_t eltwise_fill_dim vsi_bool vsi_nn_kernel_optimize_eltwise_shape ( - const int32_t* shape_x, const size_t rank_x, - const int32_t* shape_y, const size_t rank_y, - const int32_t* shape_output, const size_t rank_output, - int32_t* out_shape_x, int32_t* out_shape_y, - int32_t* out_shape_output, uint32_t* out_rank_output + const vsi_size_t* shape_x, const vsi_size_t rank_x, + const vsi_size_t* shape_y, const vsi_size_t rank_y, + const vsi_size_t* shape_output, const vsi_size_t rank_output, + vsi_size_t* out_shape_x, vsi_size_t* out_shape_y, + vsi_size_t* out_shape_output, vsi_size_t* out_rank_output ) { vsi_bool ret = TRUE; vsi_bool append_dim = FALSE; - size_t i = 0; - size_t dims = 0; - int32_t effective_size_x = 1; - int32_t effective_size_y = 1; - int32_t tmp_sz = 0; - int32_t sx = 0; - int32_t sy = 0; + vsi_size_t i = 0; + vsi_size_t dims = 0; + vsi_size_t effective_size_x = 1; + vsi_size_t effective_size_y = 1; + vsi_size_t tmp_sz = 0; + vsi_size_t sx = 0; + vsi_size_t sy = 0; eltwise_broadcast_state_e state = ELTWISE_BROADCAST_STATE_EMPTY; eltwise_broadcast_state_e prv_state = ELTWISE_BROADCAST_STATE_EMPTY; @@ -325,11 +316,11 @@ vsi_bool vsi_nn_kernel_optimize_eltwise_shape } /* For debug */ #if DEBUG - vsi_nn_print_int_array( out_shape_x, dims ); - vsi_nn_print_int_array( out_shape_y, dims ); - vsi_nn_print_int_array( out_shape_output, dims ); + vsi_nn_print_size_array( out_shape_x, dims ); + vsi_nn_print_size_array( out_shape_y, dims ); + vsi_nn_print_size_array( out_shape_output, dims ); #endif - *out_rank_output = (uint32_t)dims; + *out_rank_output = (size_t)dims; } #undef _swap_size return ret; @@ -337,18 +328,17 @@ vsi_bool vsi_nn_kernel_optimize_eltwise_shape -static size_t broadcast_fill_dim 
+static vsi_size_t broadcast_fill_dim ( - int32_t** shape_in, int32_t input_num, - int32_t* shape_output, size_t rank, - size_t max_rank, int32_t* size_in, - int32_t size_output + vsi_size_t** shape_in, int32_t input_num, + vsi_size_t* shape_output, vsi_size_t rank, + vsi_size_t max_rank, vsi_size_t* size_in, + vsi_size_t size_output ) { int32_t i = 0; - size_t cost_size = 1; + vsi_size_t cost_size = 1; VSI_ASSERT( rank <= max_rank ); - VSI_ASSERT( size_output >= (int32_t)((int64_t)(0xFFFFFFFF) - 1) ); if( size_output < GPU_TENSOR_MAX_WIDTH ) { for (i = 0; i < input_num; i++) @@ -359,8 +349,8 @@ static size_t broadcast_fill_dim } else { - int32_t divisor = 0; - int32_t remainder = 0; + vsi_size_t divisor = 0; + vsi_size_t remainder = 0; compute_gpu_divisor( size_output, GPU_TENSOR_MAX_WIDTH, 1, &divisor ); remainder = size_output / divisor; if( remainder > GPU_TENSOR_MAX_WIDTH || rank >= max_rank ) @@ -401,23 +391,23 @@ static size_t broadcast_fill_dim vsi_bool vsi_nn_kernel_optimize_broadcast_shape ( - const int32_t** shape_in, const size_t* rank_in, + const vsi_size_t** shape_in, const vsi_size_t* rank_in, const int32_t input_num, - const int32_t* shape_output, const size_t rank_output, - int32_t** out_shape_in, - int32_t* out_shape_output, uint32_t* out_rank_output + const vsi_size_t* shape_output, const vsi_size_t rank_output, + vsi_size_t** out_shape_in, + vsi_size_t* out_shape_output, uint32_t* out_rank_output ) { #define MAX_INPUT_NUM 30 vsi_bool ret = TRUE; vsi_bool append_dim = FALSE; - size_t i = 0; - size_t j = 0; - size_t k = 0; - size_t dims = 0; - int32_t effective_size[MAX_INPUT_NUM] = {1}; - int32_t tmp_sz = 0; - int32_t size_in[MAX_INPUT_NUM] = {0}; + vsi_size_t i = 0; + vsi_size_t j = 0; + vsi_size_t k = 0; + vsi_size_t dims = 0; + vsi_size_t effective_size[MAX_INPUT_NUM] = {1}; + vsi_size_t tmp_sz = 0; + vsi_size_t size_in[MAX_INPUT_NUM] = {0}; int32_t state_mask = 0; int32_t prv_state_mask = -1; @@ -436,14 +426,14 @@ vsi_bool vsi_nn_kernel_optimize_broadcast_shape goto final; } - for (i = 0; i < (size_t)input_num; i++) + for (i = 0; i < (vsi_size_t)input_num; i++) { effective_size[i] = 1; } for( i = 0; i < rank_output; i++ ) { - for (j = 0; j < (size_t)input_num; j++) + for (j = 0; j < (vsi_size_t)input_num; j++) { size_in[j] = i < rank_in[j] ? 
shape_in[j][i] : 1; } @@ -457,7 +447,7 @@ vsi_bool vsi_nn_kernel_optimize_broadcast_shape // Invalid shape for broadcasting k = 0; - for (j = 0; j < (size_t)input_num; j++) + for (j = 0; j < (vsi_size_t)input_num; j++) { if (size_in[j] > 1) { @@ -466,7 +456,7 @@ vsi_bool vsi_nn_kernel_optimize_broadcast_shape } } - for (j = 0; j < (uint32_t)input_num; j++) + for (j = 0; j < (vsi_size_t)input_num; j++) { if ((size_in[k] != size_in[j]) && (size_in[j] > 1)) @@ -477,7 +467,7 @@ vsi_bool vsi_nn_kernel_optimize_broadcast_shape } state_mask = 0; - for (j = 0; j < (size_t)input_num; j++) + for (j = 0; j < (vsi_size_t)input_num; j++) { if (1 == size_in[j]) { @@ -489,14 +479,14 @@ vsi_bool vsi_nn_kernel_optimize_broadcast_shape if ((-1 == prv_state_mask) || (state_mask == prv_state_mask)) { - for (j = 0; j < (size_t)input_num; j++) + for (j = 0; j < (vsi_size_t)input_num; j++) { effective_size[j] *= size_in[j]; } } else { - for (j = 0; j < (size_t)input_num; j++) + for (j = 0; j < (vsi_size_t)input_num; j++) { _swap_size(size_in[j], effective_size[j], tmp_sz); } @@ -507,9 +497,9 @@ vsi_bool vsi_nn_kernel_optimize_broadcast_shape if( append_dim ) { - int32_t size_output; + vsi_size_t size_output; size_output = size_in[0]; - for (j = 1; j < (size_t)input_num; j++) + for (j = 1; j < (vsi_size_t)input_num; j++) { size_output = vsi_nn_max(size_output, size_in[j]); } @@ -523,7 +513,7 @@ vsi_bool vsi_nn_kernel_optimize_broadcast_shape /* Append the last dim */ if( i == rank_output ) { - int32_t size_output; + vsi_size_t size_output; size_output = effective_size[0]; for (j = 1; j < (size_t)input_num; j++) { @@ -535,7 +525,7 @@ vsi_bool vsi_nn_kernel_optimize_broadcast_shape /* Avoid 1D shape*/ if( 1 == dims ) { - for (j = 0; j < (size_t)input_num; j++) + for (j = 0; j < (vsi_size_t)input_num; j++) { out_shape_in[j][1] = 1; } @@ -544,7 +534,7 @@ vsi_bool vsi_nn_kernel_optimize_broadcast_shape } else { - for (j = 0; j < (size_t)input_num; j++) + for (j = 0; j < (vsi_size_t)input_num; j++) { for ( i = 0; i < dims; i++) { diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c index ef35bf7..da0de6e 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_gpu_shape_optimize.c @@ -31,27 +31,27 @@ static vsi_bool compute_gpu_divisor ( - const int32_t input_value, - const int32_t limit, + const vsi_size_t input_value, + const vsi_size_t limit, const int32_t gcd, - int32_t* divisor + vsi_size_t* divisor ); -static size_t element_fill_dim +static vsi_size_t element_fill_dim ( - int32_t* shape_x, size_t rank_x, - size_t max_rank, int32_t size_x + vsi_size_t* shape_x, vsi_size_t rank_x, + vsi_size_t max_rank, vsi_size_t size_x ); static vsi_bool compute_gpu_divisor ( - const int32_t input_value, - const int32_t limit, + const vsi_size_t input_value, + const vsi_size_t limit, const int32_t gcd, - int32_t* divisor + vsi_size_t* divisor ) { - int32_t i = 0; + vsi_size_t i = 0; for( i = vsi_nn_min( input_value, limit - 1 ); i > 0; i -- ) { if ( ( i % gcd == 0 ) && ( input_value % i == 0 ) ) @@ -63,15 +63,14 @@ static vsi_bool compute_gpu_divisor return FALSE; } /* compute_gpu_divisor */ -static size_t element_fill_dim +static vsi_size_t element_fill_dim ( - int32_t* shape_x, size_t rank_x, - size_t max_rank, int32_t size_x + vsi_size_t* shape_x, vsi_size_t rank_x, + vsi_size_t max_rank, vsi_size_t size_x ) { - size_t cost_size = 1; + vsi_size_t cost_size = 1; 
VSI_ASSERT( rank_x <= max_rank ); - VSI_ASSERT( size_x >= (int32_t)((int64_t)(0xFFFFFFFF) - 1) ); if (size_x == 1) return 0; @@ -82,8 +81,8 @@ static size_t element_fill_dim } else { - int32_t divisor = 0; - int32_t remainder = 0; + vsi_size_t divisor = 0; + vsi_size_t remainder = 0; compute_gpu_divisor( size_x, GPU_TENSOR_MAX_WIDTH, 1, &divisor ); remainder = size_x / divisor; if ( remainder > GPU_TENSOR_MAX_WIDTH || rank_x >= max_rank) @@ -116,22 +115,22 @@ static size_t element_fill_dim /*only for continuous axises or one axis*/ vsi_bool vsi_nn_kernel_optimize_reduce_shape ( - const int32_t* shape_x, const size_t rank_x, - const int32_t *axis, const size_t axis_size, - const int32_t* shape_output, const size_t rank_output, - int32_t* out_shape_x, uint32_t* out_rank_x, - int32_t* out_shape_output, uint32_t* out_rank_output, + const vsi_size_t* shape_x, const vsi_size_t rank_x, + const int32_t *axis, const vsi_size_t axis_size, + const vsi_size_t* shape_output, const vsi_size_t rank_output, + vsi_size_t* out_shape_x, uint32_t* out_rank_x, + vsi_size_t* out_shape_output, uint32_t* out_rank_output, int32_t* out_axis, uint32_t* out_axis_size ) { vsi_bool ret = TRUE; - size_t i = 0; - size_t rank_in = 0; - size_t rank_out = 0; - size_t dims = 0; - int32_t innerSize = 1; - int32_t outerSize = 1; - int32_t axisSize = 1; + vsi_size_t i = 0; + vsi_size_t rank_in = 0; + vsi_size_t rank_out = 0; + vsi_size_t dims = 0; + vsi_size_t innerSize = 1; + vsi_size_t outerSize = 1; + vsi_size_t axisSize = 1; for (i = 0; i < axis_size; i++) { @@ -203,19 +202,19 @@ vsi_bool vsi_nn_kernel_optimize_reduce_shape vsi_bool vsi_nn_kernel_optimize_tensor_shape ( - const int32_t* shape_x, const size_t rank_x, - const int32_t *axis, const size_t axis_size, - int32_t* out_shape_x, uint32_t* out_rank_x, + const vsi_size_t* shape_x, const vsi_size_t rank_x, + const int32_t *axis, const vsi_size_t axis_size, + vsi_size_t* out_shape_x, uint32_t* out_rank_x, int32_t* out_axis, uint32_t* out_axis_size ) { vsi_bool ret = TRUE; - size_t i = 0; - size_t rank_in = 0; - size_t dims = 0; - int32_t innerSize = 1; - int32_t outerSize = 1; - int32_t axisSize = 1; + vsi_size_t i = 0; + vsi_size_t rank_in = 0; + vsi_size_t dims = 0; + vsi_size_t innerSize = 1; + vsi_size_t outerSize = 1; + vsi_size_t axisSize = 1; for (i = 0; i < axis_size; i++) { @@ -272,14 +271,14 @@ vsi_bool vsi_nn_kernel_optimize_tensor_shape vsi_bool vsi_nn_kernel_optimize_element_shape ( - const int32_t* shape_x, const size_t rank_x, - int32_t* out_shape_x, int32_t* out_rank_x + const vsi_size_t* shape_x, const vsi_size_t rank_x, + vsi_size_t* out_shape_x, vsi_size_t* out_rank_x ) { vsi_bool ret = TRUE; uint32_t i = 0; - size_t rank_in = 0; - int32_t element_num = 1; + vsi_size_t rank_in = 0; + vsi_size_t element_num = 1; for (i = 0; i < rank_x; i++) { @@ -300,24 +299,24 @@ vsi_bool vsi_nn_kernel_optimize_element_shape rank_in = 2; } - *out_rank_x = (int32_t)rank_in; + *out_rank_x = (size_t)rank_in; return ret; } /* vsi_nn_kernel_optimize_element_shape() */ vsi_bool vsi_nn_kernel_optimize_softmax_shape ( - const int32_t* shape_x, const size_t rank_x, const int32_t axis, - int32_t* out_shape_x, uint32_t* out_rank_x,int32_t* out_axis + const vsi_size_t* shape_x, const vsi_size_t rank_x, const int32_t axis, + vsi_size_t* out_shape_x, uint32_t* out_rank_x,int32_t* out_axis ) { vsi_bool ret = TRUE; - size_t i = 0; - size_t rank_in = 0; - size_t dims = 0; - int32_t innerSize = 1; - int32_t outerSize = 1; - int32_t axisSize = shape_x[axis]; + vsi_size_t i = 0; + 
vsi_size_t rank_in = 0; + vsi_size_t dims = 0; + vsi_size_t innerSize = 1; + vsi_size_t outerSize = 1; + vsi_size_t axisSize = shape_x[axis]; for (i = 0; i < (size_t)axis; i++) { @@ -372,17 +371,16 @@ typedef enum TILE_STATE_EMPTY = 8, } tile_axis_state_e; -static size_t tile_fill_dim +static vsi_size_t tile_fill_dim ( - int32_t* shape_x, int32_t* shape_y, - int32_t* shape_output, size_t rank, - size_t max_rank, int32_t size_x, int32_t size_y, - int32_t size_output + vsi_size_t* shape_x, vsi_size_t* shape_y, + vsi_size_t* shape_output, vsi_size_t rank, + vsi_size_t max_rank, vsi_size_t size_x, vsi_size_t size_y, + vsi_size_t size_output ) { - size_t cost_size = 1; + vsi_size_t cost_size = 1; VSI_ASSERT( rank <= max_rank ); - VSI_ASSERT( size_output >= (int32_t)((int64_t)(0xFFFFFFFF) - 1) ); if ( size_output < GPU_TENSOR_MAX_WIDTH ) { shape_x[rank] = size_x; @@ -391,8 +389,8 @@ static size_t tile_fill_dim } else { - int32_t divisor = 0; - int32_t remainder = 0; + vsi_size_t divisor = 0; + vsi_size_t remainder = 0; compute_gpu_divisor( size_output, GPU_TENSOR_MAX_WIDTH, 1, &divisor ); remainder = size_output / divisor; if ( remainder > GPU_TENSOR_MAX_WIDTH || rank >= max_rank ) @@ -438,23 +436,23 @@ static size_t tile_fill_dim vsi_bool vsi_nn_kernel_optimize_tile_shape ( - const int32_t* shape_x, const size_t rank_x, - const int32_t* multiples, const size_t rank, - const int32_t* shape_output, const size_t rank_output, - int32_t* out_shape_x, int32_t* out_shape_y, - int32_t* out_shape_output, uint32_t* out_rank_output + const vsi_size_t* shape_x, const vsi_size_t rank_x, + const vsi_size_t* multiples, const vsi_size_t rank, + const vsi_size_t* shape_output, const vsi_size_t rank_output, + vsi_size_t* out_shape_x, vsi_size_t* out_shape_y, + vsi_size_t* out_shape_output, vsi_size_t* out_rank_output ) { vsi_bool ret = TRUE; vsi_bool append_dim = FALSE; - size_t i = 0; - size_t dims = 0; - int32_t effective_size_x = 1; - int32_t effective_size_y = 1; - int32_t effective_size_z = 1; - int32_t sx = 0; - int32_t sy = 0; - int32_t sz = 0; + vsi_size_t i = 0; + vsi_size_t dims = 0; + vsi_size_t effective_size_x = 1; + vsi_size_t effective_size_y = 1; + vsi_size_t effective_size_z = 1; + vsi_size_t sx = 0; + vsi_size_t sy = 0; + vsi_size_t sz = 0; tile_axis_state_e state = TILE_STATE_EMPTY; tile_axis_state_e next_state = TILE_STATE_EMPTY; @@ -569,9 +567,9 @@ vsi_bool vsi_nn_kernel_optimize_tile_shape } /* For debug */ #if DEBUG - vsi_nn_print_int_array( out_shape_x, dims ); - vsi_nn_print_int_array( out_shape_y, dims ); - vsi_nn_print_int_array( out_shape_output, dims ); + vsi_nn_print_size_array( out_shape_x, dims ); + vsi_nn_print_size_array( out_shape_y, dims ); + vsi_nn_print_size_array( out_shape_output, dims ); #endif *out_rank_output = (uint32_t)dims; } @@ -581,11 +579,11 @@ vsi_bool vsi_nn_kernel_optimize_tile_shape vsi_bool vsi_nn_kernel_optimize_1d_tensor_shape ( - const int32_t* shape, const uint32_t rank, - int32_t* out_shape, uint32_t* out_rank + const vsi_size_t* shape, const uint32_t rank, + vsi_size_t* out_shape, uint32_t* out_rank ) { - memcpy(out_shape, shape, sizeof(int32_t) * rank); + memcpy(out_shape, shape, sizeof(vsi_size_t) * rank); *out_rank = vsi_nn_max(rank, 2); out_shape[1] = rank == 1 ? 
1 : out_shape[1]; @@ -595,8 +593,8 @@ vsi_bool vsi_nn_kernel_optimize_1d_tensor_shape vsi_bool vsi_nn_kernel_optimize_nchw2xhw_shape ( - const int32_t* shape, const uint32_t rank, - int32_t* out_shape, uint32_t* out_rank + const vsi_size_t* shape, const uint32_t rank, + vsi_size_t* out_shape, uint32_t* out_rank ) { uint32_t dim_num = 0; diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c index 2b98e93..2447239 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c @@ -58,8 +58,8 @@ KERNEL_SELECTOR( depthwise_conv1d ) ) { int32_t dilation = vsi_nn_kernel_param_get_int32( params, "dilation" ); - int32_t kernel = inputs[1]->attr.size[0]; - int32_t real_kernel = 0; + vsi_size_t kernel = inputs[1]->attr.size[0]; + vsi_size_t real_kernel = 0; int32_t stride = vsi_nn_kernel_param_get_int32( params, "stride" ); vsi_nn_kernel_pirority_t pirority[] = { { VSI_NN_KERNEL_TYPE_VX, 0 }, @@ -129,5 +129,6 @@ REGISTER_VX_FIRST_KERNEL_SELECTOR(relu_keras) REGISTER_VX_FIRST_KERNEL_SELECTOR(erf) REGISTER_VX_FIRST_KERNEL_SELECTOR(gelu) REGISTER_VX_FIRST_KERNEL_SELECTOR(hard_gelu) +REGISTER_VX_FIRST_KERNEL_SELECTOR(matrixmul) __END_DECLS diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c index 9ea24e5..15de948 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c @@ -52,6 +52,7 @@ vsi_status _copy_tensor size_t start[VSI_NN_MAX_DIM_NUM] = { 0 }; size_t end[VSI_NN_MAX_DIM_NUM] = { 0 }; size_t stride[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t stride2[VSI_NN_MAX_DIM_NUM] = { 0 }; size_t type_bytes; size_t total_bytes; uint32_t i; @@ -69,13 +70,18 @@ vsi_status _copy_tensor } total_bytes = vsi_nn_kernel_tensor_attr_get_bytes( attr ); - if( total_bytes != buffer_size ) + if( total_bytes != (vsi_size_t)buffer_size ) { - VSILOGE("Read buffer size mismatch %d vs %d", total_bytes, buffer_size); + VSILOGE("Read buffer size mismatch %"VSI_SIZE_T_SPECIFIER" vs %"VSI_SIZE_T_SPECIFIER"", + total_bytes, (vsi_size_t)buffer_size); goto final; } - vsi_nn_shape_get_stride( attr->shape->data, attr->shape->size, stride ); + vsi_nn_shape_get_stride( attr->shape->data, (vsi_size_t)attr->shape->size, stride2 ); + for( i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + stride[i] = stride2[i]; + } type_bytes = vsi_nn_kernel_dtype_get_bytes( attr->dtype ); rank = attr->shape->size; for( i = 0; i < rank; i++ ) @@ -166,13 +172,13 @@ void * vsi_nn_kernel_tensor_create_buffer case VSI_NN_KERNEL_QUANT_DFP: vsi_nn_dtype_convert_quantize_dfp_to_float( buffer, tensor_size, attr->dtype, - attr->dfp.fl, out_buffer ); + attr->dfp.fl, (float*)out_buffer ); break; case VSI_NN_KERNEL_QUANT_ASYMM: vsi_nn_dtype_convert_quantize_asymm_to_float( buffer, tensor_size, attr->dtype, attr->asymm.scale, attr->asymm.zero_point, - out_buffer ); + (float*)out_buffer ); break; case VSI_NN_KERNEL_QUANT_SYMM_PERCHANNEL: vsi_nn_dtype_convert_quantize_symm_perchannel_to_float( @@ -183,7 +189,7 @@ void * vsi_nn_kernel_tensor_create_buffer attr->asymm_v.zero_point->data, attr->asymm_v.zero_point->size, attr->asymm_v.channel_dim, - out_buffer ); + (float*)out_buffer ); break; default: VSILOGE("Donot support quantize type %d", attr->quant); @@ -194,7 +200,7 @@ void * vsi_nn_kernel_tensor_create_buffer else { vsi_nn_dtype_convert_dtype_to_float( buffer, tensor_size, - attr->dtype, out_buffer ); + 
attr->dtype, (float*)out_buffer ); } free( buffer ); } @@ -407,7 +413,7 @@ static void _convert_tensor_attr_to_vx_tensor_param memset( p, 0, sizeof( vx_tensor_create_params_t ) ); p->num_of_dims = (uint32_t)attr->shape->size; - p->sizes = (uint32_t*)attr->shape->data; + p->sizes = attr->shape->data; #define MAP_TYPE( var, src_type, dst_type ) \ case src_type: \ var = dst_type; \ @@ -494,28 +500,28 @@ vsi_nn_tensor_t* vsi_nn_pad_tensor ( vsi_nn_graph_t * graph, vsi_nn_tensor_t * input, - int32_t * pad_front, - int32_t * pad_end, - size_t pad_size, + vsi_size_t * pad_front, + vsi_size_t * pad_end, + vsi_size_t pad_size, vsi_nn_pad_mode_e mode, float pad_value ) { - uint32_t sz = 0; + vsi_size_t sz = 0; vsi_nn_tensor_attr_t attr; float *input_data_ptr = NULL; float *output_data_ptr = NULL; float *src_ptr = NULL; float *dst_ptr = NULL; - int32_t i = 0; - int32_t out_w = 0; - int32_t out_h = 0; - int32_t out_d = 0; - int32_t out_b = 0; - int32_t output_width = 1; - int32_t output_height = 1; - int32_t output_depth = 1; - int32_t output_batch = 1; + vsi_size_t i = 0; + vsi_size_t out_w = 0; + vsi_size_t out_h = 0; + vsi_size_t out_d = 0; + vsi_size_t out_b = 0; + vsi_size_t output_width = 1; + vsi_size_t output_height = 1; + vsi_size_t output_depth = 1; + vsi_size_t output_batch = 1; vsi_nn_dtype_t dst_type; vsi_nn_tensor_t *output = NULL; @@ -524,10 +530,10 @@ vsi_nn_tensor_t* vsi_nn_pad_tensor memcpy(&attr, &input->attr, sizeof(vsi_nn_tensor_attr_t)); - for(i = 0; i < (int32_t)pad_size; i ++) + for(i = 0; i < pad_size; i ++) { - int32_t front = pad_front[i]; - int32_t back = pad_end[i]; + vsi_size_t front = pad_front[i]; + vsi_size_t back = pad_end[i]; attr.size[i] = front + back + attr.size[i]; } diff --git a/src/tim/vx/internal/src/kernel/vx/a_times_b_plus_c_vx.c b/src/tim/vx/internal/src/kernel/vx/a_times_b_plus_c_vx.c index 32fd5b8..6756e3a 100644 --- a/src/tim/vx/internal/src/kernel/vx/a_times_b_plus_c_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/a_times_b_plus_c_vx.c @@ -74,7 +74,7 @@ REGISTER_A_TIMES_B_PLUS_C_OPENVX_KERNEL( a_times_b_plus_c ) } memset(&attr, 0, sizeof(attr)); - memcpy(attr.size, outputs[0]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof( uint32_t )); + memcpy(attr.size, outputs[0]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); attr.dim_num = outputs[0]->attr.dim_num; attr.vtl = TRUE; attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; diff --git a/src/tim/vx/internal/src/kernel/vx/convolutional.c b/src/tim/vx/internal/src/kernel/vx/convolutional.c index 04e517c..2bb2248 100644 --- a/src/tim/vx/internal/src/kernel/vx/convolutional.c +++ b/src/tim/vx/internal/src/kernel/vx/convolutional.c @@ -121,22 +121,23 @@ static vsi_bool _build_vx_deconv2d_param } /* _build_vx_deconv2d_param() */ static vx_tensor _expand_tensor_dim - ( vx_tensor tensor, int32_t * shape, size_t rank, int32_t expand_dim ) + ( vx_tensor tensor, vsi_ssize_t * shape, size_t rank, vsi_ssize_t expand_dim ) { - int32_t new_shape[VSI_NN_MAX_DIM_NUM] = { 0 }; - uint32_t i, cnt; + vsi_ssize_t new_shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t i, cnt; if( expand_dim < 0 ) { - expand_dim = (int32_t)rank + expand_dim; + expand_dim = (vsi_ssize_t)rank + expand_dim; } - if( expand_dim < 0 || (uint32_t)expand_dim > rank ) + if( expand_dim < 0 || (vsi_size_t)expand_dim > rank ) { - VSILOGE("Run dim to expand %d, rank is %lu", expand_dim, rank); + VSILOGE("Run dim to expand %"VSI_SSIZE_T_SPECIFIER", rank is %"SIZE_T_SPECIFIER, + expand_dim, rank); return NULL; } for( i = 0, cnt = 0; i < rank; i ++ ) { - if( i == 
(uint32_t)expand_dim ) + if( i == (vsi_size_t)expand_dim ) { new_shape[cnt] = 1; cnt ++; @@ -148,7 +149,12 @@ static vx_tensor _expand_tensor_dim { new_shape[cnt] = 1; } - return vxReshapeTensor( tensor, new_shape, (uint32_t)rank + 1 ); +#ifdef VSI_40BIT_VA_SUPPORT + return vxReshapeTensor( tensor, (vsi_size_t*)new_shape, rank + 1 ); +#else + return vxReshapeTensor( tensor, (int32_t*)new_shape, (uint32_t)(rank + 1) ); +#endif + } /* _expand_tensor_dim() */ @@ -197,12 +203,12 @@ REGISTER_CONV_OPENVX_KERNEL( conv1d ) ); temp_tensors[0] = _expand_tensor_dim( inputs[0]->t, - (int32_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num, 0 ); + (vsi_ssize_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num, 0 ); CHECK_PTR_FAIL_GOTO( temp_tensors[0], "Expand input dim fail.", final ); if (inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC) { temp_tensors[1] = _expand_tensor_dim( inputs[1]->t, - (int32_t*)inputs[1]->attr.size, inputs[1]->attr.dim_num, 0 ); + (vsi_ssize_t*)inputs[1]->attr.size, inputs[1]->attr.dim_num, 0 ); CHECK_PTR_FAIL_GOTO( temp_tensors[1], "Expand kernel dim fail.", final ); } else @@ -229,7 +235,7 @@ REGISTER_CONV_OPENVX_KERNEL( conv1d ) } temp_tensors[2] = _expand_tensor_dim( outputs[0]->t, - (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num, 0 ); + (vsi_ssize_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num, 0 ); CHECK_PTR_FAIL_GOTO( temp_tensors[2], "Expand output dim fail.", final ); node = vxConvolutionLayer( graph->g, @@ -272,12 +278,12 @@ REGISTER_CONV_OPENVX_KERNEL( depthwise_conv1d ) ); temp_tensors[0] = _expand_tensor_dim( inputs[0]->t, - (int32_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num, 0 ); + (vsi_ssize_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num, 0 ); CHECK_PTR_FAIL_GOTO( temp_tensors[0], "Expand input dim fail.", final ); if (inputs[1]->attr.dtype.qnt_type != VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC) { - int32_t new_w_shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t new_w_shape[VSI_NN_MAX_DIM_NUM] = { 0 }; uint32_t new_w_rank = 4; new_w_shape[0] = 1; new_w_shape[1] = inputs[1]->attr.size[0]; @@ -287,7 +293,12 @@ REGISTER_CONV_OPENVX_KERNEL( depthwise_conv1d ) new_w_shape[2] *= inputs[1]->attr.size[i]; } new_w_shape[3] = 1; +#ifdef VSI_40BIT_VA_SUPPORT temp_tensors[1] = vxReshapeTensor( inputs[1]->t, new_w_shape, new_w_rank ); +#else + temp_tensors[1] = vxReshapeTensor( inputs[1]->t, (vx_int32*)new_w_shape, (vx_uint32)new_w_rank ); +#endif + CHECK_PTR_FAIL_GOTO( temp_tensors[1], "Expand kernel dim fail.", final ); } else @@ -318,7 +329,7 @@ REGISTER_CONV_OPENVX_KERNEL( depthwise_conv1d ) } temp_tensors[2] = _expand_tensor_dim( outputs[0]->t, - (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num, 0 ); + (vsi_ssize_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num, 0 ); CHECK_PTR_FAIL_GOTO( temp_tensors[2], "Expand output dim fail.", final ); if( need_explicit_padding ) @@ -450,11 +461,11 @@ REGISTER_CONV_OPENVX_KERNEL( deconvolution1d ) ); temp_tensors[0] = _expand_tensor_dim( inputs[0]->t, - (int32_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num, 1 ); + (vsi_ssize_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num, 1 ); CHECK_PTR_FAIL_GOTO( temp_tensors[0], "Expand input dim fail.", final ); temp_tensors[1] = _expand_tensor_dim( outputs[0]->t, - (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num, 1 ); + (vsi_ssize_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num, 1 ); CHECK_PTR_FAIL_GOTO( temp_tensors[1], "Expand output dim fail.", final ); node = vxDeconvolutionLayer( graph->g, diff --git 
a/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c b/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c index 977be07..492f8f7 100644 --- a/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/eltwise_unary_vx.c @@ -240,7 +240,7 @@ static vsi_nn_kernel_node_t _setup node = vxTensorTableLookupLayer( graph->g, inputs[0]->t, lut1, lut2, outputs[0]->t); if( NULL == node ) { - VSILOGE("Call vxTensorTableLookupLayer fail."); + VSILOGW("Call vxTensorTableLookupLayer fail."); goto OnError; } diff --git a/src/tim/vx/internal/src/kernel/vx/matrixmul_vx.c b/src/tim/vx/internal/src/kernel/vx/matrixmul_vx.c new file mode 100644 index 0000000..bdbc6c4 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vx/matrixmul_vx.c @@ -0,0 +1,91 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "kernel/vsi_nn_kernel.h" + +#if VX_BATCH_GEMM_API_SUPPORT + +#define REGISTER_BATCH_GEMM_OPENVX_KERNEL( kernel_name ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ); \ + REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) + +REGISTER_BATCH_GEMM_OPENVX_KERNEL( matrixmul ) +{ + vx_node node = NULL; + int32_t transposeA = vsi_nn_kernel_param_get_int32(params, "transposeA"); + int32_t transposeB = vsi_nn_kernel_param_get_int32(params, "transposeB"); + vx_scalar trans_a = vxCreateScalar(graph->ctx->c, VX_TYPE_BOOL, &transposeA); + vx_scalar trans_b = vxCreateScalar(graph->ctx->c, VX_TYPE_BOOL, &transposeB); + + node = vxBatchGemmNode(graph->g, + inputs[0]->t, + inputs[1]->t, + NULL, + trans_a, + trans_b, + NULL, + outputs[0]->t); + + if( NULL == node ) + { + VSILOGW("Call vxBatchGemmNode fail."); + goto OnError; + } + +OnError: + if (trans_a) vxReleaseScalar(&trans_a); + if (trans_b) vxReleaseScalar(&trans_b); + + return (vsi_nn_kernel_node_t)node; +} /* matrixmul() */ + +#undef REGISTER_BATCH_GEMM_OPENVX_KERNEL + +#endif + diff --git a/src/tim/vx/internal/src/kernel/vx/prelu_vx.c b/src/tim/vx/internal/src/kernel/vx/prelu_vx.c index 3a8a861..4728ad6 100644 --- a/src/tim/vx/internal/src/kernel/vx/prelu_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/prelu_vx.c @@ -38,8 +38,8 @@ static vsi_nn_tensor_t * _reshape_to_1d_tensor { vsi_nn_tensor_t *tensor = NULL; uint32_t i = 0; - uint32_t size = 0; - int32_t shapes[VSI_NN_MAX_DIM_NUM] = { 1 }; + vsi_size_t size = 0; + vsi_size_t shapes[VSI_NN_MAX_DIM_NUM] = { 1 }; uint32_t one_rank = 0; for (i = 0; i < input->attr.dim_num; i++) @@ -61,7 +61,7 @@ static vsi_nn_tensor_t * _reshape_to_1d_tensor return NULL; } - tensor = vsi_nn_reshape_tensor( graph, input, (uint32_t*)shapes, 1 ); + tensor = vsi_nn_reshape_tensor( graph, input, shapes, 1 ); return tensor; } diff --git a/src/tim/vx/internal/src/kernel/vx/relu_keras_vx.c b/src/tim/vx/internal/src/kernel/vx/relu_keras_vx.c index 9afde85..9c5b0cb 100644 --- a/src/tim/vx/internal/src/kernel/vx/relu_keras_vx.c +++ b/src/tim/vx/internal/src/kernel/vx/relu_keras_vx.c @@ -157,7 +157,7 @@ static vsi_nn_kernel_node_t _setup node = vxTensorTableLookupLayer( graph->g, inputs[0]->t, lut1, lut2, outputs[0]->t); if( NULL == node ) { - VSILOGE("Call vxTensorTableLookupLayer fail."); + VSILOGW("Call vxTensorTableLookupLayer fail."); goto OnError; } diff --git a/src/tim/vx/internal/src/kernel/vx/square_vx.c b/src/tim/vx/internal/src/kernel/vx/square_vx.c new file mode 100644 index 0000000..839890b --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vx/square_vx.c @@ -0,0 +1,209 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person 
obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include <stdlib.h> /* calloc/qsort/free used by the LUT helpers below */ +#include "utils/vsi_nn_dtype_util_prv.h" +#include "vsi_nn_tensor_util.h" +#include "kernel/vsi_nn_kernel.h" + +typedef struct _sort_lut_s +{ + float index; + float val; +} sort_lut; + +static float square_eval(float x) +{ + return x * x; +} + +#ifdef VX_USER_LOOKUP_TABLE_SUPPORT +static int32_t _lut_comparator(const void *pa, const void *pb) +{ + sort_lut a = *(sort_lut *)pa; + sort_lut b = *(sort_lut *)pb; + float diff = a.index - b.index; + if ( diff > 0 ) + { + return 1; + } + else if ( diff < 0 ) + { + return -1; + } + + return 0; +} + +static void _set_table_lookup(float func(float), float *index, float *value) +{ +#define VSI_NN_MAX_LUT_SIZE (1024) +#define FLT16_MAX (57344) +#define FLT16_MIN (-57344) + uint32_t i = 0; + sort_lut *lut = (sort_lut *)calloc(VSI_NN_MAX_LUT_SIZE, sizeof(sort_lut)); + + for ( i = 0; i < VSI_NN_MAX_LUT_SIZE; i++) + { + int16_t val = (int16_t)(i << 6); + lut[i].index = fp16_to_fp32(val); + lut[i].val = func(lut[i].index); + } + + for (i = 0x0; i < 0x10; i++) + { + lut[i].index = 0; + lut[i].val = func(lut[i].index); + } + + for (i = 0x1F0; i < 0x200; i++) + { + lut[i].index = FLT16_MAX; + lut[i].val = func(lut[i].index); + } + + for (i = 0x3F0; i < 0x400; i++) + { + lut[i].index = FLT16_MIN; + lut[i].val = func(lut[i].index); + } + + qsort(lut, VSI_NN_MAX_LUT_SIZE, sizeof(sort_lut), _lut_comparator); + + for ( i = 0; i < VSI_NN_MAX_LUT_SIZE; i++) + { + index[i] = lut[i].index; + value[i] = lut[i].val; + } + + vsi_nn_safe_free(lut); + +#undef VSI_NN_MAX_LUT_SIZE +#undef FLT16_MIN +#undef FLT16_MAX +} +#endif + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel, + float func(float) + ) +{ + vx_node node = NULL; +#ifdef VX_USER_LOOKUP_TABLE_SUPPORT + vx_lut lut1 = NULL; + vx_lut lut2 = NULL; + float index[1024] = {0}; + float value[1024] = {0}; + + if ( inputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 || + outputs[0]->attr.dtype.vx_type == VSI_NN_TYPE_INT32 ) + { + return NULL; + } + + _set_table_lookup(func, index, value); + + lut1 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, 1024); + lut2 = vxCreateLUT( graph->ctx->c, VX_TYPE_FLOAT32, 1024); + if( NULL == lut1 
|| NULL == lut2 ) + { + VSILOGE("create lut object fail."); + goto OnError; + } + + vxCopyLUT(lut1, (void*)&index, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST); + vxCopyLUT(lut2, (void*)&value, VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST); + + node = vxTensorTableLookupLayer( graph->g, inputs[0]->t, lut1, lut2, outputs[0]->t); + if( NULL == node ) + { + node = vxActivationLayer( + graph->g, + inputs[0]->t, + VX_NN_ACTIVATION_SQUARE, + 0, + 0, + outputs[0]->t + ); + } + +OnError: + if (lut1) + { + vxReleaseLUT(&lut1); + lut1 = NULL; + } + if (lut2) + { + vxReleaseLUT(&lut2); + lut2 = NULL; + } + return (vsi_nn_kernel_node_t)node; +#else + node = vxActivationLayer( + graph->g, + inputs[0]->t, + VX_NN_ACTIVATION_SQUARE, + 0, + 0, + outputs[0]->t + ); + + return (vsi_nn_kernel_node_t)node; +#endif +} /* _setup() */ + +#define REGISTER_SQUARE_OPENVX_KERNEL(KERNEL_NAME, ACT_FUNC) \ + static vsi_nn_kernel_node_t _##KERNEL_NAME##_setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num, \ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) \ + { \ + return _setup(graph, inputs, input_num, outputs, output_num, \ + params, kernel, ACT_FUNC); \ + } \ + REGISTER_BACKEND_OPENVX( KERNEL_NAME, _##KERNEL_NAME##_setup ) + +REGISTER_SQUARE_OPENVX_KERNEL( square, square_eval ) + +#undef REGISTER_SQUARE_OPENVX_KERNEL diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/batchnorm_single.cl b/src/tim/vx/internal/src/libnnext/ops/cl/batchnorm_single.cl index e2ed333..316e5e8 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/batchnorm_single.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/batchnorm_single.cl @@ -1,138 +1,219 @@ - -#define READ_IMAGEF_ARRAY2D(dest, tensor, coord) \ - do { \ - int depth = get_image_array_size(tensor); \ - _viv_asm(CLAMP0MAX, coord_in0.z, coord_in0.z, in0_depth - 1); \ - dest = read_imagef(tensor, coord); \ - } while(0) -__kernel void batch_norm_F32toF32 - ( - __read_only image2d_array_t input, - __read_only image2d_array_t Mean, - __read_only image2d_array_t Variance, - __read_only image2d_array_t Gamma, - __read_only image2d_array_t Beta, - __write_only image2d_array_t output, - float eps, - float input_scale, - float input_tail, - float output_scale, - float output_zp - ) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - - float4 src, mean, var, gamma, beta; - READ_IMAGEF_2DARRAY(src, input, coord); - READ_IMAGEF_2DARRAY(mean, Mean, coord); - READ_IMAGEF_2DARRAY(var, Variance, coord); - READ_IMAGEF_2DARRAY(gamma, Gamma, coord); - READ_IMAGEF_2DARRAY(beta, Beta, coord); - - float4 dst; - src.x = src.x - mean.x; - float inv = rsqrt(var.x + eps); - dst.x = src.x * inv *gamma.x + beta.x; - - write_imagef(output, coord, dst); -} - -__kernel void batch_norm_F32toF32_2D - ( - __read_only image2d_t input, - __read_only image2d_t Mean, - __read_only image2d_t Variance, - __read_only image2d_t Gamma, - __read_only image2d_t Beta, - __write_only image2d_t output, - float eps, - float input_scale, - float input_tail, - float output_scale, - float output_zp - ) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - - float4 src = read_imagef(input, coord); - float4 mean = read_imagef(Mean, coord); - float4 var = read_imagef(Variance, coord); - float4 gamma = read_imagef(Gamma, coord); - float4 beta = read_imagef(Beta, coord); - - float4 dst = 0; - src.x = src.x - mean.x; - float inv = rsqrt(var.x + eps); - dst.x = src.x * inv *gamma.x + beta.x; - - 
write_imagef(output, coord, dst); -} - -__kernel void batch_norm_U8toU8 - ( - __read_only image2d_array_t input, - __read_only image2d_array_t Mean, - __read_only image2d_array_t Variance, - __read_only image2d_array_t Gamma, - __read_only image2d_array_t Beta, - __write_only image2d_array_t output, - float eps, - float input_scale, - float input_tail, - float output_scale, - float output_zp - ) -{ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); - - uint4 data; - float4 src, mean, var, gamma, beta; - READ_IMAGEF_2DARRAY(data, input, coord); - READ_IMAGEF_2DARRAY(mean, Mean, coord); - READ_IMAGEF_2DARRAY(var, Variance, coord); - READ_IMAGEF_2DARRAY(gamma, Gamma, coord); - READ_IMAGEF_2DARRAY(beta, Beta, coord); - - src = convert_float4(data) * input_scale - input_tail; - src.x = src.x - mean.x; - float inv = rsqrt(var.x + eps); - src.x = src.x * inv *gamma.x + beta.x; - - uint4 dst = convert_uint4(src * output_scale + output_zp); - +#define BN_U8_SAVE \ + uint4 dst = convert_uint4(src * output_scale + output_zp); \ write_imageui(output, coord, dst); + +#define BN_I32_SAVE \ + int4 dst = convert_int4(src * output_scale + output_zp); \ + write_imagei(output, coord, dst); + +#define BN_F32_SAVE \ + write_imagef(output, coord, src); + +#define BATCH_NORM_F32_SH_IMPL(TYPE) \ +__kernel void batch_norm_F32to##TYPE \ + ( \ + __read_only image2d_array_t input, \ + __read_only image2d_array_t Mean, \ + __read_only image2d_array_t Variance, \ + __read_only image2d_array_t Gamma, \ + __read_only image2d_array_t Beta, \ + __write_only image2d_array_t output, \ + float eps, \ + float input_scale, \ + float input_tail, \ + float output_scale, \ + float output_zp \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + float4 src, mean, var, gamma, beta; \ + READ_IMAGEF_2DARRAY(src, input, coord); \ + READ_IMAGEF_2DARRAY(mean, Mean, coord); \ + READ_IMAGEF_2DARRAY(var, Variance, coord); \ + READ_IMAGEF_2DARRAY(gamma, Gamma, coord); \ + READ_IMAGEF_2DARRAY(beta, Beta, coord); \ + \ + src.x = src.x - mean.x; \ + float inv = rsqrt(var.x + eps); \ + src.x = src.x * inv *gamma.x + beta.x; \ + \ + BN_##TYPE##_SAVE \ } +BATCH_NORM_F32_SH_IMPL(F32) +BATCH_NORM_F32_SH_IMPL(U8) +BATCH_NORM_F32_SH_IMPL(I32) -__kernel void batch_norm_U8toU8_2D - ( - __read_only image2d_t input, - __read_only image2d_t Mean, - __read_only image2d_t Variance, - __read_only image2d_t Gamma, - __read_only image2d_t Beta, - __write_only image2d_t output, - float eps, - float input_scale, - float input_tail, - float output_scale, - float output_zp - ) -{ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); - - uint4 data = read_imageui(input, coord); - float4 mean = read_imagef(Mean, coord); - float4 var = read_imagef(Variance, coord); - float4 gamma = read_imagef(Gamma, coord); - float4 beta = read_imagef(Beta, coord); - - float4 src = convert_float4(data) * input_scale - input_tail; - src.x = src.x - mean.x; - float inv = rsqrt(var.x + eps); - src.x = src.x * inv *gamma.x + beta.x; - - uint4 dst = convert_uint4(src * output_scale + output_zp); - - write_imageui(output, coord, dst); +#define BATCH_NORM_F32_SH_IMPL_2D(TYPE) \ +__kernel void batch_norm_F32to##TYPE##_2D \ + ( \ + __read_only image2d_t input, \ + __read_only image2d_t Mean, \ + __read_only image2d_t Variance, \ + __read_only image2d_t Gamma, \ + __read_only image2d_t Beta, \ + __write_only image2d_t output, \ + float eps, \ + float input_scale, \ + float input_tail, \ + float output_scale, \ + 
float output_zp \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + \ + float4 src = read_imagef(input, coord); \ + float4 mean = read_imagef(Mean, coord); \ + float4 var = read_imagef(Variance, coord); \ + float4 gamma = read_imagef(Gamma, coord); \ + float4 beta = read_imagef(Beta, coord); \ + \ + src.x = src.x - mean.x; \ + float inv = rsqrt(var.x + eps); \ + src.x = src.x * inv *gamma.x + beta.x; \ + \ + BN_##TYPE##_SAVE \ } +BATCH_NORM_F32_SH_IMPL_2D(F32) +BATCH_NORM_F32_SH_IMPL_2D(U8) +BATCH_NORM_F32_SH_IMPL_2D(I32) +#define BATCH_NORM_U8_SH_IMPL(TYPE) \ +__kernel void batch_norm_U8to##TYPE \ + ( \ + __read_only image2d_array_t input, \ + __read_only image2d_array_t Mean, \ + __read_only image2d_array_t Variance, \ + __read_only image2d_array_t Gamma, \ + __read_only image2d_array_t Beta, \ + __write_only image2d_array_t output, \ + float eps, \ + float input_scale, \ + float input_tail, \ + float output_scale, \ + float output_zp \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + uint4 data; \ + float4 src, mean, var, gamma, beta; \ + READ_IMAGEUI_2DARRAY(data, input, coord); \ + READ_IMAGEF_2DARRAY(mean, Mean, coord); \ + READ_IMAGEF_2DARRAY(var, Variance, coord); \ + READ_IMAGEF_2DARRAY(gamma, Gamma, coord); \ + READ_IMAGEF_2DARRAY(beta, Beta, coord); \ + \ + src = convert_float4(data) * input_scale - input_tail; \ + src.x = src.x - mean.x; \ + float inv = rsqrt(var.x + eps); \ + src.x = src.x * inv *gamma.x + beta.x; \ + \ + BN_##TYPE##_SAVE \ +} +BATCH_NORM_U8_SH_IMPL(U8) +BATCH_NORM_U8_SH_IMPL(F32) + +#define BATCH_NORM_U8_SH_IMPL_2D(TYPE) \ +__kernel void batch_norm_U8to##TYPE##_2D \ + ( \ + __read_only image2d_t input, \ + __read_only image2d_t Mean, \ + __read_only image2d_t Variance, \ + __read_only image2d_t Gamma, \ + __read_only image2d_t Beta, \ + __write_only image2d_t output, \ + float eps, \ + float input_scale, \ + float input_tail, \ + float output_scale, \ + float output_zp \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + \ + uint4 data = read_imageui(input, coord); \ + float4 mean = read_imagef(Mean, coord); \ + float4 var = read_imagef(Variance, coord); \ + float4 gamma = read_imagef(Gamma, coord); \ + float4 beta = read_imagef(Beta, coord); \ + \ + float4 src = convert_float4(data) * input_scale - input_tail; \ + src.x = src.x - mean.x; \ + float inv = rsqrt(var.x + eps); \ + src.x = src.x * inv *gamma.x + beta.x; \ + \ + BN_##TYPE##_SAVE \ +} +BATCH_NORM_U8_SH_IMPL_2D(U8) +BATCH_NORM_U8_SH_IMPL_2D(F32) + +#define BATCH_NORM_I32_SH_IMPL(TYPE) \ +__kernel void batch_norm_I32to##TYPE \ + ( \ + __read_only image2d_array_t input, \ + __read_only image2d_array_t Mean, \ + __read_only image2d_array_t Variance, \ + __read_only image2d_array_t Gamma, \ + __read_only image2d_array_t Beta, \ + __write_only image2d_array_t output, \ + float eps, \ + float input_scale, \ + float input_tail, \ + float output_scale, \ + float output_zp \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + int4 data; \ + float4 src, mean, var, gamma, beta; \ + READ_IMAGEI_2DARRAY(data, input, coord); \ + READ_IMAGEF_2DARRAY(mean, Mean, coord); \ + READ_IMAGEF_2DARRAY(var, Variance, coord); \ + READ_IMAGEF_2DARRAY(gamma, Gamma, coord); \ + READ_IMAGEF_2DARRAY(beta, Beta, coord); \ + \ + src = convert_float4(data) * input_scale - input_tail; \ + src.x = src.x - mean.x; \ + float inv = rsqrt(var.x + eps); \ + src.x = src.x * inv *gamma.x + beta.x; \ 
+ \ + BN_##TYPE##_SAVE \ +} +BATCH_NORM_I32_SH_IMPL(I32) +BATCH_NORM_I32_SH_IMPL(F32) + +#define BATCH_NORM_I32_SH_IMPL_2D(TYPE) \ +__kernel void batch_norm_I32to##TYPE##_2D \ + ( \ + __read_only image2d_t input, \ + __read_only image2d_t Mean, \ + __read_only image2d_t Variance, \ + __read_only image2d_t Gamma, \ + __read_only image2d_t Beta, \ + __write_only image2d_t output, \ + float eps, \ + float input_scale, \ + float input_tail, \ + float output_scale, \ + float output_zp \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + \ + int4 data = read_imagei(input, coord); \ + float4 mean = read_imagef(Mean, coord); \ + float4 var = read_imagef(Variance, coord); \ + float4 gamma = read_imagef(Gamma, coord); \ + float4 beta = read_imagef(Beta, coord); \ + \ + float4 src = convert_float4(data) * input_scale - input_tail; \ + src.x = src.x - mean.x; \ + float inv = rsqrt(var.x + eps); \ + src.x = src.x * inv *gamma.x + beta.x; \ + \ + BN_##TYPE##_SAVE \ +} +BATCH_NORM_I32_SH_IMPL_2D(I32) +BATCH_NORM_I32_SH_IMPL_2D(F32) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_ops_helper.cl b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_ops_helper.cl index 37f1db8..34668c1 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_ops_helper.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/eltwise_ops_helper.cl @@ -15,20 +15,23 @@ inline uchar* get_image_ptr_from_coord(Image img, int2 coord) inline Image create_image_from_image2d(image2d_t input, int stride_x) { + int stride_y; +#if (USE_40BITS_VA==0) int8 desc; +#else + int8 desc; + _viv_asm(GET_IMAGE_STRIDE, stride_y, input); +#endif _viv_asm(COPY, desc, input, sizeof(desc)); + uint address = as_uint(desc.s0); #if (USE_40BITS_VA==0) - uint address = as_uint(desc.s0); - int stride_y = desc.s1; -#else - ulong address = as_ulong(desc.s05); - int stride_y = desc.s6; + stride_y = desc.s1; #endif Image img = { - .ptr = (uchar*)address, + .ptr = (uchar*)(uintptr_t)address, .stride_x = stride_x, .stride_y = stride_y }; @@ -51,28 +54,23 @@ inline uchar* get_tensor_ptr_from_coord(Tensor t, int4 coord) inline Tensor create_tensor_from_image2d_array(image2d_array_t input, int stride_x) { -#if (USE_40BITS_VA==0) int8 desc; - _viv_asm(COPY, desc, input, sizeof(desc)); - - uint address = as_uint(desc.s0); - int stride_y = desc.s1; - int stride_z = desc.s4; + int2 strides; +#if (USE_40BITS_VA==0) + strides.x = desc.s1; + strides.y = desc.s4; #else - int16 desc; - _viv_asm(COPY, desc, input, sizeof(desc)); - - ulong address = as_ulong(desc.s05); - int stride_y = desc.s6; - int stride_z = desc.sa; + _viv_asm(GET_IMAGE_STRIDE, strides, input); #endif + _viv_asm(COPY, desc, input, sizeof(desc)); + uint address = as_uint(desc.s0); Tensor t = { - .ptr = (uchar*)address, + .ptr = (uchar*)(uintptr_t)address, .stride_x = stride_x, - .stride_y = stride_y, - .stride_z = stride_z + .stride_y = strides.x, + .stride_z = strides.y }; return t; diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis0.cl b/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis0.cl index 6a71a4f..8b4dd55 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis0.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis0.cl @@ -1,4 +1,4 @@ -__kernel void moments_axis0_U8toF16( +__kernel void moments_axis0_U8toF32( __read_only image2d_array_t input, __write_only image2d_t output_mean, __write_only image2d_t output_vari, @@ -29,8 +29,8 @@ __kernel void moments_axis0_U8toF16( tmpSum += (data); tmpSqr += (data * 
data); } - sqr = convert_float(tmpSqr - 2 * input_zp * tmpSum + width * input_zp * input_zp) * e2InScale; - sum = convert_float(tmpSum - width * input_zp) * input_scale; + sqr = convert_float(as_int(tmpSqr - 2 * input_zp * tmpSum + width * input_zp * input_zp)) * e2InScale; + sum = convert_float(as_int(tmpSum - width * input_zp)) * input_scale; } float4 mean, vari; mean.x = sum * dimRatio; @@ -75,7 +75,6 @@ __kernel void moments_axis0_##src0_type_name##to##src0_type_name( \ write_imagef(output_mean, coord_out, mean); \ write_imagef(output_vari, coord_out, vari); \ } -MOMENTS_AXIS0_F(F16) MOMENTS_AXIS0_F(F32) __kernel void moments_axis0_I32toF32( @@ -96,20 +95,21 @@ __kernel void moments_axis0_I32toF32( int gidz = get_global_id(1); int4 coord0 = (int4)(0, gidy, gidz, 0); - int data; - int sum = 0, sqr = 0; + float data; + float sum = 0, sqr = 0; for(coord0.x = 0; coord0.x < width;) { - data = read_imagei(input, coord0).x; + data = convert_float(read_imagei(input, coord0).x); coord0.x++; - sum += (data); - sqr += (data * data); + + sum = sum + data; + sqr = sqr + data * data; } float4 mean, vari; - mean.x = sum * dimRatio; - vari.x = sqr * dimRatio; + mean.x = sum * dimRatio * input_scale; + vari.x = sqr * dimRatio * input_scale * input_scale; vari.x = vari.x - mean.x * mean.x; int2 coord_out = (int2)(gidy, gidz); diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis01.cl b/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis01.cl index f7c64cf..a89ec8a 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis01.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis01.cl @@ -1,4 +1,4 @@ -__kernel void moments_axis01_U8toF16( +__kernel void moments_axis01_U8toF32( image2d_array_t input, image2d_t output_mean, image2d_t output_vari, int axis, int axis_num, int input_zp, float input_scale, int width, int height, int chn, float dimRatio @@ -112,7 +112,6 @@ __kernel void moments_axis01_##src0_type_name##to##src0_type_name( \ write_imagef(output_vari, coord_out, vari); \ } \ } -MOMENTS_AXIS01_F(F16) MOMENTS_AXIS01_F(F32) __kernel void moments_axis01_I32toF32( @@ -126,7 +125,7 @@ __kernel void moments_axis01_I32toF32( int lidx = get_local_id(0); int4 coord = (int4)(gidx, 0, gidz, 0); - int4 data; + float4 data; float sum = 0, sqr = 0; float e2InScale = input_scale * input_scale; @@ -135,13 +134,14 @@ __kernel void moments_axis01_I32toF32( for(coord.x = gidx; coord.x < width; coord.x += 16) { - int tmpSum = 0, tmpSqr = 0; + float tmpSum = 0, tmpSqr = 0; for(coord.y = 0; coord.y < height;) { - data = read_imagei(input, coord); + data = convert_float4(read_imagei(input, coord)); coord.y++; - tmpSum += data.x; - tmpSqr += data.x * data.x; + + tmpSum = tmpSum + data.x; + tmpSqr = tmpSqr + data.x * data.x; } sqr += (tmpSqr - 2 * input_zp * tmpSum + height * input_zp * input_zp) * e2InScale; sum += (tmpSum - height * input_zp) * input_scale; diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis012.cl b/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis012.cl index 28a4fc3..fa0ce44 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis012.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis012.cl @@ -1,4 +1,4 @@ -__kernel void moments_axis012_U8toF16( +__kernel void moments_axis012_U8toF32( image2d_array_t input, image2d_t output_mean, image2d_t output_vari, int axis, int axis_num, int input_zp, float input_scale, int width, int height, int chn, float dimRatio @@ -116,7 +116,6 @@ __kernel void moments_axis012_##src0_type_name##to##src0_type_name( 
\ write_imagef(output_vari, coord_out, vari); \ } \ } -MOMENTS_AXIS012_F(F16) MOMENTS_AXIS012_F(F32) __kernel void moments_axis012_I32toF32( @@ -145,8 +144,8 @@ __kernel void moments_axis012_I32toF32( { data = read_imagei(input, coord); coord.y++; - tmpSum += data.x; - tmpSqr += data.x * data.x; + tmpSum = tmpSum + data.x; + tmpSqr = tmpSqr + data.x * data.x; } sqr += (tmpSqr - 2 * input_zp * tmpSum + height * input_zp * input_zp) * e2InScale; sum += (tmpSum - height * input_zp) * input_scale; diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis1.cl b/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis1.cl index 9ba0dc4..a18bdc2 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis1.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis1.cl @@ -1,4 +1,4 @@ -__kernel void moments_axis1_U8toF16( +__kernel void moments_axis1_U8toF32( __read_only image2d_array_t input, __write_only image2d_t output_mean, __write_only image2d_t output_vari, @@ -23,8 +23,8 @@ __kernel void moments_axis1_U8toF16( tmpSum += (data); tmpSqr += (data * data); } - sqr = convert_float(tmpSqr - 2 * input_zp * tmpSum + height * input_zp * input_zp) * e2InScale; - sum = convert_float(tmpSum - height * input_zp) * input_scale; + sqr = convert_float(as_int(tmpSqr - 2 * input_zp * tmpSum + height * input_zp * input_zp)) * e2InScale; + sum = convert_float(as_int(tmpSum - height * input_zp)) * input_scale; } float4 mean, vari; @@ -70,7 +70,6 @@ __kernel void moments_axis1_##src0_type_name##to##src0_type_name( \ write_imagef(output_mean, coord_out, mean); \ write_imagef(output_vari, coord_out, vari); \ } -MOMENTS_AXIS1_F(F16) MOMENTS_AXIS1_F(F32) __kernel void moments_axis1_I32toF32( @@ -91,20 +90,20 @@ __kernel void moments_axis1_I32toF32( int gidz = get_global_id(1); int4 coord0 = (int4)(gidx, 0, gidz, 0); - int data; - int sum = 0, sqr = 0; + float data; + float sum = 0, sqr = 0; for(coord0.y = 0; coord0.y < height;) { - data = read_imagei(input, coord0).x; + data = convert_float(read_imagei(input, coord0).x); coord0.y++; - sum += (data); - sqr += (data * data); + sum = sum + data; + sqr = sqr + data * data; } float4 mean, vari; - mean.x = sum * dimRatio; - vari.x = sqr * dimRatio; + mean.x = sum * dimRatio * input_scale; + vari.x = sqr * dimRatio * input_scale * input_scale; vari.x = vari.x - mean.x * mean.x; int2 coord_out = (int2)(gidx, gidz); diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis2.cl b/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis2.cl index e15d25a..078cf74 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis2.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/moments_axis2.cl @@ -1,4 +1,4 @@ -__kernel void moments_axis2_U8toF16( +__kernel void moments_axis2_U8toF32( __read_only image2d_array_t input, __write_only image2d_t output_mean, __write_only image2d_t output_vari, @@ -29,12 +29,12 @@ __kernel void moments_axis2_U8toF16( tmpSum += (data); tmpSqr += (data * data); } - sqr = (tmpSqr - 2 * input_zp * tmpSum + chn * input_zp * input_zp) * e2InScale; - sum = (tmpSum - chn * input_zp) * input_scale; + sqr = as_int(tmpSqr - 2 * input_zp * tmpSum + chn * input_zp * input_zp) * e2InScale; + sum = tmpSum * input_scale; } float4 mean, vari; - mean.x = sum * dimRatio; + mean.x = sum * dimRatio - input_zp * input_scale; vari.x = sqr * dimRatio; vari.x = vari.x - mean.x * mean.x; @@ -82,7 +82,6 @@ __kernel void moments_axis2_##src0_type_name##to##src0_type_name( \ write_imagef(output_mean, coord_out, mean); \ write_imagef(output_vari, coord_out, 
vari); \ } -MOMENTS_AXIS2_F(F16) MOMENTS_AXIS2_F(F32) __kernel void moments_axis2_I32toF32( @@ -103,20 +102,22 @@ __kernel void moments_axis2_I32toF32( int gidy = get_global_id(1); int4 coord0 = (int4)(gidx, gidy, 0, 0); - int data; - int sum = 0, sqr = 0; + float data; + float sum = 0, sqr = 0; for(coord0.z = 0; coord0.z < chn;) { - data = read_imagei(input, coord0).x; + data = convert_float(read_imagei(input, coord0).x); coord0.z++; - sum += (data); - sqr += (data * data); + + + sum = sum + data; + sqr = sqr + data * data; } float4 mean, vari; - mean.x = sum * dimRatio; - vari.x = sqr * dimRatio; + mean.x = sum * dimRatio * input_scale; + vari.x = sqr * dimRatio * input_scale * input_scale; vari.x = vari.x - mean.x * mean.x; int2 coord_out = (int2)(gidx, gidy); diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/select.cl b/src/tim/vx/internal/src/libnnext/ops/cl/select.cl index ab39f63..fcf97fe 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/select.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/select.cl @@ -13,8 +13,8 @@ __kernel void select_I8_U8_U8toU8( uint4 src0, src1, src, dst; float inputScale, inputTail; READ_IMAGEI_2DARRAY(value, condition, coord); - READ_IMAGEF_2DARRAY(src0, input0, coord); - READ_IMAGEF_2DARRAY(src1, input1, coord); + READ_IMAGEUI_2DARRAY(src0, input0, coord); + READ_IMAGEUI_2DARRAY(src1, input1, coord); src = (value != 0 ? src0 : src1); inputScale = (value.x != 0 ? input0Scale : input1Scale); inputTail = (value.x != 0 ? input0Tail : input1Tail); diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_u8_f16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_u8_f16.vx index e062f9f..a76cb4f 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_u8_f16.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/layer_normalization_u8_f16.vx @@ -3,7 +3,6 @@ /*****************************layernorm uint8 to fp16****************************/ _viv_uniform int width; _viv_uniform float dimRatio; -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4; _viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4; _viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4; _viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4; @@ -16,7 +15,6 @@ _viv_uniform int sumInZp; _viv_uniform int tmpZp1; _viv_uniform int tmpZp2; _viv_uniform float e2InScale; -_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4; _viv_uniform VXC_512Bits UniPackFP16even_2x8; __kernel void layer_norm_U8toF16( @@ -73,22 +71,15 @@ __kernel void layer_norm_U8toF16( { VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); coord_bias.x = coord.x; - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertSecFp16Fp32_4x4); + + scale_f0 = read_imagef(scale, coord_bias); bias_f0 = read_imagef(bias, coord_bias); coord_bias.x += 4; + scale_f1 = read_imagef(scale, coord_bias); bias_f1 = read_imagef(bias, coord_bias); coord_bias.x += 4; - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, scale_h, src1, 16); VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ uniConvert1stUint8SubZpToFp32_4x4); VXC_DP4x4(tmpData1, 
src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ @@ -105,17 +96,18 @@ __kernel void layer_norm_U8toF16( vxc_float4 norm; tmpData0 -= mean; norm = scale_f0 * vari * tmpData0 + bias_f0; + + scale_f0 = read_imagef(scale, coord_bias); bias_f0 = read_imagef(bias, coord_bias); - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); coord_bias.x += 4; _viv_asm(CONV, tmpVal0, norm); tmpData1 -= mean; norm = scale_f1 * vari * tmpData1 + bias_f1; + + scale_f1 = read_imagef(scale, coord_bias); bias_f1 = read_imagef(bias, coord_bias); - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertSecFp16Fp32_4x4); + _viv_asm(CONV, tmpVal1, norm); VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ UniPackFP16even_2x8); @@ -187,21 +179,14 @@ __kernel void layer_norm_U8toF16_2D( coord_bias.x = coord.x; VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, scale_h, src1, 16); - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertSecFp16Fp32_4x4); + + scale_f0 = read_imagef(scale, coord_bias); bias_f0 = read_imagef(bias, coord_bias); coord_bias.x += 4; + scale_f1 = read_imagef(scale, coord_bias); bias_f1 = read_imagef(bias, coord_bias); coord_bias.x += 4; - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0),\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); - _viv_asm(COPY, scale_h, src1, 16); VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ uniConvert1stUint8SubZpToFp32_4x4); VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ @@ -218,17 +203,19 @@ __kernel void layer_norm_U8toF16_2D( vxc_float4 norm; tmpData0 -= mean; norm = scale_f0 * vari * tmpData0 + bias_f0; + + scale_f0 = read_imagef(scale, coord_bias); bias_f0 = read_imagef(bias, coord_bias); - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - UniFP16toFP32Lo4_dp4x4); + coord_bias.x += 4; _viv_asm(CONV, tmpVal0, norm); tmpData1 -= mean; norm = scale_f1 * vari * tmpData1 + bias_f1; + + scale_f1 = read_imagef(scale, coord_bias); bias_f1 = read_imagef(bias, coord_bias); - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ - uniConvertSecFp16Fp32_4x4); + _viv_asm(CONV, tmpVal1, norm); VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ UniPackFP16even_2x8); diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_header.vx b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_header.vx index 7540ae6..99ac9fb 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_header.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/vsi_nn_kernel_header.vx @@ -23,20 +23,23 @@ inline uchar* get_image_ptr_from_coord(Image img, int2 coord) inline Image create_image_from_image2d(image2d_t input, int stride_x) { + int stride_y; +#if (USE_40BITS_VA==0) int8 desc; +#else + int8 desc; + _viv_asm(GET_IMAGE_STRIDE, stride_y, input); +#endif _viv_asm(COPY, desc, input, sizeof(desc)); + uint address = as_uint(desc.s0); #if (USE_40BITS_VA==0) - uint address = as_uint(desc.s0); - int stride_y = desc.s1; -#else - ulong address = 
as_ulong(desc.s05); - int stride_y = desc.s6; + stride_y = desc.s1; #endif Image img = { - .ptr = (uchar*)address, + .ptr = (uchar*)(uintptr_t)address, .stride_x = stride_x, .stride_y = stride_y }; @@ -59,28 +62,23 @@ inline uchar* get_tensor_ptr_from_coord(Tensor t, int4 coord) inline Tensor create_tensor_from_image2d_array(image2d_array_t input, int stride_x) { -#if (USE_40BITS_VA==0) int8 desc; - _viv_asm(COPY, desc, input, sizeof(desc)); - - uint address = as_uint(desc.s0); - int stride_y = desc.s1; - int stride_z = desc.s4; + int2 strides; +#if (USE_40BITS_VA==0) + strides.x = desc.s1; + strides.y = desc.s4; #else - int16 desc; - _viv_asm(COPY, desc, input, sizeof(desc)); - - ulong address = as_ulong(desc.s05); - int stride_y = desc.s6; - int stride_z = desc.sa; + _viv_asm(GET_IMAGE_STRIDE, strides, input); #endif + _viv_asm(COPY, desc, input, sizeof(desc)); + uint address = as_uint(desc.s0); Tensor t = { - .ptr = (uchar*)address, + .ptr = (uchar*)(uintptr_t)address, .stride_x = stride_x, - .stride_y = stride_y, - .stride_z = stride_z + .stride_y = strides.x, + .stride_z = strides.y }; return t; diff --git a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c index 0b450f0..06624a5 100644 --- a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c +++ b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c @@ -12987,7 +12987,6 @@ static const char layer_normalization_u8_f16_vx[] = "#include \"cl_viv_vx_ext.h\ /*****************************layernorm uint8 to fp16****************************/\n\ _viv_uniform int width;\n\ _viv_uniform float dimRatio;\n\ -_viv_uniform VXC_512Bits UniFP16toFP32Lo4_dp4x4;\n\ _viv_uniform VXC_512Bits uniConvert1stUint8SubZpToFp32_4x4;\n\ _viv_uniform VXC_512Bits uniConvert2ndUint8SubZpToFp32_4x4;\n\ _viv_uniform VXC_512Bits uniConvert3rdUint8SubZpToFp32_4x4;\n\ @@ -13000,7 +12999,6 @@ _viv_uniform int sumInZp;\n\ _viv_uniform int tmpZp1;\n\ _viv_uniform int tmpZp2;\n\ _viv_uniform float e2InScale;\n\ -_viv_uniform VXC_512Bits uniConvertSecFp16Fp32_4x4;\n\ _viv_uniform VXC_512Bits UniPackFP16even_2x8;\n\ \n\ __kernel void layer_norm_U8toF16(\n\ @@ -13057,22 +13055,15 @@ __kernel void layer_norm_U8toF16(\n\ {\n\ VXC_OP4(img_load_3d, src0, input, coord, VXC_5BITOFFSET_XY(0, 0), \\\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ coord_bias.x = coord.x;\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertSecFp16Fp32_4x4);\n\ +\n\ + scale_f0 = read_imagef(scale, coord_bias);\n\ bias_f0 = read_imagef(bias, coord_bias);\n\ coord_bias.x += 4;\n\ + scale_f1 = read_imagef(scale, coord_bias);\n\ bias_f1 = read_imagef(bias, coord_bias);\n\ coord_bias.x += 4;\n\ \n\ - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ uniConvert1stUint8SubZpToFp32_4x4);\n\ VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ @@ -13089,17 +13080,18 @@ __kernel void layer_norm_U8toF16(\n\ vxc_float4 norm;\n\ tmpData0 -= mean;\n\ norm = scale_f0 * vari * tmpData0 + 
bias_f0;\n\ +\n\ + scale_f0 = read_imagef(scale, coord_bias);\n\ bias_f0 = read_imagef(bias, coord_bias);\n\ - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ coord_bias.x += 4;\n\ _viv_asm(CONV, tmpVal0, norm);\n\ \n\ tmpData1 -= mean;\n\ norm = scale_f1 * vari * tmpData1 + bias_f1;\n\ +\n\ + scale_f1 = read_imagef(scale, coord_bias);\n\ bias_f1 = read_imagef(bias, coord_bias);\n\ - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertSecFp16Fp32_4x4);\n\ +\n\ _viv_asm(CONV, tmpVal1, norm);\n\ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ UniPackFP16even_2x8);\n\ @@ -13171,21 +13163,14 @@ __kernel void layer_norm_U8toF16_2D(\n\ coord_bias.x = coord.x;\n\ VXC_ReadImage(src0, input, coord.xy, VXC_5BITOFFSET_XY(0, 0),\\\n\ VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0));\n\ - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(0, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertSecFp16Fp32_4x4);\n\ +\n\ + scale_f0 = read_imagef(scale, coord_bias);\n\ bias_f0 = read_imagef(bias, coord_bias);\n\ coord_bias.x += 4;\n\ + scale_f1 = read_imagef(scale, coord_bias);\n\ bias_f1 = read_imagef(bias, coord_bias);\n\ coord_bias.x += 4;\n\ \n\ - VXC_ReadImage(src1, scale, coord.xw, VXC_5BITOFFSET_XY(8, 0),\\\n\ - VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ - _viv_asm(COPY, scale_h, src1, 16);\n\ VXC_DP4x4(tmpData0, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ uniConvert1stUint8SubZpToFp32_4x4);\n\ VXC_DP4x4(tmpData1, src0, zp, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ @@ -13202,17 +13187,19 @@ __kernel void layer_norm_U8toF16_2D(\n\ vxc_float4 norm;\n\ tmpData0 -= mean;\n\ norm = scale_f0 * vari * tmpData0 + bias_f0;\n\ +\n\ + scale_f0 = read_imagef(scale, coord_bias);\n\ bias_f0 = read_imagef(bias, coord_bias);\n\ - VXC_DP4x4(scale_f0, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - UniFP16toFP32Lo4_dp4x4);\n\ +\n\ coord_bias.x += 4;\n\ _viv_asm(CONV, tmpVal0, norm);\n\ \n\ tmpData1 -= mean;\n\ norm = scale_f1 * vari * tmpData1 + bias_f1;\n\ +\n\ + scale_f1 = read_imagef(scale, coord_bias);\n\ bias_f1 = read_imagef(bias, coord_bias);\n\ - VXC_DP4x4(scale_f1, scale_h, scale_h, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniConvertSecFp16Fp32_4x4);\n\ +\n\ _viv_asm(CONV, tmpVal1, norm);\n\ VXC_DP2x8(dst, tmpVal0, tmpVal1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ UniPackFP16even_2x8);\n\ @@ -41416,20 +41403,23 @@ inline uchar* get_image_ptr_from_coord(Image img, int2 coord)\n\ \n\ inline Image create_image_from_image2d(image2d_t input, int stride_x)\n\ {\n\ + int stride_y;\n\ +#if (USE_40BITS_VA==0)\n\ int8 desc;\n\ +#else\n\ + int8 desc;\n\ + _viv_asm(GET_IMAGE_STRIDE, stride_y, input);\n\ +#endif\n\ _viv_asm(COPY, desc, input, sizeof(desc));\n\ + uint address = as_uint(desc.s0);\n\ \n\ #if (USE_40BITS_VA==0)\n\ - uint address = as_uint(desc.s0);\n\ - int stride_y = desc.s1;\n\ -#else\n\ - ulong address = as_ulong(desc.s05);\n\ - int stride_y = desc.s6;\n\ + stride_y = desc.s1;\n\ #endif\n\ \n\ Image img =\n\ {\n\ - .ptr = (uchar*)address,\n\ + .ptr = (uchar*)(uintptr_t)address,\n\ .stride_x = stride_x,\n\ .stride_y = 
stride_y\n\ };\n\ @@ -41452,28 +41442,23 @@ inline uchar* get_tensor_ptr_from_coord(Tensor t, int4 coord)\n\ \n\ inline Tensor create_tensor_from_image2d_array(image2d_array_t input, int stride_x)\n\ {\n\ -#if (USE_40BITS_VA==0)\n\ int8 desc;\n\ - _viv_asm(COPY, desc, input, sizeof(desc));\n\ -\n\ - uint address = as_uint(desc.s0);\n\ - int stride_y = desc.s1;\n\ - int stride_z = desc.s4;\n\ + int2 strides;\n\ +#if (USE_40BITS_VA==0)\n\ + strides.x = desc.s1;\n\ + strides.y = desc.s4;\n\ #else\n\ - int16 desc;\n\ - _viv_asm(COPY, desc, input, sizeof(desc));\n\ -\n\ - ulong address = as_ulong(desc.s05);\n\ - int stride_y = desc.s6;\n\ - int stride_z = desc.sa;\n\ + _viv_asm(GET_IMAGE_STRIDE, strides, input);\n\ #endif\n\ + _viv_asm(COPY, desc, input, sizeof(desc));\n\ + uint address = as_uint(desc.s0);\n\ \n\ Tensor t =\n\ {\n\ - .ptr = (uchar*)address,\n\ + .ptr = (uchar*)(uintptr_t)address,\n\ .stride_x = stride_x,\n\ - .stride_y = stride_y,\n\ - .stride_z = stride_z\n\ + .stride_y = strides.x,\n\ + .stride_z = strides.y\n\ };\n\ \n\ return t;\n\ @@ -42755,145 +42740,225 @@ __kernel void argmin_axis2_I32toI32_2D\n\ \n\ "; /* end of argmin_axis2_cl*/ -static const char batchnorm_single_cl[] = "\n\ -#define READ_IMAGEF_ARRAY2D(dest, tensor, coord) \\\n\ - do { \\\n\ - int depth = get_image_array_size(tensor); \\\n\ - _viv_asm(CLAMP0MAX, coord_in0.z, coord_in0.z, in0_depth - 1); \\\n\ - dest = read_imagef(tensor, coord); \\\n\ - } while(0)\n\ -__kernel void batch_norm_F32toF32\n\ - (\n\ - __read_only image2d_array_t input,\n\ - __read_only image2d_array_t Mean,\n\ - __read_only image2d_array_t Variance,\n\ - __read_only image2d_array_t Gamma,\n\ - __read_only image2d_array_t Beta,\n\ - __write_only image2d_array_t output,\n\ - float eps,\n\ - float input_scale,\n\ - float input_tail,\n\ - float output_scale,\n\ - float output_zp\n\ - )\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ -\n\ - float4 src, mean, var, gamma, beta;\n\ - READ_IMAGEF_2DARRAY(src, input, coord);\n\ - READ_IMAGEF_2DARRAY(mean, Mean, coord);\n\ - READ_IMAGEF_2DARRAY(var, Variance, coord);\n\ - READ_IMAGEF_2DARRAY(gamma, Gamma, coord);\n\ - READ_IMAGEF_2DARRAY(beta, Beta, coord);\n\ -\n\ - float4 dst;\n\ - src.x = src.x - mean.x;\n\ - float inv = rsqrt(var.x + eps);\n\ - dst.x = src.x * inv *gamma.x + beta.x;\n\ -\n\ - write_imagef(output, coord, dst);\n\ -}\n\ -\n\ -__kernel void batch_norm_F32toF32_2D\n\ - (\n\ - __read_only image2d_t input,\n\ - __read_only image2d_t Mean,\n\ - __read_only image2d_t Variance,\n\ - __read_only image2d_t Gamma,\n\ - __read_only image2d_t Beta,\n\ - __write_only image2d_t output,\n\ - float eps,\n\ - float input_scale,\n\ - float input_tail,\n\ - float output_scale,\n\ - float output_zp\n\ - )\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ -\n\ - float4 src = read_imagef(input, coord);\n\ - float4 mean = read_imagef(Mean, coord);\n\ - float4 var = read_imagef(Variance, coord);\n\ - float4 gamma = read_imagef(Gamma, coord);\n\ - float4 beta = read_imagef(Beta, coord);\n\ -\n\ - float4 dst = 0;\n\ - src.x = src.x - mean.x;\n\ - float inv = rsqrt(var.x + eps);\n\ - dst.x = src.x * inv *gamma.x + beta.x;\n\ -\n\ - write_imagef(output, coord, dst);\n\ -}\n\ -\n\ -__kernel void batch_norm_U8toU8\n\ - (\n\ - __read_only image2d_array_t input,\n\ - __read_only image2d_array_t Mean,\n\ - __read_only image2d_array_t Variance,\n\ - __read_only image2d_array_t Gamma,\n\ - __read_only image2d_array_t Beta,\n\ - __write_only 
image2d_array_t output,\n\ - float eps,\n\ - float input_scale,\n\ - float input_tail,\n\ - float output_scale,\n\ - float output_zp\n\ - )\n\ -{\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ -\n\ - uint4 data;\n\ - float4 src, mean, var, gamma, beta;\n\ - READ_IMAGEF_2DARRAY(data, input, coord);\n\ - READ_IMAGEF_2DARRAY(mean, Mean, coord);\n\ - READ_IMAGEF_2DARRAY(var, Variance, coord);\n\ - READ_IMAGEF_2DARRAY(gamma, Gamma, coord);\n\ - READ_IMAGEF_2DARRAY(beta, Beta, coord);\n\ -\n\ - src = convert_float4(data) * input_scale - input_tail;\n\ - src.x = src.x - mean.x;\n\ - float inv = rsqrt(var.x + eps);\n\ - src.x = src.x * inv *gamma.x + beta.x;\n\ -\n\ - uint4 dst = convert_uint4(src * output_scale + output_zp);\n\ -\n\ +static const char batchnorm_single_cl[] = "#define BN_U8_SAVE \\\n\ + uint4 dst = convert_uint4(src * output_scale + output_zp); \\\n\ write_imageui(output, coord, dst);\n\ +\n\ +#define BN_I32_SAVE \\\n\ + int4 dst = convert_int4(src * output_scale + output_zp); \\\n\ + write_imagei(output, coord, dst);\n\ +\n\ +#define BN_F32_SAVE \\\n\ + write_imagef(output, coord, src);\n\ +\n\ +#define BATCH_NORM_F32_SH_IMPL(TYPE) \\\n\ +__kernel void batch_norm_F32to##TYPE \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_array_t Mean, \\\n\ + __read_only image2d_array_t Variance, \\\n\ + __read_only image2d_array_t Gamma, \\\n\ + __read_only image2d_array_t Beta, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps, \\\n\ + float input_scale, \\\n\ + float input_tail, \\\n\ + float output_scale, \\\n\ + float output_zp \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + float4 src, mean, var, gamma, beta; \\\n\ + READ_IMAGEF_2DARRAY(src, input, coord); \\\n\ + READ_IMAGEF_2DARRAY(mean, Mean, coord); \\\n\ + READ_IMAGEF_2DARRAY(var, Variance, coord); \\\n\ + READ_IMAGEF_2DARRAY(gamma, Gamma, coord); \\\n\ + READ_IMAGEF_2DARRAY(beta, Beta, coord); \\\n\ + \\\n\ + src.x = src.x - mean.x; \\\n\ + float inv = rsqrt(var.x + eps); \\\n\ + src.x = src.x * inv *gamma.x + beta.x; \\\n\ + \\\n\ + BN_##TYPE##_SAVE \\\n\ }\n\ +BATCH_NORM_F32_SH_IMPL(F32)\n\ +BATCH_NORM_F32_SH_IMPL(U8)\n\ +BATCH_NORM_F32_SH_IMPL(I32)\n\ \n\ -__kernel void batch_norm_U8toU8_2D\n\ - (\n\ - __read_only image2d_t input,\n\ - __read_only image2d_t Mean,\n\ - __read_only image2d_t Variance,\n\ - __read_only image2d_t Gamma,\n\ - __read_only image2d_t Beta,\n\ - __write_only image2d_t output,\n\ - float eps,\n\ - float input_scale,\n\ - float input_tail,\n\ - float output_scale,\n\ - float output_zp\n\ - )\n\ -{\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ -\n\ - uint4 data = read_imageui(input, coord);\n\ - float4 mean = read_imagef(Mean, coord);\n\ - float4 var = read_imagef(Variance, coord);\n\ - float4 gamma = read_imagef(Gamma, coord);\n\ - float4 beta = read_imagef(Beta, coord);\n\ -\n\ - float4 src = convert_float4(data) * input_scale - input_tail;\n\ - src.x = src.x - mean.x;\n\ - float inv = rsqrt(var.x + eps);\n\ - src.x = src.x * inv *gamma.x + beta.x;\n\ -\n\ - uint4 dst = convert_uint4(src * output_scale + output_zp);\n\ -\n\ - write_imageui(output, coord, dst);\n\ +#define BATCH_NORM_F32_SH_IMPL_2D(TYPE) \\\n\ +__kernel void batch_norm_F32to##TYPE##_2D \\\n\ + ( \\\n\ + __read_only image2d_t input, \\\n\ + __read_only image2d_t Mean, \\\n\ + __read_only image2d_t Variance, \\\n\ + __read_only image2d_t Gamma, \\\n\ + __read_only 
image2d_t Beta, \\\n\ + __write_only image2d_t output, \\\n\ + float eps, \\\n\ + float input_scale, \\\n\ + float input_tail, \\\n\ + float output_scale, \\\n\ + float output_zp \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + float4 src = read_imagef(input, coord); \\\n\ + float4 mean = read_imagef(Mean, coord); \\\n\ + float4 var = read_imagef(Variance, coord); \\\n\ + float4 gamma = read_imagef(Gamma, coord); \\\n\ + float4 beta = read_imagef(Beta, coord); \\\n\ + \\\n\ + src.x = src.x - mean.x; \\\n\ + float inv = rsqrt(var.x + eps); \\\n\ + src.x = src.x * inv *gamma.x + beta.x; \\\n\ + \\\n\ + BN_##TYPE##_SAVE \\\n\ }\n\ +BATCH_NORM_F32_SH_IMPL_2D(F32)\n\ +BATCH_NORM_F32_SH_IMPL_2D(U8)\n\ +BATCH_NORM_F32_SH_IMPL_2D(I32)\n\ \n\ -"; /* end of batchnorm_single_cl*/ +#define BATCH_NORM_U8_SH_IMPL(TYPE) \\\n\ +__kernel void batch_norm_U8to##TYPE \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_array_t Mean, \\\n\ + __read_only image2d_array_t Variance, \\\n\ + __read_only image2d_array_t Gamma, \\\n\ + __read_only image2d_array_t Beta, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps, \\\n\ + float input_scale, \\\n\ + float input_tail, \\\n\ + float output_scale, \\\n\ + float output_zp \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + uint4 data; \\\n\ + float4 src, mean, var, gamma, beta; \\\n\ + READ_IMAGEUI_2DARRAY(data, input, coord); \\\n\ + READ_IMAGEF_2DARRAY(mean, Mean, coord); \\\n\ + READ_IMAGEF_2DARRAY(var, Variance, coord); \\\n\ + READ_IMAGEF_2DARRAY(gamma, Gamma, coord); \\\n\ + READ_IMAGEF_2DARRAY(beta, Beta, coord); \\\n\ + \\\n\ + src = convert_float4(data) * input_scale - input_tail; \\\n\ + src.x = src.x - mean.x; \\\n\ + float inv = rsqrt(var.x + eps); \\\n\ + src.x = src.x * inv *gamma.x + beta.x; \\\n\ + \\\n\ + BN_##TYPE##_SAVE \\\n\ +}\n\ +BATCH_NORM_U8_SH_IMPL(U8)\n\ +BATCH_NORM_U8_SH_IMPL(F32)\n\ +\n\ +#define BATCH_NORM_U8_SH_IMPL_2D(TYPE) \\\n\ +__kernel void batch_norm_U8to##TYPE##_2D \\\n\ + ( \\\n\ + __read_only image2d_t input, \\\n\ + __read_only image2d_t Mean, \\\n\ + __read_only image2d_t Variance, \\\n\ + __read_only image2d_t Gamma, \\\n\ + __read_only image2d_t Beta, \\\n\ + __write_only image2d_t output, \\\n\ + float eps, \\\n\ + float input_scale, \\\n\ + float input_tail, \\\n\ + float output_scale, \\\n\ + float output_zp \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + uint4 data = read_imageui(input, coord); \\\n\ + float4 mean = read_imagef(Mean, coord); \\\n\ + float4 var = read_imagef(Variance, coord); \\\n\ + float4 gamma = read_imagef(Gamma, coord); \\\n\ + float4 beta = read_imagef(Beta, coord); \\\n\ + \\\n\ + float4 src = convert_float4(data) * input_scale - input_tail; \\\n\ + src.x = src.x - mean.x; \\\n\ + float inv = rsqrt(var.x + eps); \\\n\ + src.x = src.x * inv *gamma.x + beta.x; \\\n\ + \\\n\ + BN_##TYPE##_SAVE \\\n\ +}\n\ +BATCH_NORM_U8_SH_IMPL_2D(U8)\n\ +BATCH_NORM_U8_SH_IMPL_2D(F32)\n\ +\n\ +#define BATCH_NORM_I32_SH_IMPL(TYPE) \\\n\ +__kernel void batch_norm_I32to##TYPE \\\n\ + ( \\\n\ + __read_only image2d_array_t input, \\\n\ + __read_only image2d_array_t Mean, \\\n\ + __read_only image2d_array_t Variance, \\\n\ + __read_only image2d_array_t Gamma, \\\n\ + __read_only image2d_array_t Beta, \\\n\ + __write_only image2d_array_t output, \\\n\ + float eps, \\\n\ + float input_scale, \\\n\ + float input_tail, 
\\\n\ + float output_scale, \\\n\ + float output_zp \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + int4 data; \\\n\ + float4 src, mean, var, gamma, beta; \\\n\ + READ_IMAGEI_2DARRAY(data, input, coord); \\\n\ + READ_IMAGEF_2DARRAY(mean, Mean, coord); \\\n\ + READ_IMAGEF_2DARRAY(var, Variance, coord); \\\n\ + READ_IMAGEF_2DARRAY(gamma, Gamma, coord); \\\n\ + READ_IMAGEF_2DARRAY(beta, Beta, coord); \\\n\ + \\\n\ + src = convert_float4(data) * input_scale - input_tail; \\\n\ + src.x = src.x - mean.x; \\\n\ + float inv = rsqrt(var.x + eps); \\\n\ + src.x = src.x * inv *gamma.x + beta.x; \\\n\ + \\\n\ + BN_##TYPE##_SAVE \\\n\ +}\n\ +BATCH_NORM_I32_SH_IMPL(I32)\n\ +BATCH_NORM_I32_SH_IMPL(F32)\n\ +\n\ +#define BATCH_NORM_I32_SH_IMPL_2D(TYPE) \\\n\ +__kernel void batch_norm_I32to##TYPE##_2D \\\n\ + ( \\\n\ + __read_only image2d_t input, \\\n\ + __read_only image2d_t Mean, \\\n\ + __read_only image2d_t Variance, \\\n\ + __read_only image2d_t Gamma, \\\n\ + __read_only image2d_t Beta, \\\n\ + __write_only image2d_t output, \\\n\ + float eps, \\\n\ + float input_scale, \\\n\ + float input_tail, \\\n\ + float output_scale, \\\n\ + float output_zp \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + int4 data = read_imagei(input, coord); \\\n\ + float4 mean = read_imagef(Mean, coord); \\\n\ + float4 var = read_imagef(Variance, coord); \\\n\ + float4 gamma = read_imagef(Gamma, coord); \\\n\ + float4 beta = read_imagef(Beta, coord); \\\n\ + \\\n\ + float4 src = convert_float4(data) * input_scale - input_tail; \\\n\ + src.x = src.x - mean.x; \\\n\ + float inv = rsqrt(var.x + eps); \\\n\ + src.x = src.x * inv *gamma.x + beta.x; \\\n\ + \\\n\ + BN_##TYPE##_SAVE \\\n\ +}\n\ +BATCH_NORM_I32_SH_IMPL_2D(I32)\n\ +BATCH_NORM_I32_SH_IMPL_2D(F32)"; /* end of batchnorm_single_cl*/ static const char cast_cl[] = "\n\ #define CAST_FUN(src_name, dst_name, src_type, dst_type, conv_fun, read_fun, write_fun) \\\n\ @@ -43235,20 +43300,23 @@ inline uchar* get_image_ptr_from_coord(Image img, int2 coord)\n\ \n\ inline Image create_image_from_image2d(image2d_t input, int stride_x)\n\ {\n\ + int stride_y;\n\ +#if (USE_40BITS_VA==0)\n\ int8 desc;\n\ +#else\n\ + int8 desc;\n\ + _viv_asm(GET_IMAGE_STRIDE, stride_y, input);\n\ +#endif\n\ _viv_asm(COPY, desc, input, sizeof(desc));\n\ + uint address = as_uint(desc.s0);\n\ \n\ #if (USE_40BITS_VA==0)\n\ - uint address = as_uint(desc.s0);\n\ - int stride_y = desc.s1;\n\ -#else\n\ - ulong address = as_ulong(desc.s05);\n\ - int stride_y = desc.s6;\n\ + stride_y = desc.s1;\n\ #endif\n\ \n\ Image img =\n\ {\n\ - .ptr = (uchar*)address,\n\ + .ptr = (uchar*)(uintptr_t)address,\n\ .stride_x = stride_x,\n\ .stride_y = stride_y\n\ };\n\ @@ -43271,28 +43339,23 @@ inline uchar* get_tensor_ptr_from_coord(Tensor t, int4 coord)\n\ \n\ inline Tensor create_tensor_from_image2d_array(image2d_array_t input, int stride_x)\n\ {\n\ -#if (USE_40BITS_VA==0)\n\ int8 desc;\n\ - _viv_asm(COPY, desc, input, sizeof(desc));\n\ -\n\ - uint address = as_uint(desc.s0);\n\ - int stride_y = desc.s1;\n\ - int stride_z = desc.s4;\n\ + int2 strides;\n\ +#if (USE_40BITS_VA==0)\n\ + strides.x = desc.s1;\n\ + strides.y = desc.s4;\n\ #else\n\ - int16 desc;\n\ - _viv_asm(COPY, desc, input, sizeof(desc));\n\ -\n\ - ulong address = as_ulong(desc.s05);\n\ - int stride_y = desc.s6;\n\ - int stride_z = desc.sa;\n\ + _viv_asm(GET_IMAGE_STRIDE, strides, input);\n\ #endif\n\ + _viv_asm(COPY, desc, input, 
sizeof(desc));\n\ + uint address = as_uint(desc.s0);\n\ \n\ Tensor t =\n\ {\n\ - .ptr = (uchar*)address,\n\ + .ptr = (uchar*)(uintptr_t)address,\n\ .stride_x = stride_x,\n\ - .stride_y = stride_y,\n\ - .stride_z = stride_z\n\ + .stride_y = strides.x,\n\ + .stride_z = strides.y\n\ };\n\ \n\ return t;\n\ @@ -50832,7 +50895,7 @@ __kernel void minimum_I32I32toI32_2D\n\ \n\ "; /* end of minimum_cl*/ -static const char moments_axis0_cl[] = "__kernel void moments_axis0_U8toF16(\n\ +static const char moments_axis0_cl[] = "__kernel void moments_axis0_U8toF32(\n\ __read_only image2d_array_t input,\n\ __write_only image2d_t output_mean,\n\ __write_only image2d_t output_vari,\n\ @@ -50863,8 +50926,8 @@ static const char moments_axis0_cl[] = "__kernel void moments_axis0_U8toF16(\n\ tmpSum += (data);\n\ tmpSqr += (data * data);\n\ }\n\ - sqr = convert_float(tmpSqr - 2 * input_zp * tmpSum + width * input_zp * input_zp) * e2InScale;\n\ - sum = convert_float(tmpSum - width * input_zp) * input_scale;\n\ + sqr = convert_float(as_int(tmpSqr - 2 * input_zp * tmpSum + width * input_zp * input_zp)) * e2InScale;\n\ + sum = convert_float(as_int(tmpSum - width * input_zp)) * input_scale;\n\ }\n\ float4 mean, vari;\n\ mean.x = sum * dimRatio;\n\ @@ -50909,7 +50972,6 @@ __kernel void moments_axis0_##src0_type_name##to##src0_type_name( \\\n\ write_imagef(output_mean, coord_out, mean); \\\n\ write_imagef(output_vari, coord_out, vari); \\\n\ }\n\ -MOMENTS_AXIS0_F(F16)\n\ MOMENTS_AXIS0_F(F32)\n\ \n\ __kernel void moments_axis0_I32toF32(\n\ @@ -50930,20 +50992,21 @@ __kernel void moments_axis0_I32toF32(\n\ int gidz = get_global_id(1);\n\ \n\ int4 coord0 = (int4)(0, gidy, gidz, 0);\n\ - int data;\n\ - int sum = 0, sqr = 0;\n\ + float data;\n\ + float sum = 0, sqr = 0;\n\ \n\ for(coord0.x = 0; coord0.x < width;)\n\ {\n\ - data = read_imagei(input, coord0).x;\n\ + data = convert_float(read_imagei(input, coord0).x);\n\ coord0.x++;\n\ - sum += (data);\n\ - sqr += (data * data);\n\ +\n\ + sum = sum + data;\n\ + sqr = sqr + data * data;\n\ }\n\ \n\ float4 mean, vari;\n\ - mean.x = sum * dimRatio;\n\ - vari.x = sqr * dimRatio;\n\ + mean.x = sum * dimRatio * input_scale;\n\ + vari.x = sqr * dimRatio * input_scale * input_scale;\n\ vari.x = vari.x - mean.x * mean.x;\n\ \n\ int2 coord_out = (int2)(gidy, gidz);\n\ @@ -50951,7 +51014,7 @@ __kernel void moments_axis0_I32toF32(\n\ write_imagef(output_vari, coord_out, vari);\n\ }"; /* end of moments_axis0_cl*/ -static const char moments_axis01_cl[] = "__kernel void moments_axis01_U8toF16(\n\ +static const char moments_axis01_cl[] = "__kernel void moments_axis01_U8toF32(\n\ image2d_array_t input, image2d_t output_mean, image2d_t output_vari,\n\ int axis, int axis_num, int input_zp, float input_scale,\n\ int width, int height, int chn, float dimRatio\n\ @@ -51065,7 +51128,6 @@ __kernel void moments_axis01_##src0_type_name##to##src0_type_name( \\\n\ write_imagef(output_vari, coord_out, vari); \\\n\ } \\\n\ }\n\ -MOMENTS_AXIS01_F(F16)\n\ MOMENTS_AXIS01_F(F32)\n\ \n\ __kernel void moments_axis01_I32toF32(\n\ @@ -51079,7 +51141,7 @@ __kernel void moments_axis01_I32toF32(\n\ int lidx = get_local_id(0);\n\ \n\ int4 coord = (int4)(gidx, 0, gidz, 0);\n\ - int4 data;\n\ + float4 data;\n\ float sum = 0, sqr = 0;\n\ float e2InScale = input_scale * input_scale;\n\ \n\ @@ -51088,13 +51150,14 @@ __kernel void moments_axis01_I32toF32(\n\ \n\ for(coord.x = gidx; coord.x < width; coord.x += 16)\n\ {\n\ - int tmpSum = 0, tmpSqr = 0;\n\ + float tmpSum = 0, tmpSqr = 0;\n\ for(coord.y = 0; coord.y < 
height;)\n\ {\n\ - data = read_imagei(input, coord);\n\ + data = convert_float4(read_imagei(input, coord));\n\ coord.y++;\n\ - tmpSum += data.x;\n\ - tmpSqr += data.x * data.x;\n\ +\n\ + tmpSum = tmpSum + data.x;\n\ + tmpSqr = tmpSqr + data.x * data.x;\n\ }\n\ sqr += (tmpSqr - 2 * input_zp * tmpSum + height * input_zp * input_zp) * e2InScale;\n\ sum += (tmpSum - height * input_zp) * input_scale;\n\ @@ -51127,7 +51190,7 @@ __kernel void moments_axis01_I32toF32(\n\ }\n\ "; /* end of moments_axis01_cl*/ -static const char moments_axis012_cl[] = "__kernel void moments_axis012_U8toF16(\n\ +static const char moments_axis012_cl[] = "__kernel void moments_axis012_U8toF32(\n\ image2d_array_t input, image2d_t output_mean, image2d_t output_vari,\n\ int axis, int axis_num, int input_zp, float input_scale,\n\ int width, int height, int chn, float dimRatio\n\ @@ -51245,7 +51308,6 @@ __kernel void moments_axis012_##src0_type_name##to##src0_type_name( \\\n\ write_imagef(output_vari, coord_out, vari); \\\n\ } \\\n\ }\n\ -MOMENTS_AXIS012_F(F16)\n\ MOMENTS_AXIS012_F(F32)\n\ \n\ __kernel void moments_axis012_I32toF32(\n\ @@ -51274,8 +51336,8 @@ __kernel void moments_axis012_I32toF32(\n\ {\n\ data = read_imagei(input, coord);\n\ coord.y++;\n\ - tmpSum += data.x;\n\ - tmpSqr += data.x * data.x;\n\ + tmpSum = tmpSum + data.x;\n\ + tmpSqr = tmpSqr + data.x * data.x;\n\ }\n\ sqr += (tmpSqr - 2 * input_zp * tmpSum + height * input_zp * input_zp) * e2InScale;\n\ sum += (tmpSum - height * input_zp) * input_scale;\n\ @@ -51309,7 +51371,7 @@ __kernel void moments_axis012_I32toF32(\n\ }\n\ "; /* end of moments_axis012_cl*/ -static const char moments_axis1_cl[] = "__kernel void moments_axis1_U8toF16(\n\ +static const char moments_axis1_cl[] = "__kernel void moments_axis1_U8toF32(\n\ __read_only image2d_array_t input,\n\ __write_only image2d_t output_mean,\n\ __write_only image2d_t output_vari,\n\ @@ -51334,8 +51396,8 @@ static const char moments_axis1_cl[] = "__kernel void moments_axis1_U8toF16(\n\ tmpSum += (data);\n\ tmpSqr += (data * data);\n\ }\n\ - sqr = convert_float(tmpSqr - 2 * input_zp * tmpSum + height * input_zp * input_zp) * e2InScale;\n\ - sum = convert_float(tmpSum - height * input_zp) * input_scale;\n\ + sqr = convert_float(as_int(tmpSqr - 2 * input_zp * tmpSum + height * input_zp * input_zp)) * e2InScale;\n\ + sum = convert_float(as_int(tmpSum - height * input_zp)) * input_scale;\n\ }\n\ \n\ float4 mean, vari;\n\ @@ -51381,7 +51443,6 @@ __kernel void moments_axis1_##src0_type_name##to##src0_type_name( \\\n\ write_imagef(output_mean, coord_out, mean); \\\n\ write_imagef(output_vari, coord_out, vari); \\\n\ }\n\ -MOMENTS_AXIS1_F(F16)\n\ MOMENTS_AXIS1_F(F32)\n\ \n\ __kernel void moments_axis1_I32toF32(\n\ @@ -51402,20 +51463,20 @@ __kernel void moments_axis1_I32toF32(\n\ int gidz = get_global_id(1);\n\ \n\ int4 coord0 = (int4)(gidx, 0, gidz, 0);\n\ - int data;\n\ - int sum = 0, sqr = 0;\n\ + float data;\n\ + float sum = 0, sqr = 0;\n\ \n\ for(coord0.y = 0; coord0.y < height;)\n\ {\n\ - data = read_imagei(input, coord0).x;\n\ + data = convert_float(read_imagei(input, coord0).x);\n\ coord0.y++;\n\ - sum += (data);\n\ - sqr += (data * data);\n\ + sum = sum + data;\n\ + sqr = sqr + data * data;\n\ }\n\ \n\ float4 mean, vari;\n\ - mean.x = sum * dimRatio;\n\ - vari.x = sqr * dimRatio;\n\ + mean.x = sum * dimRatio * input_scale;\n\ + vari.x = sqr * dimRatio * input_scale * input_scale;\n\ vari.x = vari.x - mean.x * mean.x;\n\ \n\ int2 coord_out = (int2)(gidx, gidz);\n\ @@ -51423,7 +51484,7 @@ __kernel void 
moments_axis1_I32toF32(\n\ write_imagef(output_vari, coord_out, vari);\n\ }"; /* end of moments_axis1_cl*/ -static const char moments_axis2_cl[] = "__kernel void moments_axis2_U8toF16(\n\ +static const char moments_axis2_cl[] = "__kernel void moments_axis2_U8toF32(\n\ __read_only image2d_array_t input,\n\ __write_only image2d_t output_mean,\n\ __write_only image2d_t output_vari,\n\ @@ -51454,12 +51515,12 @@ static const char moments_axis2_cl[] = "__kernel void moments_axis2_U8toF16(\n\ tmpSum += (data);\n\ tmpSqr += (data * data);\n\ }\n\ - sqr = (tmpSqr - 2 * input_zp * tmpSum + chn * input_zp * input_zp) * e2InScale;\n\ - sum = (tmpSum - chn * input_zp) * input_scale;\n\ + sqr = as_int(tmpSqr - 2 * input_zp * tmpSum + chn * input_zp * input_zp) * e2InScale;\n\ + sum = tmpSum * input_scale;\n\ }\n\ \n\ float4 mean, vari;\n\ - mean.x = sum * dimRatio;\n\ + mean.x = sum * dimRatio - input_zp * input_scale;\n\ vari.x = sqr * dimRatio;\n\ vari.x = vari.x - mean.x * mean.x;\n\ \n\ @@ -51507,7 +51568,6 @@ __kernel void moments_axis2_##src0_type_name##to##src0_type_name( \\\n\ write_imagef(output_mean, coord_out, mean); \\\n\ write_imagef(output_vari, coord_out, vari); \\\n\ }\n\ -MOMENTS_AXIS2_F(F16)\n\ MOMENTS_AXIS2_F(F32)\n\ \n\ __kernel void moments_axis2_I32toF32(\n\ @@ -51528,20 +51588,22 @@ __kernel void moments_axis2_I32toF32(\n\ int gidy = get_global_id(1);\n\ \n\ int4 coord0 = (int4)(gidx, gidy, 0, 0);\n\ - int data;\n\ - int sum = 0, sqr = 0;\n\ + float data;\n\ + float sum = 0, sqr = 0;\n\ \n\ for(coord0.z = 0; coord0.z < chn;)\n\ {\n\ - data = read_imagei(input, coord0).x;\n\ + data = convert_float(read_imagei(input, coord0).x);\n\ coord0.z++;\n\ - sum += (data);\n\ - sqr += (data * data);\n\ +\n\ +\n\ + sum = sum + data;\n\ + sqr = sqr + data * data;\n\ }\n\ \n\ float4 mean, vari;\n\ - mean.x = sum * dimRatio;\n\ - vari.x = sqr * dimRatio;\n\ + mean.x = sum * dimRatio * input_scale;\n\ + vari.x = sqr * dimRatio * input_scale * input_scale;\n\ vari.x = vari.x - mean.x * mean.x;\n\ \n\ int2 coord_out = (int2)(gidx, gidy);\n\ @@ -54857,8 +54919,8 @@ static const char select_cl[] = "__kernel void select_I8_U8_U8toU8(\n\ uint4 src0, src1, src, dst;\n\ float inputScale, inputTail;\n\ READ_IMAGEI_2DARRAY(value, condition, coord);\n\ - READ_IMAGEF_2DARRAY(src0, input0, coord);\n\ - READ_IMAGEF_2DARRAY(src1, input1, coord);\n\ + READ_IMAGEUI_2DARRAY(src0, input0, coord);\n\ + READ_IMAGEUI_2DARRAY(src1, input1, coord);\n\ src = (value != 0 ? src0 : src1);\n\ inputScale = (value.x != 0 ? input0Scale : input1Scale);\n\ inputTail = (value.x != 0 ? 
input0Tail : input1Tail);\n\ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_a_times_b_plus_c.c b/src/tim/vx/internal/src/ops/vsi_nn_op_a_times_b_plus_c.c index 96e084f..1f371d4 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_a_times_b_plus_c.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_a_times_b_plus_c.c @@ -73,7 +73,7 @@ static vsi_bool op_check vsi_bool ret = FALSE; memset(&attr, 0, sizeof(attr)); - memcpy(attr.size, outputs[0]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof( uint32_t )); + memcpy(attr.size, outputs[0]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); attr.dim_num = outputs[0]->attr.dim_num; attr.vtl = TRUE; attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_abs.c b/src/tim/vx/internal/src/ops/vsi_nn_op_abs.c index 220e778..d4cf2ae 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_abs.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_abs.c @@ -43,7 +43,7 @@ static vsi_status op_compute ) { vsi_status status; - int32_t input_size[VSI_NN_MAX_DIM_NUM] = {0}; + vsi_size_t input_size[VSI_NN_MAX_DIM_NUM] = {0}; uint32_t dims = 0; vx_tensor input = NULL, input0 = NULL; vx_tensor output = NULL, output0 = NULL; @@ -51,12 +51,17 @@ static vsi_status op_compute if (inputs[0]->attr.dim_num > 4) { - input_size[0] = vsi_nn_GetElementNum(inputs[0]) / + input_size[0] = (int32_t)vsi_nn_GetElementNum(inputs[0]) / inputs[0]->attr.size[inputs[0]->attr.dim_num - 1]; input_size[1] = inputs[0]->attr.size[inputs[0]->attr.dim_num - 1]; dims= 2; +#ifdef VSI_40BIT_VA_SUPPORT input = vxReshapeTensor(inputs[0]->t, input_size, dims); output = vxReshapeTensor(outputs[0]->t, input_size, dims); +#else + input = vxReshapeTensor(inputs[0]->t, (vx_int32*)input_size, (vx_uint32)dims); + output = vxReshapeTensor(outputs[0]->t, (vx_int32*)input_size, (vx_uint32)dims); +#endif input0 = input; output0 = output; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_argmaxmin.c b/src/tim/vx/internal/src/ops/vsi_nn_op_argmaxmin.c index c96a2c8..8a54e35 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_argmaxmin.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_argmaxmin.c @@ -47,7 +47,7 @@ static vsi_status _argmaxmin_op_compute { vsi_status status; vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; - int32_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 0 } }; + vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 0 } }; uint32_t rank_in = 0; uint32_t rank_out = 0; int32_t axis = 0; @@ -77,9 +77,9 @@ static vsi_status _argmaxmin_op_compute // TODO: This optimzie is a hack for gpu path, // it should be moved to gpu kernel setup. 
ret = vsi_nn_kernel_optimize_reduce_shape( - (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, + inputs[0]->attr.size, inputs[0]->attr.dim_num, &axis, 1, - (int32_t *)outputs[0]->attr.size, outputs[0]->attr.dim_num, + outputs[0]->attr.size, outputs[0]->attr.dim_num, shapes[0], &rank_in, shapes[1], &rank_out, &new_axis, &axis_size); @@ -89,9 +89,9 @@ static vsi_status _argmaxmin_op_compute if( ret ) { reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, - inputs[0], (uint32_t*)shapes[0], rank_in ); + inputs[0], shapes[0], rank_in ); reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, - outputs[0], (uint32_t*)shapes[1], rank_out ); + outputs[0], shapes[1], rank_out ); self->n = (vx_node)vsi_nn_kernel_selector( self->graph, kernel_name, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_batch2space.c b/src/tim/vx/internal/src/ops/vsi_nn_op_batch2space.c index 0865f49..24a3d14 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_batch2space.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_batch2space.c @@ -103,24 +103,7 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { - BEGIN_IO_TYPE_DECL(BATCH2SPACE, 1, 1) - IO_TYPE(D_F32, D_F32) - IO_TYPE(D_F32, D_BF16) - IO_TYPE(D_F16, D_F16) - IO_TYPE(D_BF16, D_BF16) - IO_TYPE(D_BF16, D_F32) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) - END_IO_TYPE_DECL(BATCH2SPACE) - if (!VALIDATE_OP_IO_TYPES(BATCH2SPACE, self, inputs, self->input.num, outputs, self->output.num)) - { - char* desc = generate_op_io_types_desc(inputs, - self->input.num, outputs, self->output.num); - VSILOGE("Inputs/Outputs data type not support: %s", desc); - destroy_op_io_types_desc(desc); - return FALSE; - } + vsi_bool ret = FALSE; if (inputs[0]->attr.dim_num != 4) { @@ -135,7 +118,9 @@ static vsi_bool op_check return FALSE; } - return TRUE; + ret = vsi_nn_OpCheck(VSI_NN_OP_STRIDED_SLICE, self, inputs, outputs); + + return ret; } /* op_check() */ static vsi_bool op_setup @@ -196,4 +181,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c index ed63df6..2e67b83 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_batch_norm.c @@ -177,10 +177,10 @@ static vsi_status _dynamic_batchnorm { vsi_status status = VSI_FAILURE; vsi_nn_kernel_param_t * param = NULL; - int32_t shapes[4][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; - int32_t* shapes_ptr[4] = {NULL}; - int32_t *shapes_in[3] = {NULL}; - size_t rank_in[3] = {0}; + vsi_size_t shapes[4][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; + vsi_size_t* shapes_ptr[4] = {NULL}; + vsi_size_t *shapes_in[3] = {NULL}; + vsi_size_t rank_in[3] = {0}; uint32_t new_rank = 0; vsi_nn_tensor_t* reshape_tensors[6] = { NULL }; vsi_bool ret = TRUE; @@ -189,37 +189,37 @@ static vsi_status _dynamic_batchnorm param = vsi_nn_kernel_param_create(); vsi_nn_kernel_param_add_float32( param, "eps", self->nn_param.batch_norm.eps ); - rank_in[0] = (size_t)inputs[0]->attr.dim_num; - rank_in[1] = (size_t)inputs[1]->attr.dim_num; - rank_in[2] = (size_t)inputs[3]->attr.dim_num; - shapes_in[0] = (int32_t *)inputs[0]->attr.size; - shapes_in[1] = (int32_t *)inputs[1]->attr.size; - shapes_in[2] = (int32_t *)inputs[3]->attr.size; + rank_in[0] = (vsi_size_t)inputs[0]->attr.dim_num; + rank_in[1] = (vsi_size_t)inputs[1]->attr.dim_num; + rank_in[2] = (vsi_size_t)inputs[3]->attr.dim_num; + shapes_in[0] = inputs[0]->attr.size; + shapes_in[1] = inputs[1]->attr.size; + 
shapes_in[2] = inputs[3]->attr.size; for (i = 0; i < 4; i++) { shapes_ptr[i] = shapes[i]; } ret = vsi_nn_kernel_optimize_broadcast_shape( - (const int32_t**)shapes_in, (const size_t*)rank_in, 3, - (int32_t *)outputs[0]->attr.size, outputs[0]->attr.dim_num, + (const vsi_size_t**)shapes_in, rank_in, 3, + outputs[0]->attr.size, outputs[0]->attr.dim_num, shapes_ptr, shapes[3], &new_rank); if( ret ) { reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, - inputs[0], (uint32_t*)shapes[0], new_rank ); + inputs[0], shapes[0], new_rank ); reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, - inputs[1], (uint32_t*)shapes[1], new_rank ); + inputs[1], shapes[1], new_rank ); reshape_tensors[2] = vsi_nn_reshape_tensor( self->graph, - inputs[2], (uint32_t*)shapes[1], new_rank ); + inputs[2], shapes[1], new_rank ); reshape_tensors[3] = vsi_nn_reshape_tensor( self->graph, - inputs[3], (uint32_t*)shapes[2], new_rank ); + inputs[3], shapes[2], new_rank ); reshape_tensors[4] = vsi_nn_reshape_tensor( self->graph, - inputs[4], (uint32_t*)shapes[2], new_rank ); + inputs[4], shapes[2], new_rank ); reshape_tensors[5] = vsi_nn_reshape_tensor( self->graph, - outputs[0], (uint32_t*)shapes[3], new_rank ); + outputs[0], shapes[3], new_rank ); } else { @@ -287,7 +287,7 @@ static vsi_status op_optimize { uint32_t dim = 0; vsi_nn_batcnnorm_lcl_data *local = NULL; - uint32_t shape[VSI_NN_MAX_DIM_NUM]; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM]; char tensor_name[128]; dim = inputs[0]->attr.dim_num; @@ -367,12 +367,12 @@ static vsi_bool _dynamic_check for(i = 0; i < rank; i++) { - vx_int32 shape0 = inputs[0]->attr.size[i]; + vx_int32 shape0 = (int32_t)(inputs[0]->attr.size[i]); for ( j = 1; j < self->input.num; j++) { uint32_t rank1 = inputs[j]->attr.dim_num; - vx_int32 shape1 = rank1 > i ? inputs[j]->attr.size[i] : 1; + vx_int32 shape1 = (int32_t)(rank1 > i ? 
inputs[j]->attr.size[i] : 1); if(shape0 != shape1 && shape1 != 1) { @@ -452,7 +452,7 @@ static vsi_bool op_setup { outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; memcpy( outputs[0]->attr.size, inputs[0]->attr.size, - VSI_NN_MAX_DIM_NUM * sizeof( uint32_t ) ); + VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t) ); } if(_is_3d_batchnorm(self, inputs)) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_batchnorm_single.c b/src/tim/vx/internal/src/ops/vsi_nn_op_batchnorm_single.c index a773a5b..d0ba47d 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_batchnorm_single.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_batchnorm_single.c @@ -53,10 +53,10 @@ static vsi_status op_compute { vsi_status status = VSI_FAILURE; vsi_nn_kernel_param_t * param = NULL; - int32_t shapes[4][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; - int32_t* shapes_ptr[4] = {NULL}; - int32_t *shapes_in[3] = {NULL}; - size_t rank_in[3] = {0}; + vsi_size_t shapes[4][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; + vsi_size_t* shapes_ptr[4] = {NULL}; + vsi_size_t *shapes_in[3] = {NULL}; + vsi_size_t rank_in[3] = {0}; uint32_t new_rank = 0; vsi_nn_tensor_t* reshape_tensors[6] = { NULL }; vsi_bool ret = TRUE; @@ -65,37 +65,37 @@ static vsi_status op_compute param = vsi_nn_kernel_param_create(); vsi_nn_kernel_param_add_float32( param, "eps", self->nn_param.batch_norm.eps ); - rank_in[0] = (size_t)inputs[0]->attr.dim_num; - rank_in[1] = (size_t)inputs[1]->attr.dim_num; - rank_in[2] = (size_t)inputs[3]->attr.dim_num; - shapes_in[0] = (int32_t *)inputs[0]->attr.size; - shapes_in[1] = (int32_t *)inputs[1]->attr.size; - shapes_in[2] = (int32_t *)inputs[3]->attr.size; + rank_in[0] = (vsi_size_t)inputs[0]->attr.dim_num; + rank_in[1] = (vsi_size_t)inputs[1]->attr.dim_num; + rank_in[2] = (vsi_size_t)inputs[3]->attr.dim_num; + shapes_in[0] = inputs[0]->attr.size; + shapes_in[1] = inputs[1]->attr.size; + shapes_in[2] = inputs[3]->attr.size; for (i = 0; i < 4; i++) { shapes_ptr[i] = shapes[i]; } ret = vsi_nn_kernel_optimize_broadcast_shape( - (const int32_t**)shapes_in, (const size_t*)rank_in, 3, - (int32_t *)outputs[0]->attr.size, outputs[0]->attr.dim_num, + (const vsi_size_t**)shapes_in, rank_in, 3, + outputs[0]->attr.size, outputs[0]->attr.dim_num, shapes_ptr, shapes[3], &new_rank); if( ret ) { reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, - inputs[0], (uint32_t*)shapes[0], new_rank ); + inputs[0], shapes[0], new_rank ); reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, - inputs[1], (uint32_t*)shapes[1], new_rank ); + inputs[1], shapes[1], new_rank ); reshape_tensors[2] = vsi_nn_reshape_tensor( self->graph, - inputs[2], (uint32_t*)shapes[1], new_rank ); + inputs[2], shapes[1], new_rank ); reshape_tensors[3] = vsi_nn_reshape_tensor( self->graph, - inputs[3], (uint32_t*)shapes[2], new_rank ); + inputs[3], shapes[2], new_rank ); reshape_tensors[4] = vsi_nn_reshape_tensor( self->graph, - inputs[4], (uint32_t*)shapes[2], new_rank ); + inputs[4], shapes[2], new_rank ); reshape_tensors[5] = vsi_nn_reshape_tensor( self->graph, - outputs[0], (uint32_t*)shapes[3], new_rank ); + outputs[0], shapes[3], new_rank ); } else { @@ -165,12 +165,20 @@ static vsi_bool op_check IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16, D_F32, D_F32, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16, D_F32, D_F32, D_F16) IO_TYPE(D_I8|Q_DFP, D_F16, D_F16, D_F32, D_F32, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F32, D_F32, D_F32, D_F32, D_I8|Q_DFP) IO_TYPE(D_I8|Q_DFP, D_F16, D_F16, D_F32, D_F32, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F32, D_F32, D_F32, D_F32, D_F16) IO_TYPE(D_I16|Q_DFP, D_F16, D_F16, D_F32, D_F32, 
D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_F16, D_F16, D_F32, D_F32, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_F32, D_F32, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_F32, D_F32, D_F16) IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32, D_F32) IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_F16) IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F32, D_F32, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F32, D_F32, D_F16) + IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_F16) + IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F32, D_I8|Q_DFP) END_IO_TYPE_DECL(BATCHNORM_SINGLE) if(!VALIDATE_OP_IO_TYPES(BATCHNORM_SINGLE, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, @@ -182,16 +190,16 @@ static vsi_bool op_check for(i = 0; i < rank; i++) { - vx_int32 shape0 = inputs[0]->attr.size[i]; + vsi_size_t shape0 = inputs[0]->attr.size[i]; for ( j = 1; j < self->input.num; j++) { uint32_t rank1 = inputs[j]->attr.dim_num; - vx_int32 shape1 = rank1 > i ? inputs[j]->attr.size[i] : 1; + vsi_size_t shape1 = rank1 > i ? inputs[j]->attr.size[i] : 1; if(shape0 != shape1 && shape1 != 1) { - VSILOGE("Invalid broadcast for inputs[%d] size[%u]", j, shape1); + VSILOGE("Invalid broadcast for inputs[%d] size[%"VSI_SIZE_T_SPECIFIER"]", j, shape1); return FALSE; } } @@ -211,7 +219,7 @@ static vsi_bool op_setup { outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; memcpy( outputs[0]->attr.size, inputs[0]->attr.size, - inputs[0]->attr.dim_num * sizeof( uint32_t ) ); + inputs[0]->attr.dim_num * sizeof(vsi_size_t) ); } return TRUE; @@ -234,4 +242,3 @@ DEF_OP_REG ); __END_DECLS - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_lstm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_lstm.c index fdc508b..abe9d4a 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_lstm.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_lstm.c @@ -50,9 +50,9 @@ static vsi_bool setup_op_shapes &self->nn_param.bidirectional_sequence_lstm; vsi_nn_tensor_attr_t attr; vsi_nn_internal_tensor_t* output_tensor = NULL; - uint32_t num_units = 0; - uint32_t output_size = 0; - uint32_t batch_size = 0; + vsi_size_t num_units = 0; + vsi_size_t output_size = 0; + vsi_size_t batch_size = 0; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); if( curr_param->time_major ) @@ -180,9 +180,9 @@ static vsi_bool op_setup vsi_nn_tensor_t** aux_reshape_output_tensors = NULL; vsi_bool has_aux_input = (inputs[BI_LSTM_AUX_INPUT] != NULL); vsi_bool use_virtual_tensor = TRUE; - uint32_t batch_size = 0; + vsi_size_t batch_size = 0; uint32_t time_step = 0; - uint32_t i = 0; + vsi_size_t i = 0; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_node_wksp( self ); @@ -190,12 +190,12 @@ static vsi_bool op_setup if( curr_param->time_major ) { batch_size = inputs[BI_LSTM_INPUT_INPUT]->attr.size[1]; - time_step = inputs[BI_LSTM_INPUT_INPUT]->attr.size[2]; + time_step = (uint32_t)(inputs[BI_LSTM_INPUT_INPUT]->attr.size[2]); } else { batch_size = inputs[BI_LSTM_INPUT_INPUT]->attr.size[2]; - time_step = inputs[BI_LSTM_INPUT_INPUT]->attr.size[1]; + time_step = (uint32_t)(inputs[BI_LSTM_INPUT_INPUT]->attr.size[1]); } setup_op_shapes( self, inputs, outputs); @@ -260,14 +260,14 @@ static vsi_bool op_setup { /* reshape for split output */ output_tensor = vsi_nn_rnn_reshape_split_output(self, - split_output_tensors[i], batch_size, 
use_virtual_tensor); + split_output_tensors[i], (uint32_t)batch_size, use_virtual_tensor); reshape_output_tensors[i] = output_tensor->t; if (has_aux_input) { /* reshape for aux split output */ output_tensor = vsi_nn_rnn_reshape_split_output(self, - aux_split_output_tensors[i], batch_size, use_virtual_tensor); + aux_split_output_tensors[i], (uint32_t)batch_size, use_virtual_tensor); aux_reshape_output_tensors[i] = output_tensor->t; } } @@ -366,7 +366,7 @@ static vsi_bool op_setup /* reshape output to 3-dims */ output_tensor = vsi_nn_rnn_reshape_cell_output(self, - lstmcell_out0, batch_size, use_virtual_tensor); + lstmcell_out0, (uint32_t)batch_size, use_virtual_tensor); lstmcell_reshape_output_tensors_fw[i] = output_tensor->t; } @@ -466,7 +466,7 @@ static vsi_bool op_setup /* reshape output to 3-dims */ output_tensor = vsi_nn_rnn_reshape_cell_output(self, - lstmcell_out0, batch_size, use_virtual_tensor); + lstmcell_out0, (uint32_t)batch_size, use_virtual_tensor); lstmcell_reshape_output_tensors_bw[i] = output_tensor->t; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_rnn.c b/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_rnn.c index cb85606..46eea58 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_rnn.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_bidirectional_sequence_rnn.c @@ -50,9 +50,9 @@ static vsi_bool setup_op_shapes &self->nn_param.bidirectional_sequence_rnn; vsi_nn_tensor_attr_t attr; vsi_nn_internal_tensor_t* output_tensor = NULL; - uint32_t num_units = 0; - uint32_t output_size = 0; - uint32_t batch_size = 0; + vsi_size_t num_units = 0; + vsi_size_t output_size = 0; + vsi_size_t batch_size = 0; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); if( curr_param->time_major ) @@ -179,9 +179,9 @@ static vsi_bool op_setup vsi_nn_tensor_t** aux_reshape_output_tensors = NULL; vsi_bool has_aux_input = (inputs[BI_RNN_AUX_INPUT] != NULL); vsi_bool use_virtual_tensor = TRUE; - uint32_t batch_size = 0; - uint32_t time_step = 0; - uint32_t i = 0; + vsi_size_t batch_size = 0; + vsi_size_t time_step = 0; + vsi_size_t i = 0; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_node_wksp( self ); @@ -229,9 +229,9 @@ static vsi_bool op_setup memset( reshape_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **)); vsi_nn_rnn_split_input_tensor(self, input_tensor, - split_output_tensors, time_step, use_virtual_tensor); + split_output_tensors, (uint32_t)time_step, use_virtual_tensor); - vsi_nn_rnn_data_check_aligned(self, split_output_tensors, time_step, use_virtual_tensor); + vsi_nn_rnn_data_check_aligned(self, split_output_tensors, (uint32_t)time_step, use_virtual_tensor); /* split aux input tensor */ if(has_aux_input) @@ -242,9 +242,9 @@ static vsi_bool op_setup memset( aux_reshape_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **)); vsi_nn_rnn_split_input_tensor(self, aux_input_tensor, - aux_split_output_tensors, time_step, use_virtual_tensor); + aux_split_output_tensors, (uint32_t)time_step, use_virtual_tensor); - vsi_nn_rnn_data_check_aligned(self, aux_split_output_tensors, time_step, use_virtual_tensor); + vsi_nn_rnn_data_check_aligned(self, aux_split_output_tensors, (uint32_t)time_step, use_virtual_tensor); } /* prepare output tensor */ @@ -259,14 +259,14 @@ static vsi_bool op_setup { /* reshape for split output */ output_tensor = vsi_nn_rnn_reshape_split_output(self, - split_output_tensors[i], batch_size, use_virtual_tensor); + split_output_tensors[i], (uint32_t)batch_size, use_virtual_tensor); 
reshape_output_tensors[i] = output_tensor->t; if (has_aux_input) { /* reshape for aux split output */ output_tensor = vsi_nn_rnn_reshape_split_output(self, - aux_split_output_tensors[i], batch_size, use_virtual_tensor); + aux_split_output_tensors[i], (uint32_t)batch_size, use_virtual_tensor); aux_reshape_output_tensors[i] = output_tensor->t; } } @@ -323,7 +323,7 @@ static vsi_bool op_setup /* reshape output to 3-dims */ output_tensor = vsi_nn_rnn_reshape_cell_output(self, - rnncell_out0, batch_size, use_virtual_tensor); + rnncell_out0, (uint32_t)batch_size, use_virtual_tensor); rnncell_reshape_output_tensors_fw[i] = output_tensor->t; } @@ -379,7 +379,7 @@ static vsi_bool op_setup /* reshape output to 3-dims */ output_tensor = vsi_nn_rnn_reshape_cell_output(self, - rnncell_out0, batch_size, use_virtual_tensor); + rnncell_out0, (uint32_t)batch_size, use_virtual_tensor); rnncell_reshape_output_tensors_bw[i] = output_tensor->t; } @@ -417,7 +417,7 @@ static vsi_bool op_setup /* concat rnncell output, the rnn's output is 3-dims */ - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, time_step, 1 ); + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, (uint32_t)time_step, 1 ); curr->node->nn_param.concat.axis = 2; for( i = 0; i < time_step; i++ ) { @@ -448,7 +448,7 @@ static vsi_bool op_setup } /* concat rnncell output, the rnn's output is 3-dims */ - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, time_step, 1 ); + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, (uint32_t)time_step, 1 ); curr->node->nn_param.concat.axis = 2; for( i = 0; i < time_step; i++ ) { @@ -476,7 +476,7 @@ static vsi_bool op_setup } /* concat rnncell output, the rnn's output is 3-dims */ - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, time_step, 1 ); + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, (uint32_t)time_step, 1 ); curr->node->nn_param.concat.axis = 2; for( i = 0; i < time_step; i++ ) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_cast.c b/src/tim/vx/internal/src/ops/vsi_nn_op_cast.c index 5189994..4ffe7ed 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_cast.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_cast.c @@ -91,8 +91,8 @@ static vsi_status op_compute else { vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; - int32_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; - int32_t new_rank = 0; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t new_rank = 0; vsi_bool ret; if ( NULL == self ) @@ -101,14 +101,14 @@ static vsi_status op_compute } ret = vsi_nn_kernel_optimize_element_shape( - (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, + inputs[0]->attr.size, inputs[0]->attr.dim_num, shape, &new_rank ); if ( ret ) { reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, - inputs[0], (uint32_t*)shape, new_rank ); + inputs[0], shape, new_rank ); reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, - outputs[0], (uint32_t*)shape, new_rank ); + outputs[0], shape, new_rank ); self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "cast", diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c b/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c index 9c151f6..6a43126 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_clip.c @@ -64,15 +64,15 @@ static vsi_status op_compute else { vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; - int32_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; - int32_t new_rank = 0; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t new_rank = 0; vsi_bool ret; vsi_nn_kernel_param_t * 
param = NULL; param =vsi_nn_kernel_param_create(); ret = vsi_nn_kernel_optimize_element_shape( - (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, + inputs[0]->attr.size, inputs[0]->attr.dim_num, shape, &new_rank ); vsi_nn_kernel_param_add_float32( param, "min_value", min_value ); @@ -81,9 +81,9 @@ static vsi_status op_compute if( ret ) { reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, - inputs[0], (uint32_t*)shape, new_rank ); + inputs[0], shape, new_rank ); reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, - outputs[0], (uint32_t*)shape, new_rank ); + outputs[0], shape, new_rank ); self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "clip", diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_common.c b/src/tim/vx/internal/src/ops/vsi_nn_op_common.c index 648b2e1..1315bd7 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_common.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_common.c @@ -67,7 +67,7 @@ vsi_bool vsi_nn_op_common_setup { outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; memcpy( outputs[0]->attr.size, inputs[0]->attr.size, - VSI_NN_MAX_DIM_NUM * sizeof( uint32_t ) ); + VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t) ); } return TRUE; } /* op_common_setup() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c b/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c index 0f7abb0..8b8f058 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c @@ -372,7 +372,7 @@ static vsi_bool op_setup } axis = self->nn_param.concat.axis; memcpy( outputs[0]->attr.size, inputs[0]->attr.size, - sizeof( uint32_t ) * VSI_NN_MAX_DIM_NUM ); + sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM ); outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; for( i = 1; i < num; i ++ ) { @@ -393,8 +393,8 @@ static vsi_status op_optimize int32_t num,i; uint32_t axis; vx_tensor in_view_tensor; - uint32_t start[VSI_NN_MAX_DIM_NUM] = { 0 }; - uint32_t end[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t start[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t end[VSI_NN_MAX_DIM_NUM] = { 0 }; status = VSI_SUCCESS; /* we don't create tensor view if the axis is not the highest dimension */ @@ -424,8 +424,8 @@ static vsi_status op_optimize } /* Create tensor from view */ - memset( start, 0, sizeof( uint32_t ) * VSI_NN_MAX_DIM_NUM ); - memset( end, 0, sizeof( uint32_t ) * VSI_NN_MAX_DIM_NUM ); + memset( start, 0, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM ); + memset( end, 0, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM ); end[0] = inputs[0]->attr.size[0]; end[1] = inputs[0]->attr.size[1]; end[2] = inputs[0]->attr.size[2]; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_concatshift.c b/src/tim/vx/internal/src/ops/vsi_nn_op_concatshift.c index 880d2d2..c32d5da 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_concatshift.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_concatshift.c @@ -151,8 +151,8 @@ static vsi_status op_optimize vsi_status status = VSI_SUCCESS; uint32_t axis; vx_tensor out_view_tensor; - uint32_t start[VSI_NN_MAX_DIM_NUM] = { 0 }; - uint32_t end[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t start[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t end[VSI_NN_MAX_DIM_NUM] = { 0 }; uint32_t i = 0; uint32_t keep_size = 0; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv1d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv1d.c index f902832..01721e6 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv1d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv1d.c @@ -55,8 +55,8 @@ static vsi_status op_compute { vsi_nn_tensor_t* new_inputs[3] = { NULL }; vsi_nn_tensor_t* 
reshape_tensors[3] = { NULL }; - uint32_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; - int32_t new_rank = 0; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t new_rank = 0; int32_t pad_front = self->nn_param.conv1d.pad[0]; int32_t pad_end = self->nn_param.conv1d.pad[1]; @@ -66,7 +66,7 @@ static vsi_status op_compute shape[1] = 1; new_rank = 2; reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, - inputs[0], (uint32_t*)shape, new_rank ); + inputs[0], shape, new_rank ); new_inputs[0] = reshape_tensors[0]; } else @@ -80,7 +80,7 @@ static vsi_status op_compute shape[1] = 1; new_rank = 2; reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, - inputs[1], (uint32_t*)shape, new_rank ); + inputs[1], shape, new_rank ); new_inputs[1] = reshape_tensors[1]; } else @@ -94,7 +94,7 @@ static vsi_status op_compute shape[1] = 1; new_rank = 2; reshape_tensors[2] = vsi_nn_reshape_tensor( self->graph, - inputs[2], (uint32_t*)shape, new_rank ); + inputs[2], shape, new_rank ); new_inputs[2] = reshape_tensors[2]; } else @@ -198,6 +198,142 @@ static vsi_bool op_check IO_TYPE(D_BF16, D_BF16, D_F32, D_BF16) IO_TYPE(D_BF16, D_BF16, D_F32, D_F32) IO_TYPE(D_BF16, D_BF16, D_NONE, D_BF16) + + /* HW 9.0.1 */ + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) + + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_F32) + + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, 
D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) + + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_F16) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_F32) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_F32) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F32) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_F32) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_F32) + + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_F16) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_F32) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_NONE, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_NONE, D_F32) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F16) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) + + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_BF16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_F32) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_BF16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_F32) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_I8|Q_DFP) + 
IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_BF16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_F32) + + IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_NONE, D_BF16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_NONE, D_F32) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) + + IO_TYPE(D_F16, D_F16, D_NONE, D_BF16) + IO_TYPE(D_F16, D_F16, D_NONE, D_F32) + IO_TYPE(D_F16, D_F16, D_F32, D_BF16) + IO_TYPE(D_F16, D_F16, D_F32, D_F32) + + IO_TYPE(D_BF16, D_BF16, D_NONE, D_F16) + IO_TYPE(D_BF16, D_BF16, D_F32, D_F16) + + IO_TYPE(D_F32, D_BF16, D_NONE, D_F16) + IO_TYPE(D_F32, D_BF16, D_NONE, D_BF16) + IO_TYPE(D_F32, D_BF16, D_NONE, D_F32) + IO_TYPE(D_F32, D_BF16, D_F32, D_F16) + IO_TYPE(D_F32, D_BF16, D_F32, D_BF16) + IO_TYPE(D_F32, D_BF16, D_F32, D_F32) + END_IO_TYPE_DECL(CONV1D) if (!VALIDATE_OP_IO_TYPES(CONV1D, self, inputs, self->input.num, outputs, self->output.num)) { @@ -342,4 +478,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d.c index bc8540d..5c0b7ad 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d.c @@ -147,7 +147,6 @@ static vsi_bool op_check /* IO_TYPE(INPUT, WEIGHT, NULL, OUTPUT) */ IO_TYPE(D_F32, D_F32, D_NONE, D_F32) - IO_TYPE(D_F16, D_F16, D_NONE, D_F16) IO_TYPE(D_F16, D_F16, D_NONE, D_F16) IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_I8|Q_DFP) @@ -172,6 +171,141 @@ static vsi_bool op_check /* HW 9.0 */ IO_TYPE(D_F32, D_BF16, D_F32, D_BF16) IO_TYPE(D_F32, D_BF16, D_NONE, D_BF16) + /* HW 9.0.1 */ + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) + + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_F16) + IO_TYPE(D_U8|Q_ASYM, 
D_I8|Q_DFP, D_I32|Q_DFP, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_F32) + + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) + + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_F16) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_F32) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_F32) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F32) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_F32) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_F32) + + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_F16) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_F32) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_NONE, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_NONE, D_F32) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F16) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, 
D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) + + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_BF16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_F32) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_BF16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_F32) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_BF16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_F32) + + IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_NONE, D_BF16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_NONE, D_F32) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) + + IO_TYPE(D_F16, D_F16, D_NONE, D_BF16) + IO_TYPE(D_F16, D_F16, D_NONE, D_F32) + IO_TYPE(D_F16, D_F16, D_F32, D_BF16) + IO_TYPE(D_F16, D_F16, D_F32, D_F32) + + IO_TYPE(D_BF16, D_BF16, D_NONE, D_F16) + IO_TYPE(D_BF16, D_BF16, D_F32, D_F16) + + IO_TYPE(D_F32, D_BF16, D_NONE, D_F16) + IO_TYPE(D_F32, D_BF16, D_NONE, D_BF16) + IO_TYPE(D_F32, D_BF16, D_NONE, D_F32) + IO_TYPE(D_F32, D_BF16, D_F32, D_F16) + IO_TYPE(D_F32, D_BF16, D_F32, D_BF16) + IO_TYPE(D_F32, D_BF16, D_F32, D_F32) + END_IO_TYPE_DECL(CONV2D) ret = VALIDATE_OP_IO_TYPES(CONV2D, self, inputs, self->input.num, outputs, self->output.num); if(!ret) { @@ -200,8 +334,13 @@ static vsi_bool op_setup ) { vsi_nn_conv2d_param *nn_param; - uint32_t perm[] = { 3, 2, 0, 1 }; + vsi_size_t perm[] = { 3, 2, 0, 1 }; + vsi_size_t i, pad[_cnt_of_array(self->nn_param.conv2d.pad)] = {0}; + for(i = 0; i < _cnt_of_array(self->nn_param.conv2d.pad); i++) + { + pad[i] = self->nn_param.conv2d.pad[i]; + } /* TODO: Driver should handle this, * Check transpose * */ @@ -220,14 +359,19 @@ static vsi_bool op_setup #endif nn_param = &self->nn_param.conv2d; + vsi_nn_compute_padding( inputs[0]->attr.size, inputs[1]->attr.size, self->nn_param.conv2d.stride, self->nn_param.conv2d.dilation, self->nn_param.conv2d.pad_type, - self->nn_param.conv2d.pad + pad ); + for(i = 0; i < _cnt_of_array(self->nn_param.conv2d.pad); i++) + { + self->nn_param.conv2d.pad[i] = (uint32_t)pad[i]; + } if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { @@ -286,4 +430,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm.c new file mode 100644 index 0000000..af34411 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm.c @@ -0,0 +1,681 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell 
copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include <string.h> +#include <stdlib.h> + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "utils/vsi_nn_math.h" +#include "utils/vsi_nn_util.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "libnnext/vsi_nn_vxkernel.h" +#include "vsi_nn_internal_node.h" +#include "vsi_nn_rnn_helper.h" + +static vsi_nn_internal_tensor_t * reshape_cell_out + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * cell_out + ) +{ + vsi_nn_internal_node_t* curr = NULL; + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_tensor_t* output_tensor = NULL; + vsi_size_t* reshape_cell_size = NULL; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + + vsi_nn_internal_init_tensor_attr(&attr, &cell_out->attr.dtype, TRUE); + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + + /* reshape cell_out [w,h,c,n] to [w,h,c,1,n] */ + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE, 0, 0 ); + reshape_cell_size = vsi_nn_internal_new_node_param(curr, + VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); + reshape_cell_size[0] = cell_out->attr.size[0]; + reshape_cell_size[1] = cell_out->attr.size[1]; + reshape_cell_size[2] = cell_out->attr.size[2]; + reshape_cell_size[3] = 1; + reshape_cell_size[4] = cell_out->attr.size[3]; + curr->node->nn_param.reshape.size = reshape_cell_size; + curr->node->nn_param.reshape.dim_num = 5; + + curr->inputs[0] = cell_out; + curr->outputs[0] = output_tensor->t; + + vsi_nn_internal_setup_node( self, curr ); + return output_tensor; +} /* reshape_cell_out() */ + +static vsi_nn_internal_tensor_t * reshape_split_out + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * split_out + ) +{ + vsi_nn_internal_node_t* curr = NULL; + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_tensor_t* output_tensor = NULL; + vsi_size_t *reshape_split_size = NULL; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + vsi_nn_internal_init_tensor_attr(&attr, &split_out->attr.dtype, TRUE); + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + + /* reshape [w,h,c,t,n] to [w,h,c,n] */ + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE, 0, 0 ); + reshape_split_size = vsi_nn_internal_new_node_param(curr, + VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); + reshape_split_size[0] = split_out->attr.size[0]; + reshape_split_size[1] = split_out->attr.size[1]; + reshape_split_size[2] = split_out->attr.size[2]; + reshape_split_size[3] = split_out->attr.size[4]; + curr->node->nn_param.reshape.size = reshape_split_size; + curr->node->nn_param.reshape.dim_num = 4; + + curr->inputs[0] = split_out; + curr->outputs[0] = output_tensor->t; + vsi_nn_internal_setup_node(
self, curr ); + + return output_tensor; +} /* reshape_split_out() */ + +static void split_input_tensor + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input, + vsi_nn_tensor_t ** output, + uint32_t time_step + ) +{ + uint32_t i; + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_node_t* curr = NULL; + uint32_t * slices = NULL; + vsi_nn_internal_tensor_t* output_tensor = NULL; + + i = 0; + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_SPLIT, 1, time_step ); + slices = (uint32_t *)vsi_nn_internal_new_node_param(curr, time_step * sizeof(uint32_t)); + curr->node->nn_param.split.axis = 3; /* input_shape [w,h,c,t,n] */ + curr->node->nn_param.split.slices_num = time_step; + curr->inputs[0] = input; + curr->node->nn_param.split.slices = slices; + + for( i = 0; i < time_step; i++ ) + { + slices[i] = 1; + vsi_nn_internal_init_tensor_attr(&attr, &input->attr.dtype, TRUE); + output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + curr->outputs[i] = output_tensor->t; + output[i] = output_tensor->t; + } + vsi_nn_internal_setup_node( self, curr ); +} /* split_input_tensor() */ + +static void trans_output_tensor + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_size_t perm[VSI_NN_MAX_DIM_NUM], ID; + vsi_nn_conv2d_lstm_param * p = &self->nn_param.conv2d_lstm; + + ID = 0; + memset(perm, 0, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM); + + // out1,out2 [w,h,c,n] --> [c,w,h,n] + perm[0] = 2; + perm[1] = 0; + perm[2] = 1; + perm[3] = 3; + ID = CONV2D_LSTM_OUT_H_STATE; + vsi_nn_rnn_create_permute(self, inputs[ID], outputs[ID], perm, 4, TRUE); + + ID = CONV2D_LSTM_OUT_C_STATE; + vsi_nn_rnn_create_permute(self, inputs[ID], outputs[ID], perm, 4, TRUE); + + ID = CONV2D_LSTM_OUT_OUTPUT; + if(p->return_sequences == TRUE) + { + // out0 [w,h,c,t,n] --> [c,w,h,t,n] + perm[0] = 2; + perm[1] = 0; + perm[2] = 1; + perm[3] = 3; + perm[4] = 4; + vsi_nn_rnn_create_permute(self, inputs[ID], outputs[ID], perm, 5, TRUE); + } + else + { + vsi_nn_rnn_create_permute(self, inputs[ID], outputs[ID], perm, 4, TRUE); + } +} /* trans_output_tensor() */ + +static void trans_input_tensor + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** trans_inputs + ) +{ + vsi_size_t perm[VSI_NN_MAX_DIM_NUM]; + vsi_nn_internal_tensor_t * tmp_tensor = NULL; + vsi_nn_conv2d_lstm_param * p = &self->nn_param.conv2d_lstm; + + memset(perm, 0, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM); + if(p->data_format == CONV2D_LSTM_CHANNELS_LAST) + { + // [c,w,h,t,n] --> [w,h,c,t,n] + perm[0] = 1; + perm[1] = 2; + perm[2] = 0; + perm[3] = 3; + perm[4] = 4; + tmp_tensor = vsi_nn_rnn_create_permute(self, inputs[CONV2D_LSTM_IN_INPUT], NULL, perm, 5, TRUE); + trans_inputs[CONV2D_LSTM_IN_INPUT] = tmp_tensor->t; + + // [c,w,h,n] --> [w,h,c,n] + perm[0] = 1; + perm[1] = 2; + perm[2] = 0; + perm[3] = 3; + tmp_tensor = vsi_nn_rnn_create_permute(self, inputs[CONV2D_LSTM_IN_H_STATE], NULL, perm, 4, TRUE); + trans_inputs[CONV2D_LSTM_IN_H_STATE] = tmp_tensor->t; + + tmp_tensor = vsi_nn_rnn_create_permute(self, inputs[CONV2D_LSTM_IN_C_STATE], NULL, perm, 4, TRUE); + trans_inputs[CONV2D_LSTM_IN_C_STATE] = tmp_tensor->t; + } + else + { + trans_inputs[CONV2D_LSTM_IN_INPUT] = inputs[CONV2D_LSTM_IN_INPUT]; + trans_inputs[CONV2D_LSTM_IN_H_STATE] = inputs[CONV2D_LSTM_IN_H_STATE]; + trans_inputs[CONV2D_LSTM_IN_C_STATE] = inputs[CONV2D_LSTM_IN_C_STATE]; + } +} /* trans_input_tensor() */ + +static void create_state_tensor + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t 
** inputs, + vsi_nn_tensor_t ** outputs, + vsi_size_t w_out, + vsi_size_t h_out, + vsi_size_t out_channel + ) +{ + vsi_size_t samples, state_shape[4]; + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_tensor_t * tensor = NULL; + vsi_nn_conv2d_lstm_param * p = &self->nn_param.conv2d_lstm; + + samples = inputs[CONV2D_LSTM_IN_INPUT]->attr.size[4]; + memset(state_shape, 0, sizeof(vsi_size_t) * 4); + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + + if(p->data_format == CONV2D_LSTM_CHANNELS_LAST) + { + state_shape[0] = out_channel; + state_shape[1] = w_out; + state_shape[2] = h_out; + state_shape[3] = samples; + } + else + { + state_shape[0] = w_out; + state_shape[1] = h_out; + state_shape[2] = out_channel; + state_shape[3] = samples; + } + + if(NULL == inputs[CONV2D_LSTM_IN_H_STATE]) + { + attr.dim_num = 4; + memcpy(attr.size, state_shape, sizeof(vsi_size_t) * attr.dim_num); + memcpy(&attr.dtype, &outputs[CONV2D_LSTM_OUT_OUTPUT]->attr.dtype, sizeof( attr.dtype )); + attr.vtl = FALSE; + attr.is_const = TRUE; + + tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + inputs[CONV2D_LSTM_IN_H_STATE] = tensor->t; + } + + if(NULL == inputs[CONV2D_LSTM_IN_C_STATE]) + { + attr.dim_num = 4; + memcpy(attr.size, state_shape, sizeof(vsi_size_t) * attr.dim_num); + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + attr.vtl = FALSE; + attr.is_const = TRUE; + + tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + inputs[CONV2D_LSTM_IN_C_STATE] = tensor->t; + } + + if(NULL == outputs[CONV2D_LSTM_OUT_H_STATE]) + { + memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); + attr.dim_num = VSI_NN_DIM_AUTO; + memcpy( &attr.dtype, &outputs[CONV2D_LSTM_OUT_OUTPUT]->attr.dtype, sizeof( attr.dtype ) ); + attr.vtl = TRUE; + attr.is_const = FALSE; + tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + outputs[CONV2D_LSTM_OUT_H_STATE] = tensor->t; + } + + if(NULL == outputs[CONV2D_LSTM_OUT_C_STATE]) + { + memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); + attr.dim_num = VSI_NN_DIM_AUTO; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + attr.vtl = TRUE; + attr.is_const = FALSE; + tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + outputs[CONV2D_LSTM_OUT_C_STATE] = tensor->t; + } +} /* create_state_tensor() */ + +static vsi_bool setup_op_shapes + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_tensor_attr_t attr; + vsi_size_t w_out, h_out, samples, timestep, out_channel; + vsi_size_t conv_in_shape[4]; + vsi_nn_conv2d_lstm_param * p = &self->nn_param.conv2d_lstm; + vsi_size_t ksize[_cnt_of_array(p->conv2d.ksize)]; + vsi_size_t i, pad[_cnt_of_array(p->conv2d.pad)] = {0}; + + memset(&attr, 0, sizeof(attr)); + memset(conv_in_shape, 0, sizeof(vsi_size_t) * 4); + w_out = 0; + h_out = 0; + timestep = inputs[CONV2D_LSTM_IN_INPUT]->attr.size[3]; + samples = inputs[CONV2D_LSTM_IN_INPUT]->attr.size[4]; + out_channel = p->filters; + + // conv_in_shape is always whcn + if(p->data_format == CONV2D_LSTM_CHANNELS_LAST) + { + /* input: [in_channel, w, h, time_step, batch] */ + conv_in_shape[0] = inputs[CONV2D_LSTM_IN_INPUT]->attr.size[1]; + conv_in_shape[1] = inputs[CONV2D_LSTM_IN_INPUT]->attr.size[2]; + conv_in_shape[2] = inputs[CONV2D_LSTM_IN_INPUT]->attr.size[0]; + conv_in_shape[3] = inputs[CONV2D_LSTM_IN_INPUT]->attr.size[4]; + } + else + { + /* input: [w, h, in_channel, time_step, batch] */ + conv_in_shape[0] = inputs[CONV2D_LSTM_IN_INPUT]->attr.size[0]; 
+ conv_in_shape[1] = inputs[CONV2D_LSTM_IN_INPUT]->attr.size[1]; + conv_in_shape[2] = inputs[CONV2D_LSTM_IN_INPUT]->attr.size[2]; + conv_in_shape[3] = inputs[CONV2D_LSTM_IN_INPUT]->attr.size[4]; + } + + for(i = 0; i < _cnt_of_array(p->conv2d.ksize); i++) + { + ksize[i] = self->nn_param.conv2d.ksize[i]; + } + for(i = 0; i < _cnt_of_array(p->conv2d.pad); i++) + { + pad[i] = self->nn_param.conv2d.pad[i]; + } + + vsi_nn_compute_padding( + conv_in_shape, + ksize, + p->conv2d.stride, + p->conv2d.dilation, + p->conv2d.pad_type, + pad + ); + for(i = 0; i < _cnt_of_array(p->conv2d.ksize); i++) + { + self->nn_param.conv2d.ksize[i] = (uint32_t)ksize[i]; + } + for(i = 0; i < _cnt_of_array(p->conv2d.pad); i++) + { + self->nn_param.conv2d.pad[i] = (uint32_t)pad[i]; + } + w_out = vsi_nn_ComputeFilterSize( + conv_in_shape[0], + p->conv2d.ksize[0], + &p->conv2d.pad[0], + p->conv2d.stride[0], + p->conv2d.dilation[0], + VSI_NN_ROUND_FLOOR + ); + h_out = vsi_nn_ComputeFilterSize( + conv_in_shape[1], + p->conv2d.ksize[1], + &p->conv2d.pad[2], + p->conv2d.stride[1], + p->conv2d.dilation[1], + VSI_NN_ROUND_FLOOR + ); + + /* setup conv2d lstm output tensors' shape */ + /* output */ + if(VSI_NN_DIM_AUTO == outputs[CONV2D_LSTM_OUT_OUTPUT]->attr.dim_num) + { + if(p->data_format == CONV2D_LSTM_CHANNELS_LAST) + { + outputs[CONV2D_LSTM_OUT_OUTPUT]->attr.size[0] = out_channel; + outputs[CONV2D_LSTM_OUT_OUTPUT]->attr.size[1] = w_out; + outputs[CONV2D_LSTM_OUT_OUTPUT]->attr.size[2] = h_out; + } + else + { + outputs[CONV2D_LSTM_OUT_OUTPUT]->attr.size[0] = w_out; + outputs[CONV2D_LSTM_OUT_OUTPUT]->attr.size[1] = h_out; + outputs[CONV2D_LSTM_OUT_OUTPUT]->attr.size[2] = out_channel; + } + outputs[CONV2D_LSTM_OUT_OUTPUT]->attr.size[3] = timestep; + outputs[CONV2D_LSTM_OUT_OUTPUT]->attr.size[4] = samples; + outputs[CONV2D_LSTM_OUT_OUTPUT]->attr.dim_num = 5; + } + + /* create hstate and cstate input/output if app doesn't provide them */ + create_state_tensor(self, inputs, outputs, w_out, h_out, out_channel); + + /* hidden state output */ + if(VSI_NN_DIM_AUTO == outputs[CONV2D_LSTM_OUT_H_STATE]->attr.dim_num) + { + if(p->data_format == CONV2D_LSTM_CHANNELS_LAST) + { + outputs[CONV2D_LSTM_OUT_H_STATE]->attr.size[0] = out_channel; + outputs[CONV2D_LSTM_OUT_H_STATE]->attr.size[1] = w_out; + outputs[CONV2D_LSTM_OUT_H_STATE]->attr.size[2] = h_out; + } + else + { + outputs[CONV2D_LSTM_OUT_H_STATE]->attr.size[0] = w_out; + outputs[CONV2D_LSTM_OUT_H_STATE]->attr.size[1] = h_out; + outputs[CONV2D_LSTM_OUT_H_STATE]->attr.size[2] = out_channel; + } + outputs[CONV2D_LSTM_OUT_H_STATE]->attr.size[3] = samples; + outputs[CONV2D_LSTM_OUT_H_STATE]->attr.dim_num = 4; + } + + /* cell state output */ + if(VSI_NN_DIM_AUTO == outputs[CONV2D_LSTM_OUT_C_STATE]->attr.dim_num) + { + if(p->data_format == CONV2D_LSTM_CHANNELS_LAST) + { + outputs[CONV2D_LSTM_OUT_C_STATE]->attr.size[0] = out_channel; + outputs[CONV2D_LSTM_OUT_C_STATE]->attr.size[1] = w_out; + outputs[CONV2D_LSTM_OUT_C_STATE]->attr.size[2] = h_out; + } + else + { + outputs[CONV2D_LSTM_OUT_C_STATE]->attr.size[0] = w_out; + outputs[CONV2D_LSTM_OUT_C_STATE]->attr.size[1] = h_out; + outputs[CONV2D_LSTM_OUT_C_STATE]->attr.size[2] = out_channel; + } + outputs[CONV2D_LSTM_OUT_C_STATE]->attr.size[3] = samples; + outputs[CONV2D_LSTM_OUT_C_STATE]->attr.dim_num = 4; + } + + return TRUE; +} /* setup_op_shapes() */ + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + return vsi_nn_internal_compute_node( self ); +} /* op_compute() */ 
+ +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check tensor shapes. */ + return TRUE; +} /* op_check() */ + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + return vsi_nn_internal_optimize_node( self, direction ); +} /* op_optimize() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_size_t i, timestep, perm[VSI_NN_MAX_DIM_NUM]; + vsi_nn_tensor_t * trans_inputs[3] = { NULL }; + vsi_nn_tensor_t * conv2dlstm_outputs[3] = { NULL }; + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_tensor_t * tmp_tensor = NULL; + vsi_nn_tensor_t ** split_outputs = NULL, ** conv2dlstm_step_outputs = NULL; + vsi_nn_tensor_t * step_h_state = NULL, * step_c_state = NULL; + vsi_nn_tensor_t * cell_out0 = NULL, * cell_out1 = NULL, * cell_out2 = NULL; + vsi_nn_conv2d_lstm_param * p = &self->nn_param.conv2d_lstm; + vsi_nn_internal_node_t* curr = NULL; + + memset(&attr, 0, sizeof(attr)); + memset(perm, 0, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM); + timestep = inputs[CONV2D_LSTM_IN_INPUT]->attr.size[3]; + + vsi_nn_internal_init_node_wksp( self ); + + setup_op_shapes(self, inputs, outputs); + + trans_input_tensor(self, inputs, trans_inputs); + + split_outputs = (vsi_nn_tensor_t **)malloc(sizeof(vsi_nn_tensor_t *) * timestep); + conv2dlstm_step_outputs = (vsi_nn_tensor_t **)malloc(sizeof(vsi_nn_tensor_t *) * timestep); + memset(split_outputs, 0, sizeof(vsi_nn_tensor_t *) * timestep); + memset(conv2dlstm_step_outputs, 0, sizeof(vsi_nn_tensor_t *) * timestep); + + /* split input tensor by time-step */ + split_input_tensor(self, trans_inputs[CONV2D_LSTM_IN_INPUT], split_outputs, (uint32_t)timestep); + + cell_out0 = cell_out1 = cell_out2 = NULL; + step_h_state = trans_inputs[CONV2D_LSTM_IN_H_STATE]; + step_c_state = trans_inputs[CONV2D_LSTM_IN_C_STATE]; + for( i = 0; i < timestep; i++ ) + { + vsi_nn_tensor_t * reshape_output = NULL; + + /* reshape for split output */ + tmp_tensor = reshape_split_out(self, split_outputs[i]); + reshape_output = tmp_tensor->t; + + if((i == timestep - 1) && p->return_sequences == FALSE && p->data_format == CONV2D_LSTM_CHANNELS_FIRST) + { + cell_out0 = outputs[CONV2D_LSTM_OUT_OUTPUT]; + } + else + { + vsi_nn_internal_init_tensor_attr(&attr, &outputs[CONV2D_LSTM_OUT_OUTPUT]->attr.dtype, TRUE); + tmp_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + cell_out0 = tmp_tensor->t; + } + + if((i == timestep - 1) && p->data_format == CONV2D_LSTM_CHANNELS_FIRST) + { + cell_out1 = outputs[CONV2D_LSTM_OUT_H_STATE]; + cell_out2 = outputs[CONV2D_LSTM_OUT_C_STATE]; + } + else + { + /* conv2d_lstm hstate output */ + vsi_nn_internal_init_tensor_attr(&attr, &outputs[CONV2D_LSTM_OUT_H_STATE]->attr.dtype, TRUE); + tmp_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + cell_out1 = tmp_tensor->t; + + /* conv2d_lstm cstate output */ + vsi_nn_internal_init_tensor_attr(&attr, &outputs[CONV2D_LSTM_OUT_C_STATE]->attr.dtype, TRUE); + tmp_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + cell_out2 = tmp_tensor->t; + } + + /* create a conv2d_lstm_cell */ + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONV2D_LSTM_CELL, 0, 0 ); + curr->node->nn_param.conv2d_lstm_cell.filters = p->filters; + curr->node->nn_param.conv2d_lstm_cell.activation = p->activation; + curr->node->nn_param.conv2d_lstm_cell.recurrent_activation = 
p->recurrent_activation; + memcpy(&curr->node->nn_param.conv2d_lstm_cell.conv2d, &p->conv2d, sizeof(p->conv2d)); + + curr->inputs[CONV2D_LSTM_CELL_IN_INPUT] = reshape_output; + curr->inputs[CONV2D_LSTM_CELL_IN_H_STATE] = step_h_state; + curr->inputs[CONV2D_LSTM_CELL_IN_C_STATE] = step_c_state; + curr->inputs[CONV2D_LSTM_CELL_IN_KERNEL_I2I] = inputs[CONV2D_LSTM_IN_KERNEL_I2I]; + curr->inputs[CONV2D_LSTM_CELL_IN_KERNEL_I2F] = inputs[CONV2D_LSTM_IN_KERNEL_I2F]; + curr->inputs[CONV2D_LSTM_CELL_IN_KERNEL_I2C] = inputs[CONV2D_LSTM_IN_KERNEL_I2C]; + curr->inputs[CONV2D_LSTM_CELL_IN_KERNEL_I2O] = inputs[CONV2D_LSTM_IN_KERNEL_I2O]; + curr->inputs[CONV2D_LSTM_CELL_IN_KERNEL_R2I] = inputs[CONV2D_LSTM_IN_KERNEL_R2I]; + curr->inputs[CONV2D_LSTM_CELL_IN_KERNEL_R2F] = inputs[CONV2D_LSTM_IN_KERNEL_R2F]; + curr->inputs[CONV2D_LSTM_CELL_IN_KERNEL_R2C] = inputs[CONV2D_LSTM_IN_KERNEL_R2C]; + curr->inputs[CONV2D_LSTM_CELL_IN_KERNEL_R2O] = inputs[CONV2D_LSTM_IN_KERNEL_R2O]; + curr->inputs[CONV2D_LSTM_CELL_IN_BIAS_I] = inputs[CONV2D_LSTM_IN_BIAS_I]; + curr->inputs[CONV2D_LSTM_CELL_IN_BIAS_F] = inputs[CONV2D_LSTM_IN_BIAS_F]; + curr->inputs[CONV2D_LSTM_CELL_IN_BIAS_C] = inputs[CONV2D_LSTM_IN_BIAS_C]; + curr->inputs[CONV2D_LSTM_CELL_IN_BIAS_O] = inputs[CONV2D_LSTM_IN_BIAS_O]; + curr->outputs[CONV2D_LSTM_CELL_OUT_OUTPUT] = cell_out0; + curr->outputs[CONV2D_LSTM_CELL_OUT_H_STATE] = cell_out1; + curr->outputs[CONV2D_LSTM_CELL_OUT_C_STATE] = cell_out2; + + vsi_nn_internal_setup_node( self, curr ); + + /* update the state tensor for next time-step hstate and cstate */ + step_h_state = cell_out1; + step_c_state = cell_out2; + + if(p->return_sequences == TRUE) + { + /* store step's outputs */ + tmp_tensor = reshape_cell_out(self, cell_out0); + conv2dlstm_step_outputs[i] = tmp_tensor->t; + } + } + + if(p->return_sequences == TRUE) + { + if(p->data_format == CONV2D_LSTM_CHANNELS_LAST) + { + vsi_nn_internal_init_tensor_attr(&attr, &outputs[CONV2D_LSTM_OUT_OUTPUT]->attr.dtype, TRUE); + tmp_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + conv2dlstm_outputs[CONV2D_LSTM_OUT_OUTPUT] = tmp_tensor->t; + } + else + { + conv2dlstm_outputs[CONV2D_LSTM_OUT_OUTPUT] = outputs[CONV2D_LSTM_OUT_OUTPUT]; + } + /* concat all step's output0 data on dimension t --- cell out0 shape: [w,h,c,t,n] */ + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, (uint32_t)timestep, 1 ); + curr->node->nn_param.concat.axis = 3; + for(i = 0; i < timestep; i++) + { + curr->inputs[i] = conv2dlstm_step_outputs[i]; + } + curr->outputs[0] = conv2dlstm_outputs[CONV2D_LSTM_OUT_OUTPUT]; + vsi_nn_internal_setup_node( self, curr ); + } + else + { + conv2dlstm_outputs[CONV2D_LSTM_OUT_OUTPUT] = cell_out0; + } + + conv2dlstm_outputs[CONV2D_LSTM_OUT_H_STATE] = cell_out1; + conv2dlstm_outputs[CONV2D_LSTM_OUT_C_STATE] = cell_out2; + if(p->data_format == CONV2D_LSTM_CHANNELS_LAST) + { + trans_output_tensor(self, conv2dlstm_outputs, outputs); + } + + vsi_nn_safe_free(split_outputs); + vsi_nn_safe_free(conv2dlstm_step_outputs) + return TRUE; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + vsi_nn_internal_deinit_node_wksp( self ); + return status; +} /* op_deinit() */ + +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + return status; +} /* op_init() */ + +#ifdef __cpluplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ CONV2D_LSTM, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ 
op_check, + /* setup */ op_setup, + /* optimize */ op_optimize, + /* input_num */ CONV2D_LSTM_IN_CNT, + /* output_num */ CONV2D_LSTM_OUT_CNT + ); +#ifdef __cpluplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c new file mode 100644 index 0000000..4399d22 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv2d_lstm_cell.c @@ -0,0 +1,414 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +#include <string.h> +#include <stdlib.h> + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "utils/vsi_nn_math.h" +#include "utils/vsi_nn_util.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "libnnext/vsi_nn_vxkernel.h" +#include "vsi_nn_internal_node.h" +#include "vsi_nn_rnn_helper.h" + +static vsi_nn_internal_tensor_t * reshape_tensor_to_act + ( + vsi_nn_node_t* self, + vsi_nn_tensor_t * tensor + ) +{ + vsi_nn_internal_tensor_t * reshape_out = NULL; + vsi_size_t i,dim,reshaped_size[2]; + vsi_size_t sz; + + memset(reshaped_size, 0, sizeof(vsi_size_t) * 2); + dim = 2; + sz = 1; + for(i = 0; i < tensor->attr.dim_num - 1; i++) + { + sz *= tensor->attr.size[i]; + } + + /* reshape 4d tensor to [-1, 0] */ + reshaped_size[0] = sz; + reshaped_size[1] = tensor->attr.size[tensor->attr.dim_num - 1]; + reshape_out = vsi_nn_rnn_create_reshape(self, tensor, NULL, reshaped_size, dim, TRUE); + + return reshape_out; +} /* reshape_tensor_to_act() */ + +static vsi_nn_internal_tensor_t * create_input_conv + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input, + vsi_nn_tensor_t * weight, + vsi_nn_tensor_t * bias + ) +{ + vsi_nn_tensor_attr_t attr; + vsi_nn_conv2d_lstm_cell_param * p; + vsi_nn_internal_tensor_t * input_conv_out = NULL, * reshape_out = NULL; + vsi_nn_internal_node_t * input_conv= NULL; + + p = &self->nn_param.conv2d_lstm_cell; + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + attr.dim_num = VSI_NN_DIM_AUTO; + attr.vtl = TRUE; + attr.is_const = FALSE; + input_conv_out = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + + input_conv = vsi_nn_internal_new_node(self,
VSI_NN_OP_CONV2D, 0, 0 ); + input_conv->node->nn_param.conv2d.group = 1; + input_conv->node->nn_param.conv2d.ksize[0] = p->conv2d.ksize[0]; + input_conv->node->nn_param.conv2d.ksize[1] = p->conv2d.ksize[1]; + input_conv->node->nn_param.conv2d.weights = p->filters; + input_conv->node->nn_param.conv2d.pad[0] = p->conv2d.pad[0]; + input_conv->node->nn_param.conv2d.pad[1] = p->conv2d.pad[1]; + input_conv->node->nn_param.conv2d.pad[2] = p->conv2d.pad[2]; + input_conv->node->nn_param.conv2d.pad[3] = p->conv2d.pad[3]; + input_conv->node->nn_param.conv2d.pad_type = VSI_NN_PAD_AUTO; + input_conv->node->nn_param.conv2d.stride[0] = p->conv2d.stride[0]; + input_conv->node->nn_param.conv2d.stride[1] = p->conv2d.stride[1]; + input_conv->node->nn_param.conv2d.dilation[0] = p->conv2d.dilation[0]; + input_conv->node->nn_param.conv2d.dilation[1] = p->conv2d.dilation[1]; + input_conv->node->nn_param.conv2d.multiplier = 0; + input_conv->node->vx_param.overflow_policy = self->vx_param.overflow_policy; + input_conv->node->vx_param.rounding_policy = self->vx_param.rounding_policy; + input_conv->node->vx_param.down_scale_size_rounding = self->vx_param.down_scale_size_rounding; + + input_conv->inputs[0] = input; + input_conv->inputs[1] = weight; + input_conv->inputs[2] = bias; + input_conv->outputs[0] = input_conv_out->t; + vsi_nn_internal_setup_node(self, input_conv); + + // reshape whcn --> xn + reshape_out = reshape_tensor_to_act(self, input_conv_out->t); + + return reshape_out; +} /* create_input_conv() */ + +static vsi_nn_internal_tensor_t * create_recurrent_conv + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input, + vsi_nn_tensor_t * weight + ) +{ + vsi_nn_tensor_attr_t attr; + vsi_nn_conv2d_lstm_cell_param * p = NULL; + vsi_nn_tensor_t * bias = NULL; + vsi_nn_internal_tensor_t * recurrent_conv_out = NULL; + vsi_nn_internal_tensor_t * internal_bias = NULL; + vsi_nn_internal_tensor_t * reshape_out = NULL; + vsi_nn_internal_node_t * recurrent_conv = NULL; + + p = &self->nn_param.conv2d_lstm_cell; + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + + internal_bias = vsi_nn_internal_create_zero_bias_tensor( + self, &input->attr, &weight->attr, VSI_NN_OP_CONV2D, FALSE); + bias = internal_bias->t; + + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + attr.dim_num = VSI_NN_DIM_AUTO; + attr.vtl = TRUE; + attr.is_const = FALSE; + recurrent_conv_out = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + + recurrent_conv = vsi_nn_internal_new_node(self, VSI_NN_OP_CONV2D, 0, 0 ); + recurrent_conv->node->nn_param.conv2d.pad_type = VSI_NN_PAD_SAME; + recurrent_conv->node->nn_param.conv2d.group = 1; + recurrent_conv->node->nn_param.conv2d.ksize[0] = p->conv2d.ksize[0]; + recurrent_conv->node->nn_param.conv2d.ksize[1] = p->conv2d.ksize[1]; + recurrent_conv->node->nn_param.conv2d.stride[0] = 1; + recurrent_conv->node->nn_param.conv2d.stride[1] = 1; + recurrent_conv->node->nn_param.conv2d.dilation[0] = 1; + recurrent_conv->node->nn_param.conv2d.dilation[1] = 1; + recurrent_conv->node->nn_param.conv2d.weights = p->filters; + recurrent_conv->node->nn_param.conv2d.multiplier = 0; + + recurrent_conv->node->vx_param.overflow_policy = self->vx_param.overflow_policy; + recurrent_conv->node->vx_param.rounding_policy = self->vx_param.rounding_policy; + recurrent_conv->node->vx_param.down_scale_size_rounding = self->vx_param.down_scale_size_rounding; + + recurrent_conv->inputs[0] = input; + recurrent_conv->inputs[1] = weight; + recurrent_conv->inputs[2] = bias; + + recurrent_conv->outputs[0] = 
recurrent_conv_out->t; + vsi_nn_internal_setup_node(self, recurrent_conv); + + // reshape whcn --> xn + reshape_out = reshape_tensor_to_act(self, recurrent_conv_out->t); + return reshape_out; +} /* create_recurrent_conv() */ + +static vsi_bool setup_op_shapes + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = TRUE; + vsi_nn_conv2d_lstm_cell_param *p = &self->nn_param.conv2d_lstm_cell; + vsi_size_t w_out, h_out, samples, out_channel; + vsi_size_t ksize[_cnt_of_array(p->conv2d.ksize)]; + vsi_size_t i, pad[_cnt_of_array(p->conv2d.pad)] = {0}; + + w_out = 0; + h_out = 0; + for( i = 0; i < _cnt_of_array(p->conv2d.ksize); i++ ) + { + ksize[i] = (vsi_size_t)p->conv2d.ksize[i]; + } + samples = inputs[CONV2D_LSTM_CELL_IN_INPUT]->attr.size[3]; + out_channel = p->filters; + for(i = 0; i < _cnt_of_array(p->conv2d.pad); i++) + { + pad[i] = self->nn_param.conv2d.pad[i]; + } + + vsi_nn_compute_padding( + inputs[CONV2D_LSTM_CELL_IN_INPUT]->attr.size, + ksize, + p->conv2d.stride, + p->conv2d.dilation, + p->conv2d.pad_type, + pad + ); + for(i = 0; i < _cnt_of_array(p->conv2d.ksize); i++) + { + self->nn_param.conv2d.ksize[i] = (uint32_t)ksize[i]; + } + for(i = 0; i < _cnt_of_array(p->conv2d.pad); i++) + { + self->nn_param.conv2d.pad[i] = (uint32_t)pad[i]; + } + + w_out = vsi_nn_ComputeFilterSize( + inputs[CONV2D_LSTM_CELL_IN_INPUT]->attr.size[0], + p->conv2d.ksize[0], + &p->conv2d.pad[0], + p->conv2d.stride[0], + p->conv2d.dilation[0], + VSI_NN_ROUND_FLOOR + ); + h_out = vsi_nn_ComputeFilterSize( + inputs[CONV2D_LSTM_CELL_IN_INPUT]->attr.size[1], + p->conv2d.ksize[1], + &p->conv2d.pad[2], + p->conv2d.stride[1], + p->conv2d.dilation[1], + VSI_NN_ROUND_FLOOR + ); + + /* setup conv2d lstm cell output tensors' shape */ + /* output */ + if(VSI_NN_DIM_AUTO == outputs[CONV2D_LSTM_CELL_OUT_OUTPUT]->attr.dim_num) + { + outputs[CONV2D_LSTM_CELL_OUT_OUTPUT]->attr.size[0] = w_out; + outputs[CONV2D_LSTM_CELL_OUT_OUTPUT]->attr.size[1] = h_out; + outputs[CONV2D_LSTM_CELL_OUT_OUTPUT]->attr.size[2] = out_channel; + outputs[CONV2D_LSTM_CELL_OUT_OUTPUT]->attr.size[3] = samples; + outputs[CONV2D_LSTM_CELL_OUT_OUTPUT]->attr.dim_num = 4; + } + + /* hidden state output */ + if(VSI_NN_DIM_AUTO == outputs[CONV2D_LSTM_CELL_OUT_H_STATE]->attr.dim_num) + { + memcpy(outputs[CONV2D_LSTM_CELL_OUT_H_STATE]->attr.size, outputs[CONV2D_LSTM_CELL_OUT_OUTPUT]->attr.size, + sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM); + outputs[CONV2D_LSTM_CELL_OUT_H_STATE]->attr.dim_num = outputs[CONV2D_LSTM_CELL_OUT_OUTPUT]->attr.dim_num; + } + + /* hidden state output */ + if(VSI_NN_DIM_AUTO == outputs[CONV2D_LSTM_CELL_OUT_C_STATE]->attr.dim_num) + { + memcpy(outputs[CONV2D_LSTM_CELL_OUT_C_STATE]->attr.size, outputs[CONV2D_LSTM_CELL_OUT_OUTPUT]->attr.size, + sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM); + outputs[CONV2D_LSTM_CELL_OUT_C_STATE]->attr.dim_num = outputs[CONV2D_LSTM_CELL_OUT_OUTPUT]->attr.dim_num; + } + + return ret; +} /* setup_op_shapes() */ + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + return vsi_nn_internal_compute_node( self ); +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /*TODO: Check tensor shapes. 
*/ + return TRUE; +} /* op_check() */ + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + return vsi_nn_internal_optimize_node( self, direction ); +} /* op_optimize() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t i; + vsi_nn_internal_tensor_t * input_conv_outputs[CONV2D_LSTM_CELL_GATE_NUM] = { NULL }; + vsi_nn_internal_tensor_t * recurrent_conv_outputs[CONV2D_LSTM_CELL_GATE_NUM] = { NULL }; + vsi_nn_internal_node_t* curr = NULL; + vsi_nn_internal_tensor_t * reshape_cell_in = NULL; + vsi_nn_internal_tensor_t * reshape_out = NULL; + vsi_nn_internal_tensor_t * reshape_h_out = NULL; + vsi_nn_internal_tensor_t * reshape_c_out = NULL; + vsi_nn_conv2d_lstm_cell_param * p = &self->nn_param.conv2d_lstm_cell; + + vsi_nn_internal_init_node_wksp( self ); + + /* compute output tensor's shapes */ + setup_op_shapes(self, inputs, outputs); + + /* create input convolution */ + for(i = 0; i < CONV2D_LSTM_CELL_GATE_NUM; i++) + { + input_conv_outputs[i] = create_input_conv( + self, + inputs[CONV2D_LSTM_CELL_IN_INPUT], + inputs[CONV2D_LSTM_CELL_IN_KERNEL_I2I + i], + inputs[CONV2D_LSTM_CELL_IN_BIAS_I + i] + ); + } + + /* create recurrent convolution */ + for(i = 0; i < CONV2D_LSTM_CELL_GATE_NUM; i++) + { + recurrent_conv_outputs[i] = create_recurrent_conv( + self, + inputs[CONV2D_LSTM_CELL_IN_H_STATE], + inputs[CONV2D_LSTM_CELL_IN_KERNEL_R2I + i] + ); + } + + /* activations */ + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_LSTMUNIT_ACTIVATION, 0, 0 ); + curr->node->nn_param.lstmunit_activation.cell_clip = 0; + curr->node->nn_param.lstmunit_activation.proj_clip = 0; + curr->node->nn_param.lstmunit_activation.forget_bias = 0; + curr->node->nn_param.lstmunit_activation.is_cifg = 0; + curr->node->nn_param.lstmunit_activation.is_projection = 0; + curr->node->nn_param.lstmunit_activation.is_layer_norm = 0; + curr->node->nn_param.lstmunit_activation.is_peephole = FALSE; + curr->node->nn_param.lstmunit_activation.is_hybrid = 0; + curr->node->nn_param.lstmunit_activation.recurrent_activation = p->recurrent_activation; + + reshape_cell_in = reshape_tensor_to_act(self, inputs[CONV2D_LSTM_CELL_IN_C_STATE]); + curr->inputs[LSTMUNIT_ACT_CSTATE_IN] = reshape_cell_in->t; + for(i = 0; i < CONV2D_LSTM_CELL_GATE_NUM; i++) + { + curr->inputs[LSTMUNIT_ACT_LN_WI + i] = NULL; + curr->inputs[LSTMUNIT_ACT_INPUT_FC_I + i] = input_conv_outputs[i]->t; + curr->inputs[LSTMUNIT_ACT_HSTATE_FC_I + i] = recurrent_conv_outputs[i]->t; + } + reshape_out = reshape_tensor_to_act(self, outputs[CONV2D_LSTM_CELL_OUT_OUTPUT]); + reshape_h_out = reshape_tensor_to_act(self, outputs[CONV2D_LSTM_CELL_OUT_H_STATE]); + reshape_c_out = reshape_tensor_to_act(self, outputs[CONV2D_LSTM_CELL_OUT_C_STATE]); + + curr->outputs[LSTMUNIT_ACT_OUTPUT] = reshape_out->t; + curr->outputs[LSTMUNIT_ACT_CSTATE_OUT] = reshape_c_out->t; + curr->outputs[LSTMUNIT_ACT_HSTATE_OUT] = reshape_h_out->t; + vsi_nn_internal_setup_node(self, curr); + + return TRUE; +} /* op_setup() */ + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + return status; +} /* op_deinit() */ + +static vsi_status op_init + ( + vsi_nn_node_t * self + ) +{ + vsi_status status = VSI_SUCCESS; + + return status; +} /* op_init() */ + +#ifdef __cpluplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ CONV2D_LSTM_CELL, + /* init */ op_init, + 
/* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ op_optimize, + /* input_num */ CONV2D_LSTM_CELL_IN_CNT, + /* output_num */ CONV2D_LSTM_CELL_OUT_CNT + ); +#ifdef __cpluplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv_relu.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv_relu.c index 680af0a..6a6647f 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv_relu.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv_relu.c @@ -190,9 +190,15 @@ static vsi_status op_optimize inputs[1]->wb = vxCreateWeightsBiasesParameterFromTensors2( VX_CONVOLUTIONAL_NETWORK_CONVOLUTION_LAYER, 4, +#ifdef VSI_40BIT_VA_SUPPORT inputs[0]->attr.size, outputs[0]->attr.size, outputs[0]->attr.size, +#else + (vx_uint32*)inputs[0]->attr.size, + (vx_uint32*)outputs[0]->attr.size, + (vx_uint32*)outputs[0]->attr.size, +#endif outputs[0]->attr.dtype.vx_type, (vx_nn_convolution_relu_pooling_params_t *)&p, sizeof(p), diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_conv_relu_pool.c b/src/tim/vx/internal/src/ops/vsi_nn_op_conv_relu_pool.c index 2c56d23..f49ef3b 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_conv_relu_pool.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_conv_relu_pool.c @@ -175,7 +175,6 @@ static vsi_status op_optimize vx_nn_convolution_relu_pooling_params_ext2_t p; vx_weights_biases_parameter_optimizations_t opt; vx_weights_biases_parameter_optimizations_t * p_opt; - ret = FALSE; status = VSI_FAILURE; @@ -215,6 +214,7 @@ static vsi_status op_optimize p_opt = &opt; } +#ifdef VSI_40BIT_VA_SUPPORT inputs[1]->wb = vxCreateWeightsBiasesParameterFromTensors2( VX_CONVOLUTIONAL_NETWORK_CONVOLUTION_LAYER, 4, @@ -227,6 +227,32 @@ static vsi_status op_optimize p_opt, inputs[1]->t, inputs[2]->t ); +#else + { + uint32_t size_u32_input0[VSI_NN_MAX_DIM_NUM]; + uint32_t size_u32_pconv_out[VSI_NN_MAX_DIM_NUM]; + uint32_t size_u32_output0[VSI_NN_MAX_DIM_NUM]; + size_t i = 0; + for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + size_u32_input0[i] = (uint32_t)inputs[0]->attr.size[i]; + size_u32_pconv_out[i] = (uint32_t)pconv_out->attr.size[i]; + size_u32_output0[i] = (uint32_t)outputs[0]->attr.size[i]; + } + inputs[1]->wb = vxCreateWeightsBiasesParameterFromTensors2( + VX_CONVOLUTIONAL_NETWORK_CONVOLUTION_LAYER, + 4, + size_u32_input0, + size_u32_pconv_out, + size_u32_output0, + outputs[0]->attr.dtype.vx_type, + (vx_nn_convolution_relu_pooling_params_t *)&p, + sizeof(p), + p_opt, + inputs[1]->t, inputs[2]->t + ); + } +#endif vsi_nn_DeinitConvReluPoolParameter( &p ); } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_crop.c b/src/tim/vx/internal/src/ops/vsi_nn_op_crop.c index f99aa44..098c935 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_crop.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_crop.c @@ -57,8 +57,8 @@ static vsi_status op_compute vsi_nn_tensor_t *end_dims_tensor = NULL; vsi_nn_tensor_t *stride_dims_tensor = NULL; vsi_nn_tensor_attr_t attr; - int32_t start[VSI_NN_MAX_DIM_NUM] = {0}; - int32_t end[VSI_NN_MAX_DIM_NUM] = {0}; + vsi_size_t start[VSI_NN_MAX_DIM_NUM] = {0}; + vsi_size_t end[VSI_NN_MAX_DIM_NUM] = {0}; int32_t stride[VSI_NN_MAX_DIM_NUM] = {0}; uint32_t i; @@ -146,30 +146,11 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { - BEGIN_IO_TYPE_DECL(CROP, 1, 1) - IO_TYPE(D_F32, D_F32) - IO_TYPE(D_I32, D_I32) - IO_TYPE(D_F16, D_F16) - IO_TYPE(D_F16, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_I16|Q_DFP) - IO_TYPE(D_F16, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_F16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) 
- IO_TYPE(D_I16|Q_DFP, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16) - END_IO_TYPE_DECL(CROP) - if (!VALIDATE_OP_IO_TYPES(CROP, self, inputs, self->input.num, outputs, self->output.num)) - { - char* desc = generate_op_io_types_desc(inputs, - self->input.num, outputs, self->output.num); - VSILOGE("Inputs/Outputs data type not support: %s", desc); - destroy_op_io_types_desc(desc); - return FALSE; - } + vsi_bool ret = FALSE; - return TRUE; + ret = vsi_nn_OpCheck(VSI_NN_OP_STRIDED_SLICE, self, inputs, outputs); + + return ret; } static vsi_bool op_setup @@ -249,4 +230,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c index c3317e0..d91a7e6 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c @@ -41,7 +41,6 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - if (self->nn_param.dataconvert.lcl_data->use_reshape == FALSE && inputs[0]->t != NULL && outputs[0]->t != NULL) { @@ -105,8 +104,13 @@ static vsi_status op_optimize { if(NULL == inputs[0]->t && NULL != outputs[0]->t) { +#ifdef VSI_40BIT_VA_SUPPORT inputs[0]->t = vxReshapeTensor(outputs[0]->t, - (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num); + inputs[0]->attr.size, inputs[0]->attr.dim_num); +#else + inputs[0]->t = vxReshapeTensor(outputs[0]->t, + (vx_int32*)inputs[0]->attr.size, inputs[0]->attr.dim_num); +#endif if( inputs[0]->t == NULL ) { VSILOGE("Call vxReshapeTensor fail"); @@ -119,8 +123,13 @@ static vsi_status op_optimize { if(NULL == outputs[0]->t && NULL != inputs[0]->t) { +#ifdef VSI_40BIT_VA_SUPPORT outputs[0]->t = vxReshapeTensor(inputs[0]->t, - (int32_t *)outputs[0]->attr.size, outputs[0]->attr.dim_num); + outputs[0]->attr.size, outputs[0]->attr.dim_num); +#else + outputs[0]->t = vxReshapeTensor(inputs[0]->t, + (vx_int32*)outputs[0]->attr.size, outputs[0]->attr.dim_num); +#endif if( outputs[0]->t == NULL ) { VSILOGE("Call vxReshapeTensor fail"); @@ -159,7 +168,6 @@ static vsi_status op_deinit { if(self->nn_param.dataconvert.lcl_data) { - free(self->nn_param.dataconvert.lcl_data); self->nn_param.dataconvert.lcl_data = NULL; } @@ -288,6 +296,17 @@ static vsi_bool op_check IO_TYPE(D_U32, D_U16) IO_TYPE(D_U32, D_U8|Q_ASYM) IO_TYPE(D_U32, D_U8) + + /* HW 9.0.1 */ + IO_TYPE(D_I8|Q_DFP, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_F32) + IO_TYPE(D_I16|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_BF16) + IO_TYPE(D_I16|Q_DFP, D_F32) + IO_TYPE(D_F16, D_F32) END_IO_TYPE_DECL(DATACONVERT) if (!VALIDATE_OP_IO_TYPES(DATACONVERT, self, inputs, self->input.num, outputs, self->output.num)) { @@ -320,4 +339,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c index 133141d..4adcc43 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution.c @@ -172,7 +172,6 @@ final: } #endif return status; - } /* op_compute() */ static vsi_bool op_check @@ -220,6 +219,142 @@ static vsi_bool op_check /* HW 9.0 */ IO_TYPE(D_F32, D_BF16, D_F32, D_BF16) IO_TYPE(D_F32, D_BF16, D_NONE, D_BF16) + + /* HW 9.0.1 */ + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_BF16) + 
IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) + + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_F32) + + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) + + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_F16) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_F32) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_F32) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F32) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, 
D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_F32) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_F32) + + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_F16) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_F32) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_NONE, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_NONE, D_F32) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F16) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) + + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_BF16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_F32) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_BF16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_F32) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_BF16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_F32) + + IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_NONE, D_BF16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_NONE, D_F32) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) + + IO_TYPE(D_F16, D_F16, D_NONE, D_BF16) + IO_TYPE(D_F16, D_F16, D_NONE, D_F32) + IO_TYPE(D_F16, D_F16, D_F32, D_BF16) + IO_TYPE(D_F16, D_F16, D_F32, D_F32) + + IO_TYPE(D_BF16, D_BF16, D_NONE, D_F16) + IO_TYPE(D_BF16, D_BF16, D_F32, D_F16) + + IO_TYPE(D_F32, D_BF16, D_NONE, D_F16) + IO_TYPE(D_F32, D_BF16, D_NONE, D_BF16) + IO_TYPE(D_F32, D_BF16, D_NONE, D_F32) + IO_TYPE(D_F32, D_BF16, D_F32, D_F16) + IO_TYPE(D_F32, D_BF16, D_F32, D_BF16) + IO_TYPE(D_F32, D_BF16, D_F32, D_F32) + END_IO_TYPE_DECL(DECONVOLUTION) if (!VALIDATE_OP_IO_TYPES(DECONVOLUTION, self, inputs, self->input.num, outputs, self->output.num)) { @@ -244,8 +379,8 @@ static vsi_bool op_setup ) { vsi_nn_deconv_param *nn_param; - uint32_t perm[] = { 3, 2, 0, 1 }; - uint32_t perm1[] = { 0, 1, 3, 2 }; + 
vsi_size_t perm[] = { 3, 2, 0, 1 }; + vsi_size_t perm1[] = { 0, 1, 3, 2 }; /* TODO: Driver should handle this, * Check transpose @@ -287,9 +422,8 @@ static vsi_bool op_setup nn_param->group = ( 0 == nn_param->group ) ? 1 : nn_param->group; - nn_param->ksize[0] = inputs[1]->attr.size[0]; - nn_param->ksize[1] = inputs[1]->attr.size[1]; - + nn_param->ksize[0] = (uint32_t)inputs[1]->attr.size[0]; + nn_param->ksize[1] = (uint32_t)inputs[1]->attr.size[1]; if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { @@ -344,4 +478,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution1d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution1d.c index 3e971e7..982d0d4 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution1d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_deconvolution1d.c @@ -46,7 +46,7 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - uint32_t perm[] = { 0, 1, 3, 2 }; + vsi_size_t perm[] = { 0, 1, 3, 2 }; vsi_nn_tensor_attr_t weight_attr; vsi_status status = VSI_FAILURE; vsi_nn_kernel_param_t * param = NULL; @@ -159,7 +159,7 @@ static vsi_bool op_setup #endif nn_param->group = ( 0 == nn_param->group ) ? 1 : nn_param->group; - nn_param->ksize = inputs[1]->attr.size[0]; + nn_param->ksize = (uint32_t)inputs[1]->attr.size[0]; if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_depth2space.c b/src/tim/vx/internal/src/ops/vsi_nn_op_depth2space.c index 53c41ec..551aa59 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_depth2space.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_depth2space.c @@ -98,7 +98,6 @@ static vsi_status op_compute return status; } - return status; } /* op_compute() */ @@ -127,13 +126,17 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + vsi_bool ret = FALSE; + if(self->nn_param.depth2space.block_size < 0) { VSILOGE("Block size can't be less than zero in depth to space"); return FALSE; } - return TRUE; + ret = vsi_nn_OpCheck(VSI_NN_OP_STRIDED_SLICE, self, inputs, outputs); + + return ret; } /* op_check() */ static void op_set_depth2space_param_value(vsi_nn_nn_param_t *nn_param, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_depthwise_conv1d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_depthwise_conv1d.c index 4835ab7..2a9f688 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_depthwise_conv1d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_depthwise_conv1d.c @@ -114,7 +114,7 @@ static vsi_bool op_setup outputs[0]->attr.size[0] = vsi_nn_ComputeFilterSize ( inputs[0]->attr.size[0], - inputs[1]->attr.size[0], + (uint32_t)inputs[1]->attr.size[0], p->pad, p->stride, p->dilation, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c index 528a72f..a82f521 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c @@ -48,8 +48,8 @@ static vsi_status _eltwise_op_compute { vsi_status status; vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; - int32_t shapes[3][VSI_NN_MAX_DIM_NUM] = { { 0 } }; - uint32_t new_rank = 0; + vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = { { 0 } }; + vsi_size_t new_rank = 0; vsi_bool ret = TRUE; vx_bool doShapeOptimized = TRUE; vsi_nn_kernel_param_t * param = NULL; @@ -76,9 +76,9 @@ static vsi_status _eltwise_op_compute if (doShapeOptimized) { ret = vsi_nn_kernel_optimize_eltwise_shape( - (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, - (int32_t *)inputs[1]->attr.size, inputs[1]->attr.dim_num, - 
(int32_t *)outputs[0]->attr.size, outputs[0]->attr.dim_num, + inputs[0]->attr.size, inputs[0]->attr.dim_num, + inputs[1]->attr.size, inputs[1]->attr.dim_num, + outputs[0]->attr.size, outputs[0]->attr.dim_num, shapes[0], shapes[1], shapes[2], &new_rank ); } @@ -93,11 +93,11 @@ static vsi_status _eltwise_op_compute if (doShapeOptimized) { reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, - inputs[0], (uint32_t*)shapes[0], new_rank ); + inputs[0], shapes[0], new_rank ); reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, - inputs[1], (uint32_t*)shapes[1], new_rank ); + inputs[1], shapes[1], new_rank ); reshape_tensors[2] = vsi_nn_reshape_tensor( self->graph, - outputs[0], (uint32_t*)shapes[2], new_rank ); + outputs[0], shapes[2], new_rank ); } self->n = (vx_node)vsi_nn_kernel_selector( self->graph, @@ -130,7 +130,7 @@ vsi_bool vsi_nn_op_eltwise_setup ) { uint32_t i, j, out_rank, in2_rank; - uint32_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; vsi_bool ret = TRUE; out_rank = inputs[0]->attr.dim_num; @@ -142,7 +142,7 @@ vsi_bool vsi_nn_op_eltwise_setup for(i = 0; i < out_rank; i++) { - uint32_t sz0, sz1; + vsi_size_t sz0, sz1; sz0 = i < inputs[0]->attr.dim_num ? inputs[0]->attr.size[i] : 1; for ( j = 1; j < self->input.num; j++) @@ -164,18 +164,18 @@ vsi_bool vsi_nn_op_eltwise_setup if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { outputs[0]->attr.dim_num = out_rank; - memcpy( outputs[0]->attr.size, shape, out_rank * sizeof(uint32_t) ); + memcpy( outputs[0]->attr.size, shape, out_rank * sizeof(vsi_size_t) ); } else { - uint32_t total_size_got; - uint32_t total_size_expected; + vsi_size_t total_size_got; + vsi_size_t total_size_expected; total_size_expected = vsi_nn_ShapeProduct( shape, out_rank ); total_size_got = vsi_nn_ShapeProduct( outputs[0]->attr.size, outputs[0]->attr.dim_num ); if( total_size_expected != total_size_got ) { - VSILOGW("Output size mismatch, expect %d, but got %d", + VSILOGW("Output size mismatch, expect %"VSI_SIZE_T_SPECIFIER", but got %"VSI_SIZE_T_SPECIFIER"", total_size_expected, total_size_got); ret = FALSE; } @@ -332,60 +332,104 @@ static vsi_bool op_check_add { /* check inputs outputs data type */ BEGIN_IO_TYPE_DECL(ADD, 2, 1) - IO_TYPE(D_BF16, D_BF16, D_BF16) - IO_TYPE(D_F32, D_F32, D_BF16) - IO_TYPE(D_BF16, D_BF16, D_F32) - IO_TYPE(D_F16, D_F16, D_F16) - IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) - IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) - IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_I16|Q_DFP, D_F16) - IO_TYPE(D_F16, D_I8|Q_DFP, D_F16) - IO_TYPE(D_F16, D_U8|Q_ASYM, D_F16) - IO_TYPE(D_F16, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_F16, D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) - IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) - IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP, D_U8|Q_ASYM) - 
IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP, D_F16) - IO_TYPE(D_I16|Q_DFP, D_I8|Q_DFP, D_F16) - IO_TYPE(D_I16|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM, D_F16) - IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_I32, D_U8|Q_ASYM) - IO_TYPE(D_I8, D_I8, D_I8) - IO_TYPE(D_I8, D_I8, D_U8) - IO_TYPE(D_I8, D_I8, D_U8|Q_ASYM) - IO_TYPE(D_F32, D_F32, D_F32) - IO_TYPE(D_F32, D_F32, D_F16) - IO_TYPE(D_F32, D_F16, D_F32) - IO_TYPE(D_F32, D_F16, D_F16) - IO_TYPE(D_F16, D_F32, D_F32) - IO_TYPE(D_F16, D_F32, D_F16) - IO_TYPE(D_F16, D_F16, D_F32) - IO_TYPE(D_I32, D_I32, D_I32) - IO_TYPE(D_I16, D_I32, D_I32) - IO_TYPE(D_I32, D_I16, D_I32) - IO_TYPE(D_I32, D_I32, D_U8|Q_ASYM) - IO_TYPE(D_I32, D_I32, D_I16|Q_DFP) - IO_TYPE(D_I32, D_I32, D_I8|Q_DFP) + IO_TYPE(D_BF16, D_BF16, D_BF16) + IO_TYPE(D_F32, D_F32, D_BF16) + IO_TYPE(D_BF16, D_BF16, D_F32) + IO_TYPE(D_F16, D_F16, D_F16) + IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_DFP, D_F16) + IO_TYPE(D_F16, D_I8|Q_DFP, D_F16) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_F16, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I8|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_F32, D_F32, D_F32) + IO_TYPE(D_F32, D_F32, D_F16) + IO_TYPE(D_F32, D_F16, D_F32) + IO_TYPE(D_F32, D_F16, D_F16) + IO_TYPE(D_F16, D_F32, D_F32) + IO_TYPE(D_F16, D_F32, D_F16) + IO_TYPE(D_F16, D_F16, D_F32) + IO_TYPE(D_I32, D_I32, D_I32) + IO_TYPE(D_I16, D_I32, D_I32) + IO_TYPE(D_I32, D_I16, D_I32) + IO_TYPE(D_I32, D_I32, D_U8|Q_ASYM) + IO_TYPE(D_I32, D_I32, D_I16|Q_DFP) + IO_TYPE(D_I32, D_I32, D_I8|Q_DFP) + + /* HW 9.0.1 */ + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, F32) + + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, F32) + + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, F32) + + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, 
D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, F32) + + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_BF16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, F32) + + IO_TYPE(D_BF16, D_BF16, D_I8|Q_DFP) + IO_TYPE(D_BF16, D_BF16, D_U8|Q_ASYM) + IO_TYPE(D_BF16, D_BF16, D_I16|Q_DFP) + IO_TYPE(D_BF16, D_BF16, F16) + IO_TYPE(D_BF16, D_BF16, F32) + + IO_TYPE(D_F16, D_F16, D_BF16) + IO_TYPE(D_F16, D_F16, F32) + + IO_TYPE(D_F32, D_BF16, D_U8|Q_ASYM) + IO_TYPE(D_F32, D_BF16, D_I8|Q_DFP) + IO_TYPE(D_F32, D_BF16, D_I16|Q_DFP) + IO_TYPE(D_F32, D_BF16, D_F16) + IO_TYPE(D_F32, D_BF16, D_BF16) + IO_TYPE(D_F32, D_BF16, F32) + END_IO_TYPE_DECL(ADD) if(!VALIDATE_OP_IO_TYPES(ADD, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, @@ -406,60 +450,11 @@ static vsi_bool op_check_sub ) { /* check inputs outputs data type */ - BEGIN_IO_TYPE_DECL(SUBTRACT, 2, 1) - IO_TYPE(D_BF16, D_BF16, D_BF16) - IO_TYPE(D_F32, D_F32, D_BF16) - IO_TYPE(D_BF16, D_BF16, D_F32) - IO_TYPE(D_F16, D_F16, D_F16) - IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) - IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) - IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_I16|Q_DFP, D_F16) - IO_TYPE(D_F16, D_I8|Q_DFP, D_F16) - IO_TYPE(D_F16, D_U8|Q_ASYM, D_F16) - IO_TYPE(D_F16, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_F16, D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) - IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) - IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP, D_F16) - IO_TYPE(D_F32, D_F32, D_F32) - IO_TYPE(D_F32, D_F32, D_F16) - IO_TYPE(D_F32, D_F16, D_F32) - IO_TYPE(D_F32, D_F16, D_F16) - IO_TYPE(D_F16, D_F32, D_F32) - IO_TYPE(D_F16, D_F32, D_F16) - IO_TYPE(D_F16, D_F16, D_F32) - IO_TYPE(D_I32, D_I32, D_I32) - IO_TYPE(D_I16, D_I32, D_I32) - IO_TYPE(D_I32, D_I16, D_I32) - END_IO_TYPE_DECL(SUBTRACT) - if(!VALIDATE_OP_IO_TYPES(SUBTRACT, self, inputs, self->input.num, outputs, self->output.num)) { - char* desc = generate_op_io_types_desc(inputs, - self->input.num, outputs, self->output.num); - VSILOGE("Inputs/Outputs data type not support: %s", desc); - destroy_op_io_types_desc(desc); - return FALSE; - } + vsi_bool ret = FALSE; - return TRUE; + ret = vsi_nn_OpCheck(VSI_NN_OP_ADD, self, inputs, outputs); + + return ret; } /* op_check() */ @@ -533,51 +528,98 @@ static vsi_bool op_check_mul { /* check inputs outputs data type */ BEGIN_IO_TYPE_DECL(MULTIPLY, 2, 1) - IO_TYPE(D_BF16, D_BF16, D_BF16) - IO_TYPE(D_F32, D_F32, D_BF16) - IO_TYPE(D_BF16, D_BF16, D_F32) - IO_TYPE(D_F16, D_F16, D_F16) - IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) - IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) - IO_TYPE(D_F16, D_F16, 
D_U8|Q_ASYM) - IO_TYPE(D_F16, D_I16|Q_DFP, D_F16) - IO_TYPE(D_F16, D_I8|Q_DFP, D_F16) - IO_TYPE(D_F16, D_U8|Q_ASYM, D_F16) - IO_TYPE(D_F16, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_F16, D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) - IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) - IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16, D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP, D_F16) - IO_TYPE(D_I16|Q_DFP, D_I8|Q_DFP, D_F16) - IO_TYPE(D_I16|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM, D_F16) - IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_F32, D_F32, D_F32) - IO_TYPE(D_F32, D_F32, D_F16) - IO_TYPE(D_F32, D_F16, D_F32) - IO_TYPE(D_F32, D_F16, D_F16) - IO_TYPE(D_F16, D_F32, D_F32) - IO_TYPE(D_F16, D_F32, D_F16) - IO_TYPE(D_F16, D_F16, D_F32) - IO_TYPE(D_I32, D_I32, D_I32) - IO_TYPE(D_I16, D_I32, D_I32) - IO_TYPE(D_I32, D_I16, D_I32) + IO_TYPE(D_BF16, D_BF16, D_BF16) + IO_TYPE(D_F32, D_F32, D_BF16) + IO_TYPE(D_BF16, D_BF16, D_F32) + IO_TYPE(D_F16, D_F16, D_F16) + IO_TYPE(D_F16, D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_DFP, D_F16) + IO_TYPE(D_F16, D_I8|Q_DFP, D_F16) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_F16, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_F16, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F16, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_F16, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16, D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I8|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_F32, D_F32, D_F32) + IO_TYPE(D_F32, D_F32, D_F16) + IO_TYPE(D_F32, D_F16, D_F32) + IO_TYPE(D_F32, D_F16, D_F16) + IO_TYPE(D_F16, D_F32, D_F32) + IO_TYPE(D_F16, D_F32, D_F16) + IO_TYPE(D_F16, D_F16, D_F32) + IO_TYPE(D_I32, D_I32, D_I32) + IO_TYPE(D_I16, D_I32, D_I32) + IO_TYPE(D_I32, D_I16, D_I32) + + /* HW 9.0.1 */ + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, F32) + + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, 
D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, F32) + + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, F32) + + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, F32) + + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_BF16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, F32) + + IO_TYPE(D_BF16, D_BF16, D_I8|Q_DFP) + IO_TYPE(D_BF16, D_BF16, D_U8|Q_ASYM) + IO_TYPE(D_BF16, D_BF16, D_I16|Q_DFP) + IO_TYPE(D_BF16, D_BF16, F16) + IO_TYPE(D_BF16, D_BF16, F32) + + IO_TYPE(D_F16, D_F16, D_BF16) + IO_TYPE(D_F16, D_F16, F32) + + IO_TYPE(D_F32, D_BF16, D_U8|Q_ASYM) + IO_TYPE(D_F32, D_BF16, D_I8|Q_DFP) + IO_TYPE(D_F32, D_BF16, D_I16|Q_DFP) + IO_TYPE(D_F32, D_BF16, D_F16) + IO_TYPE(D_F32, D_BF16, D_BF16) + IO_TYPE(D_F32, D_BF16, F32) + END_IO_TYPE_DECL(MULTIPLY) if(!VALIDATE_OP_IO_TYPES(MULTIPLY, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c index 408d87f..b2a162f 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise_unary.c @@ -89,7 +89,7 @@ static vsi_bool op_setup ) { uint32_t i, out_rank; - uint32_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; vsi_bool ret = TRUE; out_rank = inputs[0]->attr.dim_num; @@ -101,18 +101,18 @@ static vsi_bool op_setup if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { outputs[0]->attr.dim_num = out_rank; - memcpy( outputs[0]->attr.size, shape, out_rank * sizeof(uint32_t) ); + memcpy( outputs[0]->attr.size, shape, out_rank * sizeof(vsi_size_t) ); } else { - uint32_t total_size_got; - uint32_t total_size_expected; + vsi_size_t total_size_got; + vsi_size_t total_size_expected; total_size_expected = vsi_nn_ShapeProduct( shape, out_rank ); total_size_got = vsi_nn_ShapeProduct( outputs[0]->attr.size, outputs[0]->attr.dim_num ); if( total_size_expected != total_size_got ) { - VSILOGW("Output size mismatch, expect %d, but got %d", + VSILOGW("Output size mismatch, expect %"VSI_SIZE_T_SPECIFIER", but got %"VSI_SIZE_T_SPECIFIER"", total_size_expected, total_size_got); ret = FALSE; } @@ -196,7 +196,7 @@ extern "C" { { \ return _eltwise_unary_op_compute( ""#kernel_name, self, inputs, outputs ); \ } \ -DEF_OP_REG(name, op_init, op_compute_##kernel_name, vsi_nn_op_common_deinit, op_check, op_setup, NULL, 2, 1) +DEF_OP_REG(name, op_init, op_compute_##kernel_name, vsi_nn_op_common_deinit, op_check, op_setup, NULL, 1, 1) DEF_ELEMENT_WISE_UNARY_OP( SIN, sin ); DEF_ELEMENT_WISE_UNARY_OP( EXP, exp ); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_embedding_lookup.c b/src/tim/vx/internal/src/ops/vsi_nn_op_embedding_lookup.c index b942f2c..e55e456 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_embedding_lookup.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_embedding_lookup.c @@ -52,8 +52,11 @@ static void _reshape_tensor attr.size[2] = input->attr.size[1]; 
attr.dim_num = 3; } - - *output = vxReshapeTensor( input->t, (int32_t *)attr.size, attr.dim_num ); +#ifdef VSI_40BIT_VA_SUPPORT + *output = vxReshapeTensor( input->t, attr.size, attr.dim_num ); +#else + *output = vxReshapeTensor( input->t, (vx_int32*)attr.size, attr.dim_num ); +#endif } static vsi_status op_compute diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_floordiv.c b/src/tim/vx/internal/src/ops/vsi_nn_op_floordiv.c index 4c19f01..e17292b 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_floordiv.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_floordiv.c @@ -51,8 +51,8 @@ static vsi_status op_compute { vsi_status status = VSI_FAILURE; vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; - int32_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; - uint32_t new_rank = 0; + vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; + vsi_size_t new_rank = 0; vsi_bool ret; if( NULL == self ) @@ -61,19 +61,19 @@ static vsi_status op_compute } ret = vsi_nn_kernel_optimize_eltwise_shape( - (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, - (int32_t *)inputs[1]->attr.size, inputs[1]->attr.dim_num, - (int32_t *)outputs[0]->attr.size, outputs[0]->attr.dim_num, + inputs[0]->attr.size, inputs[0]->attr.dim_num, + inputs[1]->attr.size, inputs[1]->attr.dim_num, + outputs[0]->attr.size, outputs[0]->attr.dim_num, shapes[0], shapes[1], shapes[2], &new_rank ); if( ret ) { reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, - inputs[0], (uint32_t*)shapes[0], new_rank ); + inputs[0], shapes[0], new_rank ); reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, - inputs[1], (uint32_t*)shapes[1], new_rank ); + inputs[1], shapes[1], new_rank ); reshape_tensors[2] = vsi_nn_reshape_tensor( self->graph, - outputs[0], (uint32_t*)shapes[2], new_rank ); + outputs[0], shapes[2], new_rank ); self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "floordiv", &reshape_tensors[0], _INPUT_NUM, &reshape_tensors[2], _OUTPUT_NUM, NULL ); @@ -133,7 +133,7 @@ static vsi_bool op_setup ) { uint32_t i, out_rank, in1_rank, in2_rank; - uint32_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; vsi_bool ret = TRUE; in1_rank = inputs[0]->attr.dim_num; @@ -142,7 +142,7 @@ static vsi_bool op_setup for(i = 0; i < out_rank; i++) { - uint32_t sz0, sz1; + vsi_size_t sz0, sz1; sz0 = i < in1_rank ? inputs[0]->attr.size[i] : 1; sz1 = i < in2_rank ? 
inputs[1]->attr.size[i] : 1; shape[i] = vsi_nn_max( sz0, sz1 ); @@ -151,18 +151,18 @@ static vsi_bool op_setup if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { outputs[0]->attr.dim_num = out_rank; - memcpy( outputs[0]->attr.size, shape, out_rank * sizeof(uint32_t) ); + memcpy( outputs[0]->attr.size, shape, out_rank * sizeof(vsi_size_t) ); } else { - uint32_t total_size_got; - uint32_t total_size_expected; + vsi_size_t total_size_got; + vsi_size_t total_size_expected; total_size_expected = vsi_nn_ShapeProduct( shape, out_rank ); total_size_got = vsi_nn_ShapeProduct( outputs[0]->attr.size, outputs[0]->attr.dim_num ); if( total_size_expected != total_size_got ) { - VSILOGW("Output size mismatch, expect %d, but got %d", + VSILOGW("Output size mismatch, expect %"VSI_SIZE_T_SPECIFIER", but got %"VSI_SIZE_T_SPECIFIER"", total_size_expected, total_size_got); ret = FALSE; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect.c b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect.c index 7d95b97..ab84c5a 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect.c @@ -46,9 +46,9 @@ static vsi_status op_compute vsi_status status; uint32_t axis; uint32_t i = 0; - uint32_t num_fc = 1, num_no_fc = 1; + vsi_size_t num_fc = 1, num_no_fc = 1; uint32_t num_of_intput_dims = 0; - int32_t input_size[VSI_NN_MAX_DIM_NUM] = {0}; + vsi_size_t input_size[VSI_NN_MAX_DIM_NUM] = {0}; uint32_t dims = 0; vx_tensor input = NULL; vx_tensor output = NULL; @@ -57,7 +57,7 @@ static vsi_status op_compute status = VSI_FAILURE; - memcpy(input_size, inputs[0]->attr.size, sizeof(uint32_t) * VSI_NN_MAX_DIM_NUM); + memcpy(input_size, inputs[0]->attr.size, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM); num_of_intput_dims = inputs[0]->attr.dim_num; axis = inputs[0]->attr.dim_num - 2; @@ -73,7 +73,11 @@ static vsi_status op_compute input_size[0] = num_fc; input_size[1] = num_no_fc; dims= 2; +#ifdef VSI_40BIT_VA_SUPPORT input = vxReshapeTensor(inputs[0]->t, input_size, dims); +#else + input = vxReshapeTensor(inputs[0]->t, (vx_int32*)input_size, dims); +#endif weight = inputs[1]->t; @@ -151,6 +155,146 @@ static vsi_bool op_check IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM, D_F16) IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) + + /* HW 9.0.1 */ + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_NONE, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_NONE, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) + + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_I16|Q_DFP) + 
IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_NONE, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I32|Q_DFP, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP, D_I64|Q_DFP, D_F32) + + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_NONE, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) + + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_F16) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_NONE, D_F32) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_NONE, D_F32) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM, D_I32|Q_ASYM, D_F32) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I32|Q_DFP, D_F32) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP, D_I64|Q_DFP, D_F32) + + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_F16) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_NONE, D_F32) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_NONE, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_NONE, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_NONE, D_F32) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, 
D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F16) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) + + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_BF16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_NONE, D_F32) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_BF16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I32|Q_DFP, D_F32) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_BF16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP, D_I64|Q_DFP, D_F32) + + IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_NONE, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_NONE, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_NONE, D_BF16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_NONE, D_F32) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_I32|Q_SYM_PC, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_I32|Q_SYM_PC, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_I32|Q_SYM_PC, D_BF16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_SYM_PC, D_I32|Q_SYM_PC, D_F32) + + IO_TYPE(D_F16, D_F16, D_NONE, D_BF16) + IO_TYPE(D_F16, D_F16, D_NONE, D_F32) + IO_TYPE(D_F16, D_F16, D_F32, D_BF16) + IO_TYPE(D_F16, D_F16, D_F32, D_F32) + + IO_TYPE(D_BF16, D_BF16, D_NONE, D_F16) + IO_TYPE(D_BF16, D_BF16, D_NONE, D_BF16) + IO_TYPE(D_BF16, D_BF16, D_NONE, D_F32) + IO_TYPE(D_BF16, D_BF16, D_F32, D_F16) + IO_TYPE(D_BF16, D_BF16, D_F32, D_BF16) + IO_TYPE(D_BF16, D_BF16, D_F32, D_F32) + + IO_TYPE(D_F32, D_BF16, D_NONE, D_F16) + IO_TYPE(D_F32, D_BF16, D_NONE, D_BF16) + IO_TYPE(D_F32, D_BF16, D_NONE, D_F32) + IO_TYPE(D_F32, D_BF16, D_F32, D_F16) + IO_TYPE(D_F32, D_BF16, D_F32, D_BF16) + IO_TYPE(D_F32, D_BF16, D_F32, D_F32) + END_IO_TYPE_DECL(FCL) ret = VALIDATE_OP_IO_TYPES(FCL, self, inputs, self->input.num, outputs, self->output.num); if(!ret) { @@ -173,8 +317,8 @@ static vsi_bool op_setup ) { uint32_t dim_num; - uint32_t perm[4] = { 0 }; - uint32_t as_shape[4] = { 0 }; + vsi_size_t perm[4] = { 0 }; + vsi_size_t as_shape[4] = { 0 }; #ifdef VX_CONVERT_POLICY_WRAP_ENABLE if ( vsi_nn_compareVersion(self->graph, 1, 1, 21) == -1 ) @@ -285,4 +429,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect2.c b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect2.c index 25e951c..0cdd29d 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect2.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect2.c @@ -53,14 +53,14 @@ static vsi_status op_compute uint32_t axis; vsi_nn_fcl_param * p; uint32_t i = 0; - uint32_t num_fc = 1, num_no_fc = 1; + vsi_size_t num_fc = 1, num_no_fc = 1; uint32_t num_of_dims[4] = {0}; - int32_t input_size[VSI_NN_MAX_DIM_NUM] = {0}; - int32_t output_size[VSI_NN_MAX_DIM_NUM] = {0}; - int32_t weights_size[VSI_NN_MAX_DIM_NUM] = {0}; - int32_t bias_size[VSI_NN_MAX_DIM_NUM] = {0}; - uint32_t ofm = 0; - uint32_t dims = 
0; + vsi_size_t input_size[VSI_NN_MAX_DIM_NUM] = {0}; + vsi_size_t output_size[VSI_NN_MAX_DIM_NUM] = {0}; + vsi_size_t weights_size[VSI_NN_MAX_DIM_NUM] = {0}; + vsi_size_t bias_size[VSI_NN_MAX_DIM_NUM] = {0}; + vsi_size_t ofm = 0; + vsi_size_t dims = 0; vx_tensor input = NULL; vx_tensor output = NULL; vx_tensor weight = NULL; @@ -69,15 +69,15 @@ static vsi_status op_compute p = (vsi_nn_fcl_param *)&(self->nn_param.fcl); axis = p->axis; - memcpy(input_size, inputs[0]->attr.size, sizeof(uint32_t) * VSI_NN_MAX_DIM_NUM); + memcpy(input_size, inputs[0]->attr.size, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM); num_of_dims[0] = inputs[0]->attr.dim_num; - memcpy(output_size, outputs[0]->attr.size, sizeof(uint32_t) * VSI_NN_MAX_DIM_NUM); + memcpy(output_size, outputs[0]->attr.size, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM); num_of_dims[1] = outputs[0]->attr.dim_num; - memcpy(weights_size, inputs[1]->attr.size, sizeof(uint32_t) * VSI_NN_MAX_DIM_NUM); + memcpy(weights_size, inputs[1]->attr.size, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM); num_of_dims[2] = inputs[1]->attr.dim_num; if( inputs[2] != NULL ) { - memcpy(bias_size, inputs[2]->attr.size, sizeof(uint32_t) * VSI_NN_MAX_DIM_NUM); + memcpy(bias_size, inputs[2]->attr.size, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM); num_of_dims[3] = inputs[2]->attr.dim_num; } @@ -95,25 +95,62 @@ static vsi_status op_compute input_size[0] = num_fc; input_size[1] = num_no_fc; dims= 2; +#ifdef VSI_40BIT_VA_SUPPORT input = vxReshapeTensor(inputs[0]->t, input_size, dims); +#else + { + int32_t input_size_32bit[VSI_NN_MAX_DIM_NUM] = {0}; + for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + input_size_32bit[i] = (int32_t)input_size[i]; + } + input = vxReshapeTensor(inputs[0]->t, input_size_32bit, (uint32_t)dims); + } +#endif weights_size[0] = num_fc; weights_size[1] = ofm; dims= 2; +#ifdef VSI_40BIT_VA_SUPPORT weight = vxReshapeTensor(inputs[1]->t, weights_size, dims); +#else + { + int32_t weight_size_32bit[VSI_NN_MAX_DIM_NUM] = {0}; + for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + weight_size_32bit[i] = (int32_t)weights_size[i]; + } + weight = vxReshapeTensor(inputs[1]->t, weight_size_32bit, (uint32_t)dims); + } +#endif if( inputs[2] != NULL ) { bias_size[0] = ofm; bias_size[1] = 1; dims= 2; +#ifdef VSI_40BIT_VA_SUPPORT bias = vxReshapeTensor(inputs[2]->t, bias_size, dims); +#else + { + int32_t bias_size_32bit[VSI_NN_MAX_DIM_NUM] = {0}; + for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + bias_size_32bit[i] = (int32_t)bias_size[i]; + } + bias = vxReshapeTensor(inputs[2]->t, bias_size_32bit, (uint32_t)dims); + } +#endif } output_size[0] = ofm; output_size[1] = num_no_fc; dims= 2; +#ifdef VSI_40BIT_VA_SUPPORT output = vxReshapeTensor(outputs[0]->t, output_size, dims); +#else + output = vxReshapeTensor(outputs[0]->t, (vx_int32*)output_size, (uint32_t)dims); +#endif self->n = vxFullyConnectedLayer( self->graph->g, @@ -160,7 +197,7 @@ static vsi_bool op_setup { vsi_nn_fcl_param * p; uint32_t i, j; - uint32_t num_in_fmp = 1; + vsi_size_t num_in_fmp = 1; #ifdef VX_CONVERT_POLICY_WRAP_ENABLE if ( vsi_nn_compareVersion(self->graph, 1, 1, 21) == -1 ) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect_relu.c b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect_relu.c index 9cbaebd..8766867 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect_relu.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_fullconnect_relu.c @@ -269,9 +269,15 @@ static vsi_bool op_setup inputs[1]->wb = vxCreateWeightsBiasesParameterFromTensors3( VX_CONVOLUTIONAL_NETWORK_FULLYCONNECTED_LAYER, +#ifdef
VSI_40BIT_VA_SUPPORT inputs[0]->attr.size, outputs[0]->attr.size, outputs[0]->attr.size, +#else + (vx_uint32*)inputs[0]->attr.size, + (vx_uint32*)outputs[0]->attr.size, + (vx_uint32*)outputs[0]->attr.size, +#endif &p, sizeof(p), (vx_weights_biases_parameter_optimizations_t *)p_opt, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c b/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c index d373015..8120757 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_gather.c @@ -49,9 +49,9 @@ static vsi_status op_compute vsi_nn_kernel_param_t * param = NULL; vsi_nn_kernel_node_t n = NULL; uint32_t i = 0; - uint32_t block_size = 1, block_num = 1, axis_num = 0, indices_num = 1; + vsi_size_t block_size = 1, block_num = 1, axis_num = 0, indices_num = 1; int32_t axis = self->nn_param.gather.axis; - uint32_t *input_size = inputs[0]->attr.size; + vsi_size_t *input_size = inputs[0]->attr.size; uint32_t dims_num = inputs[0]->attr.dim_num; param =vsi_nn_kernel_param_create(); @@ -71,11 +71,11 @@ static vsi_status op_compute indices_num *= inputs[1]->attr.size[i]; } - vsi_nn_kernel_param_add_int32( param, "block_size", block_size ); - vsi_nn_kernel_param_add_int32( param, "block_num", block_num ); - vsi_nn_kernel_param_add_int32( param, "axis_num", axis_num ); - vsi_nn_kernel_param_add_int32( param, "axis", axis ); - vsi_nn_kernel_param_add_int32( param, "indices_num", indices_num ); + vsi_nn_kernel_param_add_int32( param, "block_size", (int32_t)block_size ); + vsi_nn_kernel_param_add_int32( param, "block_num", (int32_t)block_num ); + vsi_nn_kernel_param_add_int32( param, "axis_num", (int32_t)axis_num ); + vsi_nn_kernel_param_add_int32( param, "axis", (int32_t)axis ); + vsi_nn_kernel_param_add_int32( param, "indices_num", (int32_t)indices_num ); n = vsi_nn_kernel_selector( self->graph, "gather", inputs, 2, outputs, 1, param ); if( n != NULL ) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_gather_nd.c b/src/tim/vx/internal/src/ops/vsi_nn_op_gather_nd.c index 6d55fad..2776150 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_gather_nd.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_gather_nd.c @@ -49,10 +49,10 @@ static vsi_status op_compute vsi_status status = VSI_FAILURE; vsi_nn_kernel_param_t * param = NULL; vsi_nn_kernel_node_t n = NULL; - uint32_t i = 0; - uint32_t block_size = 1, coord_dim = 1; - uint32_t *input_size = inputs[0]->attr.size; - uint32_t dims_num = inputs[0]->attr.dim_num; + vsi_size_t i = 0; + vsi_size_t block_size = 1, coord_dim = 1; + vsi_size_t *input_size = inputs[0]->attr.size; + vsi_size_t dims_num = inputs[0]->attr.dim_num; if(inputs[1]->attr.dim_num > 1) { @@ -71,8 +71,8 @@ static vsi_status op_compute block_size *= input_size[i]; } - vsi_nn_kernel_param_add_int32( param, "block_size", block_size ); - vsi_nn_kernel_param_add_int32( param, "coord_dim", coord_dim ); + vsi_nn_kernel_param_add_int32( param, "block_size", (int32_t)block_size ); + vsi_nn_kernel_param_add_int32( param, "coord_dim", (int32_t)coord_dim ); n = vsi_nn_kernel_selector( self->graph, "gather_nd", inputs, 2, outputs, 1, param ); if( n != NULL ) { @@ -129,11 +129,11 @@ static vsi_bool op_setup ) { /* TODO: Add code to comput outputs' shape. 
*/ - uint32_t i = 0; + vsi_size_t i = 0; if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { - uint32_t j = 0, coord_dim = 1; + vsi_size_t j = 0, coord_dim = 1; if(inputs[1]->attr.dim_num > 1) { coord_dim = inputs[1]->attr.size[0]; @@ -151,7 +151,7 @@ static vsi_bool op_setup { outputs[0]->attr.size[j++] = inputs[1]->attr.size[0]; } - outputs[0]->attr.dim_num = j; + outputs[0]->attr.dim_num = (uint32_t)j; } return TRUE; } /* op_setup() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_generate_proposals.c b/src/tim/vx/internal/src/ops/vsi_nn_op_generate_proposals.c index cc42045..77feaaf 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_generate_proposals.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_generate_proposals.c @@ -93,7 +93,7 @@ static vsi_bool op_setup vsi_nn_generate_proposals_param * p; int32_t num_output_rois; p = &(self->nn_param.generate_proposals); - num_output_rois = vsi_nn_GetElementNum(inputs[0]); + num_output_rois = (int32_t)vsi_nn_GetElementNum(inputs[0]); if(p->pre_nms_top_n > 0) { num_output_rois = p->pre_nms_top_n; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv1d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv1d.c index d979662..26d3380 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv1d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv1d.c @@ -42,17 +42,13 @@ #define _OUTPUT_NUM (1) static vsi_nn_tensor_t * _expand_tensor_dim - ( vsi_nn_graph_t * graph, vsi_nn_tensor_t *tensor, uint32_t * shape, size_t rank, int32_t expand_dim ) + ( vsi_nn_graph_t * graph, vsi_nn_tensor_t *tensor, vsi_size_t * shape, vsi_size_t rank, vsi_size_t expand_dim ) { - uint32_t new_shape[VSI_NN_MAX_DIM_NUM] = { 0 }; - uint32_t i, cnt; - if ( expand_dim < 0 ) + vsi_size_t new_shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t i, cnt; + if ( expand_dim > rank ) { - expand_dim = (int32_t)rank + expand_dim; - } - if ( expand_dim < 0 || (uint32_t)expand_dim > rank ) - { - VSILOGE("Run dim to expand %d, rank is %lu", expand_dim, rank); + VSILOGE("Run dim to expand %"VSI_SIZE_T_SPECIFIER", rank is %"VSI_SIZE_T_SPECIFIER"", expand_dim, rank); return NULL; } for ( i = 0, cnt = 0; i < rank; i ++ ) @@ -70,7 +66,7 @@ static vsi_nn_tensor_t * _expand_tensor_dim new_shape[cnt] = 1; } - return vsi_nn_reshape_tensor( graph, tensor, new_shape, (uint32_t)rank + 1 ); + return vsi_nn_reshape_tensor( graph, tensor, new_shape, rank + 1 ); } /* _expand_tensor_dim() */ static vsi_status op_compute diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv2d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv2d.c index 32b62af..28a490c 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv2d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv2d.c @@ -223,7 +223,7 @@ static vsi_bool op_setup { /* TODO: Add code to comput outputs' shape. 
*/ vsi_nn_grouped_conv2d_param *nn_param; - uint32_t perm[] = { 3, 2, 0, 1 }; + vsi_size_t perm[] = { 3, 2, 0, 1 }; /* TODO: Driver should handle this, * Check transpose @@ -243,14 +243,25 @@ static vsi_bool op_setup } nn_param = &self->nn_param.grouped_conv2d; - vsi_nn_compute_padding( - inputs[0]->attr.size, - inputs[1]->attr.size, - nn_param->stride, - nn_param->dilation, - nn_param->pad_type, - nn_param->pad - ); + { + vsi_size_t i, pad[_cnt_of_array(nn_param->pad)] = {0}; + for(i = 0; i < _cnt_of_array(nn_param->pad); i++) + { + pad[i] = self->nn_param.conv2d.pad[i]; + } + vsi_nn_compute_padding( + inputs[0]->attr.size, + inputs[1]->attr.size, + nn_param->stride, + nn_param->dilation, + nn_param->pad_type, + pad + ); + for(i = 0; i < _cnt_of_array(nn_param->pad); i++) + { + self->nn_param.conv2d.pad[i] = (uint32_t)pad[i]; + } + } if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c index a217600..61efd47 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_groupnormalize.c @@ -152,7 +152,7 @@ static vsi_status _op_optimize { uint32_t dim = 0; vsi_nn_groupnorm_lcl_data* local = NULL; - uint32_t shape[VSI_NN_MAX_DIM_NUM]; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM]; char tensor_name[128]; if (_is_3d_group_norm(self, inputs) == FALSE) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_gru.c b/src/tim/vx/internal/src/ops/vsi_nn_op_gru.c new file mode 100644 index 0000000..b3aec6e --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_gru.c @@ -0,0 +1,373 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_internal_node.h" +#include "vsi_nn_rnn_helper.h" +#include "utils/vsi_nn_math.h" +#include "utils/vsi_nn_tensor_op.h" +#include "utils/vsi_nn_util.h" +#include "ops/vsi_nn_op_gru.h" + +typedef struct _vsi_nn_gru_local +{ + void * placeholder; +} vsi_nn_gru_local; + +static void create_state_tensor + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_size_t batch_size, + vsi_size_t hidden_size + ) +{ + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_tensor_t * tensor = NULL; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + + if(NULL == outputs[GRU_OUT_H_STATE]) + { + memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(attr.size[0])); + attr.dim_num = VSI_NN_DIM_AUTO; + memcpy( &attr.dtype, &outputs[GRU_OUT_OUTPUT]->attr.dtype, sizeof( attr.dtype ) ); + attr.vtl = TRUE; + attr.is_const = FALSE; + tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + outputs[GRU_OUT_H_STATE] = tensor->t; + } + + if(NULL == inputs[GRU_IN_H_STATE]) + { + attr.dim_num = 2; + attr.size[0] = hidden_size; + attr.size[1] = batch_size; + memcpy(&attr.dtype, &outputs[GRU_OUT_H_STATE]->attr.dtype, sizeof( attr.dtype )); + attr.vtl = FALSE; + attr.is_const = TRUE; + + tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + inputs[GRU_IN_H_STATE] = tensor->t; + } + +} /* create_state_tensor() */ + +static vsi_bool setup_op_shapes + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_gru_param * p = &self->nn_param.gru; + vsi_size_t batch_size = 0, hidden_size = 0, timesetp = 0; + + hidden_size = p->num_units; + if(p->time_major) + { + /* [input_size, batch_size, timestep] */ + batch_size = inputs[GRU_IN_INPUT]->attr.size[1]; + timesetp = inputs[GRU_IN_INPUT]->attr.size[2]; + } + else + { + /* [input_size, timestep, batch_size] */ + batch_size = inputs[GRU_IN_INPUT]->attr.size[2]; + timesetp = inputs[GRU_IN_INPUT]->attr.size[1]; + } + + /* setup grucell output tensors' shape */ + /* output */ + if(VSI_NN_DIM_AUTO == outputs[GRU_OUT_OUTPUT]->attr.dim_num) + { + outputs[GRU_OUT_OUTPUT]->attr.size[0] = hidden_size; + if(p->return_sequences) + { + outputs[GRU_OUT_OUTPUT]->attr.dim_num = 3; + if(p->time_major) + { + outputs[GRU_OUT_OUTPUT]->attr.size[1] = batch_size; + outputs[GRU_OUT_OUTPUT]->attr.size[2] = timesetp; + } + else + { + outputs[GRU_OUT_OUTPUT]->attr.size[2] = batch_size; + outputs[GRU_OUT_OUTPUT]->attr.size[1] = timesetp; + } + } + else + { + outputs[GRU_OUT_OUTPUT]->attr.dim_num = 2; + outputs[GRU_OUT_OUTPUT]->attr.size[1] = batch_size; + } + + } + + /* create hstate input/output if app doesn't provide them */ + create_state_tensor(self, inputs, outputs, batch_size, hidden_size); + + /* hstate output */ + if(VSI_NN_DIM_AUTO == outputs[GRU_OUT_H_STATE]->attr.dim_num) + { + outputs[GRU_OUT_H_STATE]->attr.dim_num = 2; + outputs[GRU_OUT_H_STATE]->attr.size[0] = hidden_size; + outputs[GRU_OUT_H_STATE]->attr.size[1] = batch_size; + } + + return TRUE; +} /* setup_op_shapes() */ + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + return vsi_nn_internal_compute_node( self ); +} + +static 
vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + return TRUE; +} + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_size_t i = 0, timestep = 0, batch_size = 0; + vsi_bool use_virtual_tensor = TRUE; + vsi_nn_gru_param * p = &self->nn_param.gru; + vsi_nn_internal_node_t * curr = NULL; + vsi_nn_tensor_t * input_tensor = NULL, * output_tensor = NULL; + vsi_nn_tensor_t * step_h_state = NULL; + vsi_nn_tensor_t ** split_outputs = NULL; + vsi_nn_tensor_t ** gru_step_outputs = NULL; + vsi_nn_internal_tensor_t * tmp_tensor = NULL; + vsi_nn_tensor_attr_t attr; + + memset(&attr, 0, sizeof(attr)); + vsi_nn_internal_init_node_wksp( self ); + + if(p->time_major) /* [input_size, batch, timestep] */ + { + timestep = inputs[GRU_IN_INPUT]->attr.size[2]; + batch_size = inputs[GRU_IN_INPUT]->attr.size[1]; + } + else /* [input_size, timestep, batch] */ + { + timestep = inputs[GRU_IN_INPUT]->attr.size[1]; + batch_size = inputs[GRU_IN_INPUT]->attr.size[2]; + } + + /* compute output shapes and initial the state tensor if needed */ + setup_op_shapes(self, inputs, outputs); + + input_tensor = inputs[GRU_IN_INPUT]; + if(FALSE == p->time_major) + { + /* transpose to time_major */ + tmp_tensor = vsi_nn_rnn_transpose_time_major(self, + inputs[GRU_INPUT_INPUT], NULL, use_virtual_tensor); + input_tensor = tmp_tensor->t; + } + + split_outputs = (vsi_nn_tensor_t **)malloc(timestep * sizeof(vsi_nn_tensor_t *)); + memset(split_outputs, 0, timestep * sizeof(vsi_nn_tensor_t *)); + gru_step_outputs = (vsi_nn_tensor_t **)malloc(timestep * sizeof(vsi_nn_tensor_t *)); + memset(gru_step_outputs, 0, timestep * sizeof(vsi_nn_tensor_t *)); + + vsi_nn_rnn_split_input_tensor(self, input_tensor, split_outputs, (uint32_t)timestep, use_virtual_tensor); + + //vsi_nn_rnn_data_check_aligned(self, split_outputs, timestep, use_virtual_tensor); ?? 
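+    /* For reference, a sketch of the per-step recurrence the loop below unrolls, assuming the
+     * usual Keras-style GRU formulation; the final combine step lives in GRUCELL_ACTIVATION and
+     * is not shown in this file, and the names below are shorthand for the GRU_IN_KERNEL_* /
+     * GRU_IN_BIAS_* tensors wired into each GRUCELL:
+     *   zt   = recurrent_activation( FC(xt, kernel_i2z, bias_i2z) + FC(h(t-1), kernel_r2z, bias_r2z) )
+     *   rt   = recurrent_activation( FC(xt, kernel_i2r, bias_i2r) + FC(h(t-1), kernel_r2r, bias_r2r) )
+     *   hht  = activation( FC(xt, kernel_i2h, bias_i2h) + rt * FC(h(t-1), kernel_r2h, bias_r2h) )
+     *          (reset_after == TRUE; with reset_after == FALSE, rt is applied before the recurrent FC)
+     *   h(t) = zt * h(t-1) + (1 - zt) * hht
+     * Each iteration instantiates one GRUCELL with xt = split_outputs[i] and chains its h-state
+     * output back in as step_h_state for the next timestep.
+     */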
+ + step_h_state = inputs[GRU_IN_H_STATE]; + for(i = 0; i < timestep; i++) + { + vsi_nn_tensor_t * reshape_output = NULL; + vsi_nn_tensor_t * cell_out0 = NULL; + vsi_nn_tensor_t * cell_out1 = NULL; + + /* reshape split_outputs to cell_input */ + tmp_tensor = vsi_nn_rnn_reshape_split_output( + self, split_outputs[i], (uint32_t)batch_size, use_virtual_tensor); + reshape_output = tmp_tensor->t; + + /* grucell output */ + if ( (i == timestep - 1) && p->return_sequences == FALSE ) + { + cell_out0 = outputs[GRU_OUT_OUTPUT]; + } + else + { + vsi_nn_internal_init_tensor_attr(&attr, + &outputs[GRU_OUT_OUTPUT]->attr.dtype, use_virtual_tensor); + tmp_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + cell_out0 = tmp_tensor->t; + } + + /* grucell output h_state */ + if( i != timestep - 1 ) + { + vsi_nn_internal_init_tensor_attr(&attr, + &outputs[GRU_OUTPUT_H_STATE]->attr.dtype, use_virtual_tensor); + tmp_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + cell_out1 = tmp_tensor->t; + } + else + { + cell_out1 = outputs[GRU_OUT_H_STATE]; + } + + /* create a grucell */ + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_GRUCELL, 0, 0 ); + curr->node->nn_param.grucell.num_units = p->num_units; + curr->node->nn_param.grucell.activation = p->activation; + curr->node->nn_param.grucell.recurrent_activation = p->recurrent_activation; + curr->node->nn_param.grucell.reset_after = p->reset_after; + curr->inputs[GRUCELL_IN_INPUT] = reshape_output; + curr->inputs[GRUCELL_IN_H_STATE] = step_h_state; + curr->inputs[GRUCELL_IN_KERNEL_I2Z] = inputs[GRU_IN_KERNEL_I2Z]; + curr->inputs[GRUCELL_IN_KERNEL_I2R] = inputs[GRU_IN_KERNEL_I2R]; + curr->inputs[GRUCELL_IN_KERNEL_I2H] = inputs[GRU_IN_KERNEL_I2H]; + curr->inputs[GRUCELL_IN_KERNEL_R2Z] = inputs[GRU_IN_KERNEL_R2Z]; + curr->inputs[GRUCELL_IN_KERNEL_R2R] = inputs[GRU_IN_KERNEL_R2R]; + curr->inputs[GRUCELL_IN_KERNEL_R2H] = inputs[GRU_IN_KERNEL_R2H]; + curr->inputs[GRUCELL_IN_BIAS_I2Z] = inputs[GRU_IN_BIAS_I2Z]; + curr->inputs[GRUCELL_IN_BIAS_I2R] = inputs[GRU_IN_BIAS_I2R]; + curr->inputs[GRUCELL_IN_BIAS_I2H] = inputs[GRU_IN_BIAS_I2H]; + curr->inputs[GRUCELL_IN_BIAS_R2Z] = inputs[GRU_IN_BIAS_R2Z]; + curr->inputs[GRUCELL_IN_BIAS_R2R] = inputs[GRU_IN_BIAS_R2R]; + curr->inputs[GRUCELL_IN_BIAS_R2H] = inputs[GRU_IN_BIAS_R2H]; + curr->outputs[GRUCELL_OUT_OUTPUT] = cell_out0; + curr->outputs[GRUCELL_OUT_H_STATE] = cell_out1; + vsi_nn_internal_setup_node( self, curr ); + + step_h_state = cell_out1; + + if(p->return_sequences) + { + /* reshape every step output to 3-dims for GRU_OUTPUT */ + tmp_tensor = vsi_nn_rnn_reshape_cell_output(self, + cell_out0, (uint32_t)batch_size, use_virtual_tensor); + gru_step_outputs[i] = tmp_tensor->t; + } + } /* for(i = 0; i < timestep; i++) end */ + + if(p->return_sequences) + { + output_tensor = outputs[GRU_OUTPUT_OUTPUT]; + if(p->time_major == FALSE) + { + /* create a new tensor for permute */ + vsi_nn_internal_init_tensor_attr(&attr, + &outputs[GRU_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); + tmp_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + output_tensor = tmp_tensor->t; + } + + /* concat all grucell output0, the reshaped grucell output shape: [hidden_size, batch, 1] */ + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, timestep, 1 ); + curr->node->nn_param.concat.axis = 2; /* concat the cell_outs in timestep */ + for( i = 0; i < timestep; i++ ) + { + curr->inputs[i] = gru_step_outputs[i]; + } + curr->outputs[0] = output_tensor; + vsi_nn_internal_setup_node( self, curr ); + + if(p->time_major == FALSE) 
+ { + /* transpose time_major to batch_major */ + vsi_nn_rnn_transpose_time_major(self, + output_tensor, outputs[GRU_OUTPUT_OUTPUT], use_virtual_tensor); + } + } + + vsi_nn_safe_free( split_outputs ); + vsi_nn_safe_free( gru_step_outputs ); + + return TRUE; +} + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + return VSI_SUCCESS; +} + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + return vsi_nn_internal_optimize_node( self, direction ); +} + + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ GRU, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ op_optimize, + /* input_num */ GRU_IN_CNT, + /* output_num */ GRU_OUT_CNT + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_gru_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_gru_ovxlib.c index d4ac7a2..227650d 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_gru_ovxlib.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_gru_ovxlib.c @@ -59,9 +59,9 @@ static vsi_bool setup_op_shapes vsi_nn_gru_ovxlib_param* curr_param = &self->nn_param.gru_ovxlib; vsi_nn_tensor_attr_t attr; vsi_nn_internal_tensor_t* output_tensor = NULL; - uint32_t num_units = 0; - uint32_t output_size = 0; - uint32_t batch_size = 0; + vsi_size_t num_units = 0; + vsi_size_t output_size = 0; + vsi_size_t batch_size = 0; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); if( curr_param->time_major ) @@ -97,7 +97,7 @@ static vsi_bool setup_op_shapes if( !outputs[GRU_OUTPUT_H_STATE] ) { - memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(uint32_t)); + memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); attr.dim_num = VSI_NN_DIM_AUTO; memcpy( &attr.dtype, &outputs[GRU_OUTPUT_OUTPUT]->attr.dtype, sizeof( attr.dtype ) ); attr.vtl = TRUE; @@ -182,9 +182,9 @@ static vsi_bool op_setup_default vsi_nn_tensor_t* tensor = NULL; vsi_nn_tensor_t* input_tensor = NULL; vsi_bool use_virtual_tensor = TRUE; - uint32_t batch_size = 0; - uint32_t time_step = 0; - uint32_t i = 0; + vsi_size_t batch_size = 0; + vsi_size_t time_step = 0; + vsi_size_t i = 0; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_node_wksp( self ); @@ -218,9 +218,9 @@ static vsi_bool op_setup_default grucell_reshape_output_tensors = (vsi_nn_tensor_t **)malloc(time_step * sizeof(vsi_nn_tensor_t **)); memset( grucell_reshape_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **)); - vsi_nn_rnn_split_input_tensor(self, input_tensor, split_output_tensors, time_step, use_virtual_tensor); + vsi_nn_rnn_split_input_tensor(self, input_tensor, split_output_tensors, (uint32_t)time_step, use_virtual_tensor); - vsi_nn_rnn_data_check_aligned(self, split_output_tensors, time_step, use_virtual_tensor); + vsi_nn_rnn_data_check_aligned(self, split_output_tensors, (uint32_t)time_step, use_virtual_tensor); last_step_h_state = inputs[GRU_INPUT_H_STATE]; for( i = 0; i < time_step; i++ ) @@ -231,7 +231,7 @@ static vsi_bool op_setup_default /* reshape for split output */ output_tensor = vsi_nn_rnn_reshape_split_output(self, - split_output_tensors[i], batch_size, use_virtual_tensor); + split_output_tensors[i], (uint32_t)batch_size, use_virtual_tensor); reshape_output = output_tensor->t; /* grucell output */ @@ -268,7 +268,7 @@ static vsi_bool op_setup_default if ( reshape_output->attr.dtype.vx_type == VSI_NN_TYPE_BFLOAT16 ) { 
int32_t k = 0; - for (k = 0; k < sizeof( curr_param->internal_dtype ) / sizeof(curr_param->internal_dtype[0]); k++) + for (k = 0; k < _cnt_of_array( curr_param->internal_dtype ); k++) { if (curr_param->internal_dtype[k].vx_type == VSI_NN_TYPE_NONE) { @@ -311,7 +311,7 @@ static vsi_bool op_setup_default { /* reshape output to 3-dims */ output_tensor = vsi_nn_rnn_reshape_cell_output(self, - grucell_out0, batch_size, use_virtual_tensor); + grucell_out0, (uint32_t)batch_size, use_virtual_tensor); grucell_reshape_output_tensors[i] = output_tensor->t; } } @@ -329,7 +329,7 @@ static vsi_bool op_setup_default } /* concat grucell output, the gru's output is 3-dims */ - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, time_step, 1 ); + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, (uint32_t)time_step, 1 ); curr->node->nn_param.concat.axis = 2; for( i = 0; i < time_step; i++ ) { @@ -369,15 +369,15 @@ static vsi_bool op_setup_optimized vsi_nn_tensor_t* last_step_h_state = NULL; vsi_nn_tensor_t* input_tensor = NULL; vsi_bool use_virtual_tensor = TRUE; - uint32_t batch_size = 0; - uint32_t time_step = 0; - uint32_t unit_nums = 0; - uint32_t i = 0; + vsi_size_t batch_size = 0; + vsi_size_t time_step = 0; + vsi_size_t unit_nums = 0; + vsi_size_t i = 0; grucell_activation_input_layout_e grucell_activation_input_layout = GRUCELL_ACTIVATION_INPUT_LAYOUT_ALL_CN; vsi_nn_internal_tensor_t* recurrent_weight_for_nn = NULL; vsi_nn_internal_tensor_t* input_weight_for_nn = NULL; - uint32_t permute_in_perm[VSI_NN_MAX_DIM_NUM] = { 0 }; - uint32_t reshape_size[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t permute_in_perm[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t reshape_size[VSI_NN_MAX_DIM_NUM] = { 0 }; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_node_wksp( self ); @@ -466,7 +466,7 @@ static vsi_bool op_setup_optimized curr->node->nn_param.conv2d.group = 1; curr->node->nn_param.conv2d.dilation[0] = 1; curr->node->nn_param.conv2d.dilation[1] = 1; - curr->node->nn_param.conv2d.weights = input_weight_for_nn->t->attr.size[3]; + curr->node->nn_param.conv2d.weights = (uint32_t)(input_weight_for_nn->t->attr.size[3]); curr->inputs[0] = tmp_tensor->t; curr->inputs[1] = input_weight_for_nn->t; @@ -490,9 +490,9 @@ static vsi_bool op_setup_optimized grucell_reshape_output_tensors = (vsi_nn_tensor_t **)malloc(time_step * sizeof(vsi_nn_tensor_t **)); memset( grucell_reshape_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **)); - vsi_nn_rnn_split_input_tensor(self, tmp_tensor->t, split_output_tensors, time_step, use_virtual_tensor); + vsi_nn_rnn_split_input_tensor(self, tmp_tensor->t, split_output_tensors, (uint32_t)time_step, use_virtual_tensor); - vsi_nn_rnn_data_check_aligned(self, split_output_tensors, time_step, use_virtual_tensor); + vsi_nn_rnn_data_check_aligned(self, split_output_tensors, (uint32_t)time_step, use_virtual_tensor); memcpy(&attr, &p->local->bias_r->attr, sizeof(vsi_nn_tensor_attr_t)); attr.size[1] = 1; @@ -517,7 +517,7 @@ static vsi_bool op_setup_optimized /* reshape for split output */ output_tensor = vsi_nn_rnn_reshape_split_output(self, - split_output_tensors[i], unit_nums * 3, use_virtual_tensor); + split_output_tensors[i], (uint32_t)(unit_nums * 3), use_virtual_tensor); input_fc_output = output_tensor->t; /* last_step_h_state is not batch first, no need to permute */ @@ -544,7 +544,7 @@ static vsi_bool op_setup_optimized curr->node->nn_param.conv2d.group = 1; curr->node->nn_param.conv2d.dilation[0] = 1; curr->node->nn_param.conv2d.dilation[1] = 1; - 
curr->node->nn_param.conv2d.weights = recurrent_weight_for_nn->t->attr.size[3]; + curr->node->nn_param.conv2d.weights = (uint32_t)recurrent_weight_for_nn->t->attr.size[3]; curr->inputs[0] = tmp->t; curr->inputs[1] = recurrent_weight_for_nn->t; @@ -617,7 +617,7 @@ static vsi_bool op_setup_optimized &outputs[GRU_OUTPUT_OUTPUT]->attr.dtype, use_virtual_tensor); tmp_tensor = vsi_nn_internal_new_tensor(self, &attr, 0.0f); - curr = vsi_nn_internal_new_node(self, VSI_NN_OP_CONCAT, time_step, 1); + curr = vsi_nn_internal_new_node(self, VSI_NN_OP_CONCAT, (uint32_t)time_step, 1); curr->node->nn_param.concat.axis = 1; for( i = 0; i < time_step; i++ ) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell.c new file mode 100644 index 0000000..a007884 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell.c @@ -0,0 +1,353 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include + + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_internal_node.h" +#include "vsi_nn_rnn_helper.h" +#include "utils/vsi_nn_math.h" +#include "utils/vsi_nn_tensor_op.h" +#include "utils/vsi_nn_util.h" +#include "ops/vsi_nn_op_grucell.h" + +typedef struct _vsi_nn_grucell_local +{ + void * placeholder; +} vsi_nn_grucell_local; + +static vsi_nn_internal_tensor_t * _create_fc + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input, + vsi_nn_tensor_t * weight, + vsi_nn_tensor_t * bias + ) +{ + vsi_nn_tensor_attr_t attr; + vsi_nn_internal_tensor_t * fc_out = NULL; + vsi_nn_internal_tensor_t * tmp_tensor = NULL; + vsi_nn_tensor_t * bias_tensor = NULL; + vsi_nn_internal_node_t* fc_node = NULL; + + memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); + if(NULL == bias) + { + /* create zero bias for NN/TP */ + tmp_tensor = vsi_nn_internal_create_zero_bias_tensor(self, &input->attr, &weight->attr, VSI_NN_OP_FCL, FALSE); + bias_tensor = tmp_tensor->t; + } + else + { + bias_tensor = bias; + } + + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; + attr.dim_num = VSI_NN_DIM_AUTO; + attr.vtl = TRUE; + attr.is_const = FALSE; + fc_out = vsi_nn_internal_new_tensor(self, &attr, 0.0f); + + fc_node = vsi_nn_internal_new_node(self, VSI_NN_OP_FCL, 0, 0 ); + fc_node->node->nn_param.fcl.axis = 0; + fc_node->node->nn_param.fcl.weights = (uint32_t)weight->attr.size[1]; + fc_node->inputs[0] = input; + fc_node->inputs[1] = weight; + fc_node->inputs[2] = bias_tensor; + fc_node->outputs[0] = fc_out->t; + vsi_nn_internal_setup_node(self, fc_node); + + return fc_out; +} /* () */ + +/* + copmute the recurrent hstate gates + equations: + reset_after == True: + ht = FC(hstate, kernel_rh, bias_rh) + ht = rt * ht + reset_after == False: + ht = rt * hstate + ht = FC(ht, kernel_rh, bias_rh) +*/ +static vsi_nn_internal_tensor_t * _compute_ht + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input_rt, + vsi_nn_tensor_t * hstate, + vsi_nn_tensor_t * weight, + vsi_nn_tensor_t * bias + ) +{ + vsi_bool use_virtual_tensor = TRUE; + vsi_nn_grucell_param * p = &self->nn_param.grucell; + vsi_nn_internal_tensor_t * tensor1 = NULL, * tensor2 = NULL; + + if(p->reset_after == TRUE) + { + tensor1 = _create_fc( + self, + hstate, + weight, + bias + ); + tensor2 = vsi_nn_rnn_create_binary_operator( + self, + VSI_NN_OP_MULTIPLY, + input_rt, + tensor1->t, + &input_rt->attr.dtype, + use_virtual_tensor + ); + } + else + { + tensor1 = vsi_nn_rnn_create_binary_operator( + self, + VSI_NN_OP_MULTIPLY, + input_rt, + hstate, + &input_rt->attr.dtype, + use_virtual_tensor + ); + tensor2 = _create_fc( + self, + tensor1->t, + weight, + bias + ); + } + + return tensor2; +} /* _compute_ht() */ + +/* + compute the recurrent update gates or reset gates + equations: + xt = FC(hstate, kernel_xt, bias_xt) + xt = input_xt + xt + xt = recurrent_activation(xt) +*/ +static vsi_nn_internal_tensor_t * _compute_recurrent_gate + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t * input_xt, + vsi_nn_tensor_t * hstate, + vsi_nn_tensor_t * weight, + vsi_nn_tensor_t * bias + ) +{ + vsi_bool use_virtual_tensor = TRUE; + vsi_nn_grucell_param * p = &self->nn_param.grucell; + vsi_nn_internal_tensor_t * tensor_add = NULL, * tensor_act; + 
vsi_nn_internal_tensor_t * recurrent_fc_out = NULL; + + recurrent_fc_out = _create_fc(self, hstate, weight, bias); + + tensor_add = vsi_nn_rnn_create_binary_operator( + self, + VSI_NN_OP_ADD, + recurrent_fc_out->t, + input_xt, + &recurrent_fc_out->t->attr.dtype, + use_virtual_tensor + ); + + tensor_act = vsi_nn_rnn_create_activation( + self, + tensor_add->t, + p->recurrent_activation, + &tensor_add->t->attr.dtype, + use_virtual_tensor + ); + + return tensor_act; +} /* _compute_recurrent_gate */ + +static vsi_bool setup_op_shapes + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_grucell_param * p = &self->nn_param.grucell; + vsi_size_t hidden_size = p->num_units; + vsi_size_t batch_size = inputs[GRUCELL_IN_INPUT]->attr.size[1]; + + /* setup grucell output tensors' shape */ + /* output */ + if(VSI_NN_DIM_AUTO == outputs[GRUCELL_OUT_OUTPUT]->attr.dim_num) + { + outputs[GRUCELL_OUT_OUTPUT]->attr.dim_num = 2; + outputs[GRUCELL_OUT_OUTPUT]->attr.size[0] = hidden_size; + outputs[GRUCELL_OUT_OUTPUT]->attr.size[1] = batch_size; + } + + /* hstate output */ + if(VSI_NN_DIM_AUTO == outputs[GRUCELL_OUT_H_STATE]->attr.dim_num) + { + outputs[GRUCELL_OUT_H_STATE]->attr.dim_num = 2; + outputs[GRUCELL_OUT_H_STATE]->attr.size[0] = hidden_size; + outputs[GRUCELL_OUT_H_STATE]->attr.size[1] = batch_size; + } + + return TRUE; +} /* setup_op_shapes() */ + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + return vsi_nn_internal_compute_node( self ); +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + return TRUE; +} + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + return VSI_SUCCESS; +} + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + return vsi_nn_internal_optimize_node( self, direction ); +} + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + uint32_t i; + vsi_nn_internal_node_t * curr = NULL; + vsi_nn_grucell_param * p = &self->nn_param.grucell; + vsi_nn_internal_tensor_t * input_fc_outputs[GRUCELL_GATE_CNT] = { NULL }; + vsi_nn_internal_tensor_t * zt = NULL, * rt = NULL, * ht = NULL; + + vsi_nn_internal_init_node_wksp( self ); + + /* compute output tensor's shapes */ + setup_op_shapes(self, inputs, outputs); + + /* create input fc */ + for(i = 0; i < GRUCELL_GATE_CNT; i++) + { + input_fc_outputs[i] = _create_fc( + self, + inputs[GRUCELL_IN_INPUT], + inputs[GRUCELL_IN_KERNEL_I2Z + i], + inputs[GRUCELL_IN_BIAS_I2Z + i] + ); + } + + /* compute update gate and reset gate */ + zt = _compute_recurrent_gate( + self, + input_fc_outputs[GRUCELL_GATES_Z]->t, + inputs[GRUCELL_IN_H_STATE], + inputs[GRUCELL_IN_KERNEL_R2Z], + inputs[GRUCELL_IN_BIAS_R2Z] + ); + rt = _compute_recurrent_gate( + self, + input_fc_outputs[GRUCELL_GATES_R]->t, + inputs[GRUCELL_IN_H_STATE], + inputs[GRUCELL_IN_KERNEL_R2R], + inputs[GRUCELL_IN_BIAS_R2R] + ); + + /* compute recurrent h with parameter 'reset_after' */ + ht = _compute_ht( + self, + rt->t, + inputs[GRUCELL_IN_H_STATE], + inputs[GRUCELL_IN_KERNEL_R2H], + inputs[GRUCELL_IN_BIAS_R2H] + ); + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_GRUCELL_ACTIVATION, 0, 0 ); + curr->node->nn_param.grucell_activation.activation = p->activation; + 
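+    /* Reference note (illustrative only, not emitted into the graph; FC_i2z/FC_r2z etc.
+     * are shorthand for the kernels named by GRUCELL_IN_KERNEL_I2Z, GRUCELL_IN_KERNEL_R2Z, ...):
+     * with the usual choices recurrent_activation = sigmoid and activation = tanh, the
+     * per-unit math composed above and finished by the GRUCELL_ACTIVATION node is
+     *   zt    = sigmoid( FC_i2z(x) + FC_r2z(h_prev) )        update gate
+     *   rt    = sigmoid( FC_i2r(x) + FC_r2r(h_prev) )        reset gate
+     *   h_hat = tanh( FC_i2h(x) + rt * FC_r2h(h_prev) )      candidate, reset_after == TRUE
+     *   h_new = zt * (h_prev - h_hat) + h_hat                ==  zt*h_prev + (1 - zt)*h_hat
+     * and for reset_after == FALSE the candidate becomes tanh( FC_i2h(x) + FC_r2h(rt * h_prev) ). */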
curr->inputs[GRUCELL_ACT_IN_H_STATE] = inputs[GRUCELL_IN_H_STATE]; + curr->inputs[GRUCELL_ACT_IN_INPUT_FC_H] = input_fc_outputs[GRUCELL_GATES_H]->t; + curr->inputs[GRUCELL_ACT_IN_H_T] = ht->t; + curr->inputs[GRUCELL_ACT_IN_Z_T] = zt->t; + curr->outputs[GRUCELL_ACT_OUT_OUTPUT] = outputs[GRUCELL_OUT_OUTPUT]; + curr->outputs[GRUCELL_ACT_OUT_H_STATE] = outputs[GRUCELL_OUT_H_STATE]; + vsi_nn_internal_setup_node(self, curr); + + return TRUE; +} /* op_setup() */ + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ GRUCELL, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ op_optimize, + /* input_num */ GRUCELL_IN_CNT, + /* output_num */ GRUCELL_OUT_CNT + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation.c new file mode 100644 index 0000000..2af4c6e --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation.c @@ -0,0 +1,180 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "vsi_nn_internal_node.h" +#include "vsi_nn_rnn_helper.h" +#include "utils/vsi_nn_math.h" +#include "utils/vsi_nn_tensor_op.h" +#include "utils/vsi_nn_util.h" +#include "ops/vsi_nn_op_grucell_activation.h" + +typedef struct _vsi_nn_grucell_activation_local { + void * placeholder; +} vsi_nn_grucell_activation_local; + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + return vsi_nn_internal_compute_node( self ); +} + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + return TRUE; +} + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool use_virtual_tensor= TRUE; + vsi_nn_grucell_activation_param * p = &self->nn_param.grucell_activation; + vsi_nn_internal_tensor_t * tmp_sub = NULL, * tmp_add = NULL, * tmp_mul = NULL; + vsi_nn_internal_tensor_t * tmp_act = NULL; + vsi_nn_internal_node_t * curr = NULL; + + vsi_nn_internal_init_node_wksp( self ); + + if(VSI_NN_DIM_AUTO == outputs[GRUCELL_ACT_OUT_OUTPUT]->attr.dim_num) + { + outputs[GRUCELL_ACT_OUT_OUTPUT]->attr.dim_num = 2; + outputs[GRUCELL_ACT_OUT_OUTPUT]->attr.size[0] = inputs[GRUCELL_ACT_IN_H_STATE]->attr.size[0]; + outputs[GRUCELL_ACT_OUT_OUTPUT]->attr.size[1] = inputs[GRUCELL_ACT_IN_H_STATE]->attr.size[1]; + } + + /* + hht = activation(fc_h + ht) + */ + tmp_add = vsi_nn_rnn_create_binary_operator( + self, + VSI_NN_OP_ADD, + inputs[GRUCELL_ACT_IN_INPUT_FC_H], + inputs[GRUCELL_ACT_IN_H_T], + &inputs[GRUCELL_ACT_IN_INPUT_FC_H]->attr.dtype, + use_virtual_tensor + ); + tmp_act = vsi_nn_rnn_create_activation( + self, + tmp_add->t, + p->activation, + &tmp_add->t->attr.dtype, + use_virtual_tensor + ); + + /* + new_h = zt * (hstate - hht) + hht + */ + tmp_sub = vsi_nn_rnn_create_binary_operator( + self, + VSI_NN_OP_SUBTRACT, + inputs[GRUCELL_ACT_IN_H_STATE], + tmp_act->t, + &tmp_act->t->attr.dtype, + use_virtual_tensor + ); + tmp_mul = vsi_nn_rnn_create_binary_operator( + self, + VSI_NN_OP_MULTIPLY, + inputs[GRUCELL_ACT_IN_Z_T], + tmp_sub->t, + &tmp_sub->t->attr.dtype, + use_virtual_tensor + ); + + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_ADD, 0, 0 ); + curr->inputs[0] = tmp_mul->t; + curr->inputs[1] = tmp_act->t; + curr->outputs[0] = outputs[GRUCELL_ACT_OUT_OUTPUT]; + vsi_nn_internal_setup_node(self, curr); + + /* copy outputs to h_state */ + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 0, 0 ); + curr->inputs[0] = outputs[GRUCELL_ACT_OUT_OUTPUT]; + curr->outputs[0] = outputs[GRUCELL_ACT_OUT_H_STATE]; + vsi_nn_internal_setup_node(self, curr); + + return TRUE; +} + +static vsi_status op_deinit + ( + vsi_nn_node_t * self + ) +{ + return VSI_SUCCESS; +} + +static vsi_status op_optimize + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs, + vsi_nn_opt_direction_e direction + ) +{ + return vsi_nn_internal_optimize_node( self, direction ); +} + +#ifdef __cplusplus +extern "C" { +#endif +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ GRUCELL_ACTIVATION, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, 
+ /* check */ op_check, + /* setup */ op_setup, + /* optimize */ op_optimize, + /* input_num */ GRUCELL_ACT_IN_CNT, + /* output_num */ GRUCELL_ACT_OUT_CNT + ); +#ifdef __cplusplus +} +#endif diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_internal.c index 92a72c2..42fc9fb 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_internal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_internal.c @@ -100,7 +100,7 @@ static vsi_bool op_setup inputs[GRUCELL_ACTIVATION_INPUT_ZT_]->attr.dim_num; memcpy( outputs[GRUCELL_ACTIVATION_OUTPUT_OUTPUT]->attr.size, inputs[GRUCELL_ACTIVATION_INPUT_ZT_]->attr.size, - inputs[GRUCELL_ACTIVATION_INPUT_ZT_]->attr.dim_num * sizeof( uint32_t ) ); + inputs[GRUCELL_ACTIVATION_INPUT_ZT_]->attr.dim_num * sizeof(vsi_size_t) ); } else { @@ -148,7 +148,7 @@ static vsi_bool op_setup inputs[GRUCELL_ACTIVATION_OUTPUT_OUTPUT]->attr.dim_num; memcpy( outputs[GRUCELL_ACTIVATION_OUTPUT_H_STATE]->attr.size, inputs[GRUCELL_ACTIVATION_OUTPUT_OUTPUT]->attr.size, - inputs[GRUCELL_ACTIVATION_OUTPUT_OUTPUT]->attr.dim_num * sizeof( uint32_t ) ); + inputs[GRUCELL_ACTIVATION_OUTPUT_OUTPUT]->attr.dim_num * sizeof(vsi_size_t) ); } return TRUE; } /* op_setup() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_internal_sma.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_internal_sma.c index 8feedf7..ba9b540 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_internal_sma.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_activation_internal_sma.c @@ -87,7 +87,7 @@ static vsi_bool op_setup inputs[GRUCELL_ACTIVATION_SMA_INPUT_H_STATE]->attr.dim_num; memcpy( outputs[GRUCELL_ACTIVATION_SMA_OUTPUT_OUTPUT]->attr.size, inputs[GRUCELL_ACTIVATION_SMA_INPUT_H_STATE]->attr.size, - inputs[GRUCELL_ACTIVATION_SMA_INPUT_H_STATE]->attr.dim_num * sizeof( uint32_t ) ); + inputs[GRUCELL_ACTIVATION_SMA_INPUT_H_STATE]->attr.dim_num * sizeof(vsi_size_t) ); } if(VSI_NN_DIM_AUTO == outputs[GRUCELL_ACTIVATION_SMA_OUTPUT_H_STATE]->attr.dim_num) @@ -96,7 +96,7 @@ static vsi_bool op_setup inputs[GRUCELL_ACTIVATION_SMA_OUTPUT_OUTPUT]->attr.dim_num; memcpy( outputs[GRUCELL_ACTIVATION_SMA_OUTPUT_H_STATE]->attr.size, inputs[GRUCELL_ACTIVATION_SMA_OUTPUT_OUTPUT]->attr.size, - inputs[GRUCELL_ACTIVATION_SMA_OUTPUT_OUTPUT]->attr.dim_num * sizeof( uint32_t ) ); + inputs[GRUCELL_ACTIVATION_SMA_OUTPUT_OUTPUT]->attr.dim_num * sizeof(vsi_size_t) ); } return TRUE; } /* op_setup() */ diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_ovxlib.c index c1d60b6..1814c51 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_ovxlib.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grucell_ovxlib.c @@ -102,8 +102,8 @@ static vsi_bool setup_op_shapes vsi_nn_grucell_ovxlib_param* curr_param = &self->nn_param.grucell_ovxlib; vsi_nn_tensor_attr_t attr; vsi_nn_internal_tensor_t* output_tensor = NULL; - uint32_t output_size = 0; - uint32_t batch_size = 0; + vsi_size_t output_size = 0; + vsi_size_t batch_size = 0; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); batch_size = inputs[GRUCELL_INPUT_INPUT]->attr.size[1]; @@ -152,7 +152,7 @@ static vsi_bool setup_op_shapes { outputs[GRUCELL_OUTPUT_H_STATE]->attr.dim_num = outputs[GRUCELL_OUTPUT_OUTPUT]->attr.dim_num; memcpy( outputs[GRUCELL_OUTPUT_H_STATE]->attr.size, outputs[GRUCELL_OUTPUT_OUTPUT]->attr.size, - VSI_NN_MAX_DIM_NUM * sizeof( uint32_t ) ); + 
VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t) ); } return TRUE; @@ -373,7 +373,7 @@ static vsi_bool op_setup_float_cudnn vsi_nn_internal_tensor_t** splited_recurrent_fc_output_tensors = NULL; uint32_t kernel_h = 1, kernel_w = 1; grucell_activation_input_layout_e grucell_activation_input_layout = GRUCELL_ACTIVATION_INPUT_LAYOUT_ALL_NC; - uint32_t reshaped_size[2] = { 0 }; + vsi_size_t reshaped_size[2] = { 0 }; p->local->multi_batch = inputs[GRUCELL_INPUT_INPUT]->attr.size[1] > 1; @@ -682,8 +682,8 @@ static vsi_bool op_setup_float_cudnn_v2 dtype.vx_type = VSI_NN_TYPE_FLOAT16; } { - uint32_t _slices[] = { inputs[GRUCELL_INPUT_INPUT]->attr.size[0], - inputs[GRUCELL_INPUT_H_STATE]->attr.size[0] }; + uint32_t _slices[] = { (uint32_t)inputs[GRUCELL_INPUT_INPUT]->attr.size[0], + (uint32_t)inputs[GRUCELL_INPUT_H_STATE]->attr.size[0] }; splited_input_fc_output_tensors = vsi_nn_create_split(self, concated_input->t, 0, 2, _slices, use_virtual_tensor); } @@ -859,7 +859,7 @@ static vsi_bool op_setup_default { /* reshape and transpose input */ vsi_nn_rnn_find_best_kernel_size(p->local->multi_batch, - inputs[GRUCELL_INPUT_INPUT]->attr.size[0], &kernel_h, &kernel_w); + (uint32_t)inputs[GRUCELL_INPUT_INPUT]->attr.size[0], &kernel_h, &kernel_w); input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, inputs[GRUCELL_INPUT_INPUT], p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); @@ -895,7 +895,7 @@ static vsi_bool op_setup_default { /* reshape and transpose input */ vsi_nn_rnn_find_best_kernel_size(p->local->multi_batch, - inputs[GRUCELL_INPUT_H_STATE]->attr.size[0], &kernel_h, &kernel_w); + (uint32_t)inputs[GRUCELL_INPUT_H_STATE]->attr.size[0], &kernel_h, &kernel_w); hstate_input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, inputs[GRUCELL_INPUT_H_STATE], p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); @@ -1001,7 +1001,7 @@ static vsi_bool op_setup_default vsi_nn_internal_tensor_t* tmp = NULL; /* reshape and transpose input */ vsi_nn_rnn_find_best_kernel_size(p->local->multi_batch, - inputs[GRUCELL_INPUT_INPUT]->attr.size[0], &kernel_h, &kernel_w); + (uint32_t)inputs[GRUCELL_INPUT_INPUT]->attr.size[0], &kernel_h, &kernel_w); input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, inputs[GRUCELL_INPUT_INPUT], p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); tmp = vsi_nn_rnn_create_nn_fc(self, @@ -1058,7 +1058,7 @@ static vsi_bool op_setup_default vsi_nn_internal_tensor_t* tmp = NULL; /* reshape and transpose input */ vsi_nn_rnn_find_best_kernel_size(p->local->multi_batch, - rh_mul_outputs->t->attr.size[0], &kernel_h, &kernel_w); + (uint32_t)rh_mul_outputs->t->attr.size[0], &kernel_h, &kernel_w); hstate_input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, rh_mul_outputs->t, p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); tmp = vsi_nn_rnn_create_nn_fc(self, @@ -1108,7 +1108,7 @@ static vsi_bool op_setup_default input_tensor = vsi_nn_internal_new_tensor(self, &attr, 1.0f); memset( &attr, 0x00, sizeof(attr) ); - //memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(uint32_t)); + //memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); attr.dim_num = VSI_NN_DIM_AUTO; attr.vtl = use_virtual_tensor; attr.is_const = FALSE; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c index 8f54a50..ce7290d 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_instancenormalize.c @@ -96,8 +96,8 @@ static vsi_status 
op_compute vsi_nn_kernel_param_t * param = NULL; vsi_nn_kernel_node_t n = NULL; float eps = self->nn_param.instancenorm.eps; - uint32_t *input_size = inputs[0]->attr.size; - uint32_t dims_num = inputs[0]->attr.dim_num; + vsi_size_t *input_size = inputs[0]->attr.size; + vsi_size_t dims_num = inputs[0]->attr.dim_num; int32_t rs_flg = 0; vsi_nn_tensor_t * tmp_inputs[3] = {NULL, NULL, NULL}; vsi_nn_tensor_t * tmp_outputs[1] = {NULL}; @@ -159,7 +159,7 @@ static vsi_status op_optimize { uint32_t dim = 0; vsi_nn_instancenorm_lcl_data2 *local = NULL; - uint32_t shape[VSI_NN_MAX_DIM_NUM]; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM]; char tensor_name[128]; dim = inputs[0]->attr.dim_num; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_interp.c b/src/tim/vx/internal/src/ops/vsi_nn_op_interp.c index e11ba4e..9a2043e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_interp.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_interp.c @@ -84,14 +84,14 @@ static vsi_bool op_setup ) { vsi_nn_interp_param *p = NULL; - int32_t height_in_eff_, width_in_eff_; - int32_t height_out, width_out; + vsi_ssize_t height_in_eff_, width_in_eff_; + vsi_ssize_t height_out, width_out; vsi_nn_internal_node_t* curr = NULL; vsi_nn_internal_tensor_t *crop_tensor = NULL; vsi_nn_tensor_t *crop_in_tensor = NULL; float factor = 1.0f; - int32_t pad_beg = 0; - int32_t pad_end = 0; + vsi_ssize_t pad_beg = 0; + vsi_ssize_t pad_end = 0; if ( NULL == self ) { @@ -108,7 +108,7 @@ static vsi_bool op_setup { outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; memcpy( outputs[0]->attr.size, inputs[0]->attr.size, - VSI_NN_MAX_DIM_NUM * sizeof( uint32_t ) ); + VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t) ); if ((p->shrink_factor > 0) && (p->zoom_factor <= 0)) { width_out = (width_in_eff_ - 1) / p->shrink_factor + 1; @@ -158,10 +158,10 @@ static vsi_bool op_setup if ((pad_beg > 0) || (pad_end > 0)) { vsi_nn_tensor_attr_t attr; - int32_t use_virtual_tensor = 1; - int32_t *begin_dims; - int32_t *end_dims; - int32_t *stride_dims; + vsi_bool use_virtual_tensor = 1; + vsi_ssize_t *begin_dims; + vsi_ssize_t *end_dims; + vsi_ssize_t *stride_dims; uint32_t i; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); vsi_nn_internal_init_tensor_attr(&attr, &inputs[0]->attr.dtype, use_virtual_tensor); @@ -175,12 +175,12 @@ static vsi_bool op_setup curr->node->nn_param.strided_slice.end_mask = 0; curr->node->nn_param.strided_slice.shrink_axis_mask = 0; curr->node->nn_param.strided_slice.new_axis_mask = 0; - begin_dims = (int32_t *)vsi_nn_internal_new_node_param(curr, - VSI_NN_MAX_DIM_NUM * sizeof(uint32_t)); - end_dims = (int32_t *)vsi_nn_internal_new_node_param(curr, - VSI_NN_MAX_DIM_NUM * sizeof(uint32_t)); - stride_dims = (int32_t *)vsi_nn_internal_new_node_param(curr, - VSI_NN_MAX_DIM_NUM * sizeof(uint32_t)); + begin_dims = (vsi_ssize_t *)vsi_nn_internal_new_node_param(curr, + VSI_NN_MAX_DIM_NUM * sizeof(vsi_ssize_t)); + end_dims = (vsi_ssize_t *)vsi_nn_internal_new_node_param(curr, + VSI_NN_MAX_DIM_NUM * sizeof(vsi_ssize_t)); + stride_dims = (vsi_ssize_t *)vsi_nn_internal_new_node_param(curr, + VSI_NN_MAX_DIM_NUM * sizeof(vsi_ssize_t)); for (i = 0; i < inputs[0]->attr.dim_num; i++) { stride_dims[i] = 1; @@ -199,9 +199,9 @@ static vsi_bool op_setup end_dims[i] = inputs[0]->attr.size[i]; } } - curr->node->nn_param.strided_slice.begin_dims = begin_dims; - curr->node->nn_param.strided_slice.end_dims = end_dims; - curr->node->nn_param.strided_slice.stride_dims = stride_dims; + curr->node->nn_param.strided_slice.begin_dims = (int32_t*)begin_dims; + 
curr->node->nn_param.strided_slice.end_dims = (int32_t*)end_dims; + curr->node->nn_param.strided_slice.stride_dims = (int32_t*)stride_dims; curr->inputs[0] = inputs[0]; curr->outputs[0] = crop_in_tensor; vsi_nn_internal_setup_node(self, curr); @@ -211,7 +211,8 @@ static vsi_bool op_setup crop_in_tensor = inputs[0]; } - if ((width_in_eff_ == (int32_t)outputs[0]->attr.size[0]) && (height_in_eff_ == (int32_t)outputs[0]->attr.size[1])) + if ((width_in_eff_ == (vsi_ssize_t)outputs[0]->attr.size[0]) + && (height_in_eff_ == (vsi_ssize_t)outputs[0]->attr.size[1])) { curr = vsi_nn_internal_new_node( self, VSI_NN_OP_DATACONVERT, 1, 1 ); curr->inputs[0] = crop_in_tensor; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c b/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c index b102bfd..8c298ae 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_l2normalizescale.c @@ -50,14 +50,14 @@ static vsi_nn_tensor_t* _expand_scale_tensor ( vsi_nn_graph_t *graph, vsi_nn_tensor_t *scale, - int32_t scale_size_in, - int32_t scale_size_out + vsi_size_t scale_size_in, + vsi_size_t scale_size_out ) { vsi_status status = VX_SUCCESS; float* f32_in_buffer = NULL; float* f32_out_buffer = NULL; - int32_t i = 0; + vsi_size_t i = 0; vsi_nn_tensor_attr_t attr; vsi_nn_tensor_t* scale_tensor = NULL; vsi_nn_dtype_t out_dtype; @@ -124,8 +124,8 @@ static vsi_bool _check_value_is_equal_to_one { vsi_bool ret = TRUE; float* tensor_data = NULL; - uint32_t elements = 0; - uint32_t i = 0; + vsi_size_t elements = 0; + vsi_size_t i = 0; elements = vsi_nn_GetElementNum( tensor ); tensor_data = vsi_nn_ConvertTensorToFloat32Data( graph, tensor ); @@ -168,10 +168,10 @@ static vsi_status op_compute uint32_t axis_size = 0; uint32_t rank_in = 0; uint32_t rank_out = 0; - uint32_t size = 1; + vsi_size_t size = 1; uint32_t i = 0; - uint32_t scale_size = 1; - int32_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; + vsi_size_t scale_size = 1; + vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; vsi_nn_l2normalizescale_param * p = NULL; vsi_bool ret = FALSE; @@ -192,9 +192,9 @@ static vsi_status op_compute param =vsi_nn_kernel_param_create(); ret = vsi_nn_kernel_optimize_reduce_shape( - (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, + inputs[0]->attr.size, inputs[0]->attr.dim_num, &axis, 1, - (int32_t *)outputs[0]->attr.size, outputs[0]->attr.dim_num, + outputs[0]->attr.size, outputs[0]->attr.dim_num, shapes[0], &rank_in, shapes[2], &rank_out, &new_axis, &axis_size); size = inputs[1]->attr.size[0]; @@ -202,7 +202,7 @@ static vsi_status op_compute { size *= inputs[1]->attr.size[i]; } - shapes[1][0] = (int32_t)size; + shapes[1][0] = size; shapes[1][1] = 1; shapes[1][2] = 1; shapes[1][3] = 1; @@ -213,7 +213,7 @@ static vsi_status op_compute if ( ret ) { reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, - inputs[0], (uint32_t*)shapes[0], rank_in ); + inputs[0], shapes[0], rank_in ); if (is_expand_scale) { reshape_tensors[1] = _expand_scale_tensor(self->graph, inputs[1], size, scale_size); @@ -221,10 +221,10 @@ static vsi_status op_compute else { reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, - inputs[1], (uint32_t*)shapes[1], 2 ); + inputs[1], shapes[1], 2 ); } reshape_tensors[2] = vsi_nn_reshape_tensor( self->graph, - outputs[0], (uint32_t*)shapes[0], rank_in ); + outputs[0], shapes[0], rank_in ); self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "l2normalizescale", diff --git 
a/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c index 7e19c43..fc378ad 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_layernormalize.c @@ -162,6 +162,7 @@ static vsi_bool op_check IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_U8|Q_ASYM) IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_I16|Q_DFP) IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F16) IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_I16|Q_DFP) END_IO_TYPE_DECL(LAYER_NORM) if (!VALIDATE_OP_IO_TYPES(LAYER_NORM, self, inputs, self->input.num, outputs, self->output.num)) @@ -217,4 +218,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_log_softmax.c b/src/tim/vx/internal/src/ops/vsi_nn_op_log_softmax.c index f3f6e38..00030fe 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_log_softmax.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_log_softmax.c @@ -47,7 +47,7 @@ static vsi_status _log_softmax_op_compute { vsi_status status; vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; - int32_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 0 } }; + vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 0 } }; uint32_t rank_in = 0; int32_t axis = 0; int32_t new_axis = 0; @@ -70,7 +70,7 @@ static vsi_status _log_softmax_op_compute // TODO: This optimzie is a hack for gpu path, // it should be moved to gpu kernel setup. ret = vsi_nn_kernel_optimize_softmax_shape( - (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, axis, + inputs[0]->attr.size, inputs[0]->attr.dim_num, axis, shapes[0], &rank_in, &new_axis); if( ret ) @@ -82,9 +82,9 @@ static vsi_status _log_softmax_op_compute vsi_nn_kernel_param_add_float32( param, "beta", betaValue ); reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, - inputs[0], (uint32_t*)shapes[0], rank_in ); + inputs[0], shapes[0], rank_in ); reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, - outputs[0], (uint32_t*)shapes[0], rank_in ); + outputs[0], shapes[0], rank_in ); self->n = (vx_node)vsi_nn_kernel_selector( self->graph, kernel_name, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_logical_not.c b/src/tim/vx/internal/src/ops/vsi_nn_op_logical_not.c index aec87a9..a607f70 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_logical_not.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_logical_not.c @@ -49,8 +49,8 @@ static vsi_status op_compute { vsi_status status = VSI_FAILURE; vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; - int32_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; - int32_t new_rank = 0; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t new_rank = 0; vsi_bool ret; if( NULL == self ) @@ -61,14 +61,14 @@ static vsi_status op_compute // TODO: This optimzie is a hack for gpu path, // it should be moved to gpu kernel setup. 
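    /* Illustrative note (assumed shapes, not from this patch): element-wise ops are
     * layout-agnostic, so the optimizer below may collapse e.g. a {2, 3, 4, 5} rank-4
     * tensor into a low-rank equivalent such as {120, 1}, subject to the kernel's
     * image-size limits; input and output are then reshaped to that same collapsed
     * shape before the "logical_not" kernel is selected. */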
ret = vsi_nn_kernel_optimize_element_shape( - (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, + inputs[0]->attr.size, inputs[0]->attr.dim_num, shape, &new_rank ); if( ret ) { reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, - inputs[0], (uint32_t*)shape, new_rank ); + inputs[0], shape, new_rank ); reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, - outputs[0], (uint32_t*)shape, new_rank ); + outputs[0], shape, new_rank ); self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "logical_not", @@ -116,7 +116,7 @@ static vsi_bool op_setup ) { uint32_t i, out_rank; - uint32_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; vsi_bool ret = TRUE; out_rank = inputs[0]->attr.dim_num; @@ -128,18 +128,18 @@ static vsi_bool op_setup if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { outputs[0]->attr.dim_num = out_rank; - memcpy( outputs[0]->attr.size, shape, out_rank * sizeof(uint32_t) ); + memcpy( outputs[0]->attr.size, shape, out_rank * sizeof(vsi_size_t) ); } else { - uint32_t total_size_got; - uint32_t total_size_expected; + vsi_size_t total_size_got; + vsi_size_t total_size_expected; total_size_expected = vsi_nn_ShapeProduct( shape, out_rank ); total_size_got = vsi_nn_ShapeProduct( outputs[0]->attr.size, outputs[0]->attr.dim_num ); if( total_size_expected != total_size_got ) { - VSILOGW("Output size mismatch, expect %d, but got %d", + VSILOGW("Output size mismatch, expect %"VSI_SIZE_T_SPECIFIER", but got %"VSI_SIZE_T_SPECIFIER"", total_size_expected, total_size_got); ret = FALSE; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_logical_ops.c b/src/tim/vx/internal/src/ops/vsi_nn_op_logical_ops.c index 4ea6537..bff1972 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_logical_ops.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_logical_ops.c @@ -50,8 +50,8 @@ static vsi_status op_compute vsi_status status = VSI_FAILURE; vsi_nn_kernel_param_t * param = NULL; vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; - int32_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; - uint32_t new_rank = 0; + vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; + vsi_size_t new_rank = 0; vsi_bool ret; if( NULL == self ) @@ -60,9 +60,9 @@ static vsi_status op_compute } ret = vsi_nn_kernel_optimize_eltwise_shape( - (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, - (int32_t *)inputs[1]->attr.size, inputs[1]->attr.dim_num, - (int32_t *)outputs[0]->attr.size, outputs[0]->attr.dim_num, + inputs[0]->attr.size, inputs[0]->attr.dim_num, + inputs[1]->attr.size, inputs[1]->attr.dim_num, + outputs[0]->attr.size, outputs[0]->attr.dim_num, shapes[0], shapes[1], shapes[2], &new_rank ); if( ret ) @@ -71,11 +71,11 @@ static vsi_status op_compute vsi_nn_kernel_param_add_int32( param, "ops_type", self->nn_param.logical_ops.op ); reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, - inputs[0], (uint32_t*)shapes[0], new_rank ); + inputs[0], shapes[0], new_rank ); reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, - inputs[1], (uint32_t*)shapes[1], new_rank ); + inputs[1], shapes[1], new_rank ); reshape_tensors[2] = vsi_nn_reshape_tensor( self->graph, - outputs[0], (uint32_t*)shapes[2], new_rank ); + outputs[0], shapes[2], new_rank ); if (shapes[1][3] > shapes[0][3] && new_rank == 4) { @@ -133,7 +133,7 @@ static vsi_bool op_setup ) { uint32_t i, out_rank, in1_rank, in2_rank; - uint32_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; vsi_bool ret = TRUE; in1_rank = inputs[0]->attr.dim_num; @@ -142,7 +142,7 @@ static vsi_bool op_setup 
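     /* Worked example of the broadcast rule in the loop below (assumed shapes): with
      * inputs[0]->attr.size = {5, 1, 3} and inputs[1]->attr.size = {5, 4, 1}, each output
      * dim is max(sz0, sz1) with a missing dim treated as 1, so outputs[0]->attr.size
      * becomes {5, 4, 3}. */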
for(i = 0; i < out_rank; i++) { - uint32_t sz0, sz1; + vsi_size_t sz0, sz1; sz0 = i < in1_rank ? inputs[0]->attr.size[i] : 1; sz1 = i < in2_rank ? inputs[1]->attr.size[i] : 1; shape[i] = vsi_nn_max( sz0, sz1 ); @@ -151,18 +151,18 @@ static vsi_bool op_setup if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { outputs[0]->attr.dim_num = out_rank; - memcpy( outputs[0]->attr.size, shape, out_rank * sizeof(uint32_t) ); + memcpy( outputs[0]->attr.size, shape, out_rank * sizeof(vsi_size_t) ); } else { - uint32_t total_size_got; - uint32_t total_size_expected; + vsi_size_t total_size_got; + vsi_size_t total_size_expected; total_size_expected = vsi_nn_ShapeProduct( shape, out_rank ); total_size_got = vsi_nn_ShapeProduct( outputs[0]->attr.size, outputs[0]->attr.dim_num ); if( total_size_expected != total_size_got ) { - VSILOGW("Output size mismatch, expect %d, but got %d", + VSILOGW("Output size mismatch, expect %"VSI_SIZE_T_SPECIFIER", but got %"VSI_SIZE_T_SPECIFIER"", total_size_expected, total_size_got); ret = FALSE; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lrn2.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lrn2.c index 9c4c9e8..ee65331 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_lrn2.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lrn2.c @@ -43,10 +43,10 @@ static vsi_status op_compute { vsi_status status = VSI_FAILURE; int32_t axis = -1; - uint32_t sizes[VSI_NN_MAX_DIM_NUM] = {1}; - uint32_t innerSize = 1; - uint32_t outerSize = 1; - uint32_t axisSize = 1; + vsi_size_t sizes[VSI_NN_MAX_DIM_NUM] = {1}; + vsi_size_t innerSize = 1; + vsi_size_t outerSize = 1; + vsi_size_t axisSize = 1; vx_tensor vx_input = NULL; vx_tensor vx_output = NULL; vx_tensor input = inputs[0]->t; @@ -86,8 +86,13 @@ static vsi_status op_compute if(outerSize < MAX_BATCH_COUNT) { - vx_input = vxReshapeTensor(inputs[0]->t, (int32_t *)sizes, vsi_nn_max(inputs[0]->attr.dim_num, 4)); - vx_output = vxReshapeTensor(outputs[0]->t, (int32_t *)sizes, vsi_nn_max(inputs[0]->attr.dim_num, 4)); +#ifdef VSI_40BIT_VA_SUPPORT + vx_input = vxReshapeTensor(inputs[0]->t, sizes, vsi_nn_max(inputs[0]->attr.dim_num, 4)); + vx_output = vxReshapeTensor(outputs[0]->t, sizes, vsi_nn_max(inputs[0]->attr.dim_num, 4)); +#else + vx_input = vxReshapeTensor(inputs[0]->t, (int32_t*)sizes, vsi_nn_max(inputs[0]->attr.dim_num, 4)); + vx_output = vxReshapeTensor(outputs[0]->t, (int32_t*)sizes, vsi_nn_max(inputs[0]->attr.dim_num, 4)); +#endif input = vx_input; output = vx_output; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lsh_projection.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lsh_projection.c index 96086dc..63a85f7 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_lsh_projection.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lsh_projection.c @@ -63,9 +63,9 @@ static vsi_status op_compute { vsi_nn_tensor_attr_t attr; float const_one = 1.0; - int32_t i; - int32_t count = inputs[1]->attr.size[1]; - float* const_data = malloc(count * sizeof(float)); + vsi_size_t i; + vsi_size_t count = inputs[1]->attr.size[1]; + float* const_data = (float*)malloc(count * sizeof(float)); for (i = 0; i < count; i++) { @@ -122,7 +122,7 @@ static vsi_bool op_setup } else if( VSI_NN_LSH_PROJECTION_DENSE == node->nn_param.lsh_projection.type ) { - outputs[0]->attr.size[0] = vsi_nn_GetElementNum( inputs[0] ); + outputs[0]->attr.size[0] = (uint32_t)vsi_nn_GetElementNum( inputs[0] ); } else { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lstm_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lstm_ovxlib.c index 5910a92..283f930 100644 --- 
a/src/tim/vx/internal/src/ops/vsi_nn_op_lstm_ovxlib.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lstm_ovxlib.c @@ -49,9 +49,9 @@ static vsi_bool setup_op_shapes vsi_nn_lstm_ovxlib_param* curr_param = &self->nn_param.lstm_ovxlib; vsi_nn_tensor_attr_t attr; vsi_nn_internal_tensor_t* output_tensor = NULL; - uint32_t num_units = 0; - uint32_t output_size = 0; - uint32_t batch_size = 0; + vsi_size_t num_units = 0; + vsi_size_t output_size = 0; + vsi_size_t batch_size = 0; vsi_bool use_virtual_tensor = TRUE; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); @@ -101,20 +101,23 @@ static vsi_bool setup_op_shapes if( !outputs[LSTM_OUTPUT_H_STATE] ) { - memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(uint32_t)); + memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); attr.dim_num = VSI_NN_DIM_AUTO; memcpy( &attr.dtype, &outputs[LSTM_OUTPUT_OUTPUT]->attr.dtype, sizeof( attr.dtype ) ); attr.vtl = use_virtual_tensor; + attr.is_const = FALSE; output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); outputs[LSTM_OUTPUT_H_STATE] = output_tensor->t; } if( !outputs[LSTM_OUTPUT_C_STATE] ) { - memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(uint32_t)); + memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); attr.dim_num = VSI_NN_DIM_AUTO; - memcpy( &attr.dtype, &inputs[LSTM_INPUT_C_STATE]->attr.dtype, sizeof( attr.dtype ) ); + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; + attr.dtype.vx_type = VSI_NN_TYPE_FLOAT16; attr.vtl = use_virtual_tensor; + attr.is_const = FALSE; output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); outputs[LSTM_OUTPUT_C_STATE] = output_tensor->t; } @@ -214,13 +217,13 @@ static vsi_bool op_setup if( curr_param->time_major ) { - batch_size = inputs[LSTM_INPUT_INPUT]->attr.size[1]; - time_step = inputs[LSTM_INPUT_INPUT]->attr.size[2]; + batch_size = (uint32_t)inputs[LSTM_INPUT_INPUT]->attr.size[1]; + time_step = (uint32_t)inputs[LSTM_INPUT_INPUT]->attr.size[2]; } else { - batch_size = inputs[LSTM_INPUT_INPUT]->attr.size[2]; - time_step = inputs[LSTM_INPUT_INPUT]->attr.size[1]; + batch_size = (uint32_t)inputs[LSTM_INPUT_INPUT]->attr.size[2]; + time_step = (uint32_t)inputs[LSTM_INPUT_INPUT]->attr.size[1]; } setup_op_shapes( self, inputs, outputs); @@ -237,11 +240,11 @@ static vsi_bool op_setup /* split input tensor */ split_output_tensors = (vsi_nn_tensor_t **)malloc(time_step * \ - sizeof(vsi_nn_tensor_t **)); - memset( split_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **)); + sizeof(vsi_nn_tensor_t *)); + memset( split_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t *)); lstmunit_reshape_output_tensors = (vsi_nn_tensor_t **)malloc(time_step * \ - sizeof(vsi_nn_tensor_t **)); - memset( lstmunit_reshape_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **)); + sizeof(vsi_nn_tensor_t *)); + memset( lstmunit_reshape_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t *)); vsi_nn_rnn_split_input_tensor(self, input_tensor, split_output_tensors, time_step, use_virtual_tensor); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit.c index 232bebf..7730fee 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit.c @@ -271,7 +271,7 @@ static vsi_bool op_setup { outputs[1]->attr.dim_num = outputs[0]->attr.dim_num; memcpy( outputs[1]->attr.size, outputs[0]->attr.size, - VSI_NN_MAX_DIM_NUM * sizeof( uint32_t ) ); + VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t) ); } /* cell_state_out */ diff --git 
a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c index 8e2f1e6..d5d5123 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_lstmunit_ovxlib.c @@ -64,7 +64,7 @@ static vsi_nn_internal_tensor_t* create_tp_fc if( !bias || p->local->use_layer_norm || p->local->use_hybrid ) { /* create zero bias for NN/TP */ - tensor1 = vsi_nn_internal_create_zero_bias_tensor(self, &input->attr, &weight->attr, FALSE); + tensor1 = vsi_nn_internal_create_zero_bias_tensor(self, &input->attr, &weight->attr, VSI_NN_OP_FCL, FALSE); tensor = tensor1->t; } @@ -73,7 +73,7 @@ static vsi_nn_internal_tensor_t* create_tp_fc tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_FCL, 0, 0 ); tmp_inode->node->nn_param.fcl.axis = 0; - tmp_inode->node->nn_param.fcl.weights = weight->attr.size[1]; + tmp_inode->node->nn_param.fcl.weights = (uint32_t)weight->attr.size[1]; tmp_inode->inputs[0] = input; tmp_inode->inputs[1] = weight; @@ -102,7 +102,7 @@ static vsi_nn_internal_tensor_t* create_nn_fc vsi_nn_internal_tensor_t* tensor1 = NULL; vsi_nn_internal_tensor_t* tensor2 = NULL; vsi_nn_internal_tensor_t* reshaped_weight_tensor = NULL; - uint32_t reshaped_weight_shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t reshaped_weight_shape[VSI_NN_MAX_DIM_NUM] = { 0 }; vsi_nn_internal_node_t* tmp_inode = NULL; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); @@ -110,7 +110,8 @@ static vsi_nn_internal_tensor_t* create_nn_fc if( !bias || p->local->use_layer_norm || p->local->use_hybrid ) { /* create zero bias for NN/TP */ - tensor1 = vsi_nn_internal_create_zero_bias_tensor(self, &input->attr, &weight->attr, FALSE); + tensor1 = vsi_nn_internal_create_zero_bias_tensor( + self, &input->attr, &weight->attr, VSI_NN_OP_FCL, FALSE); tensor = tensor1->t; } @@ -122,7 +123,7 @@ static vsi_nn_internal_tensor_t* create_nn_fc reshaped_weight_shape[1] = kernel_h; reshaped_weight_shape[0] = kernel_w; - memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(uint32_t)); + memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); attr.dim_num = VSI_NN_DIM_AUTO; attr.vtl = weight->attr.vtl; attr.is_const = FALSE; //weight->attr.is_const; @@ -144,7 +145,7 @@ static vsi_nn_internal_tensor_t* create_nn_fc tmp_inode->node->nn_param.conv2d.group = 1; tmp_inode->node->nn_param.conv2d.dilation[0] = 1; tmp_inode->node->nn_param.conv2d.dilation[1] = 1; - tmp_inode->node->nn_param.conv2d.weights = weight->attr.size[1]; + tmp_inode->node->nn_param.conv2d.weights = (uint32_t)weight->attr.size[1]; tmp_inode->inputs[0] = input; tmp_inode->inputs[1] = reshaped_weight_tensor->t; @@ -227,7 +228,7 @@ static vsi_bool setup_op_shapes { outputs[LSTMUNIT_OUTPUT_H_STATE]->attr.dim_num = outputs[LSTMUNIT_OUTPUT_OUTPUT]->attr.dim_num; memcpy( outputs[LSTMUNIT_OUTPUT_H_STATE]->attr.size, outputs[LSTMUNIT_OUTPUT_OUTPUT]->attr.size, - VSI_NN_MAX_DIM_NUM * sizeof( uint32_t ) ); + VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t) ); } /* cell_state_out */ @@ -392,7 +393,7 @@ static vsi_bool op_setup { /* reshape and transpose input */ vsi_nn_rnn_find_best_kernel_size(p->local->multi_batch, - inputs[LSTMUNIT_INPUT_INPUT]->attr.size[0], &kernel_h, &kernel_w); + (uint32_t)inputs[LSTMUNIT_INPUT_INPUT]->attr.size[0], &kernel_h, &kernel_w); input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, inputs[LSTMUNIT_INPUT_INPUT], p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); @@ -413,7 +414,7 @@ static vsi_bool op_setup { /* reshape and transpose input */ 
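     /* Note on the casts added below (explanatory only): attr.size[] now holds vsi_size_t,
      * while vsi_nn_rnn_find_best_kernel_size() still takes a uint32_t element count, hence
      * the explicit narrowing. The helper chooses kernel_w/kernel_h so the flat FC input can
      * be laid out as a small kernel_w x kernel_h image (the weights are reshaped the same
      * way in create_nn_fc() above), which lets the fully-connected layer run as CONV2D on
      * the NN engine. */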
vsi_nn_rnn_find_best_kernel_size(p->local->multi_batch, - inputs[LSTMUNIT_INPUT_AUX_INPUT]->attr.size[0], &kernel_h, &kernel_w); + (uint32_t)inputs[LSTMUNIT_INPUT_AUX_INPUT]->attr.size[0], &kernel_h, &kernel_w); input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, inputs[LSTMUNIT_INPUT_AUX_INPUT], p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); @@ -463,7 +464,7 @@ static vsi_bool op_setup { /* reshape and transpose input */ vsi_nn_rnn_find_best_kernel_size(p->local->multi_batch, - inputs[LSTMUNIT_INPUT_H_STATE]->attr.size[0], &kernel_h, &kernel_w); + (uint32_t)inputs[LSTMUNIT_INPUT_H_STATE]->attr.size[0], &kernel_h, &kernel_w); recurrent_input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, inputs[LSTMUNIT_INPUT_H_STATE], p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); @@ -508,7 +509,7 @@ static vsi_bool op_setup { for( i = ifco_start_index; i < LSTMUNIT_IFCO_GATE_COUNT; i++ ) { - memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(uint32_t)); + memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); attr.dim_num = VSI_NN_DIM_AUTO; attr.vtl = use_virtual_tensor; attr.is_const = FALSE; @@ -566,7 +567,7 @@ static vsi_bool op_setup if( p->local->use_projection ) { /* create virtual tensor for activations' output0 */ - memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(uint32_t)); + memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); attr.dim_num = VSI_NN_DIM_AUTO; attr.vtl = use_virtual_tensor; @@ -607,7 +608,7 @@ static vsi_bool op_setup { vsi_bool use_virtual_tensor = inputs[LSTMUNIT_INPUT_BIAS_PROJ]->attr.vtl; input_tensor = vsi_nn_internal_create_zero_bias_tensor(self, &output_tensor->t->attr, - &inputs[LSTMUNIT_INPUT_WEIGHT_PROJ]->attr, FALSE); + &inputs[LSTMUNIT_INPUT_WEIGHT_PROJ]->attr, VSI_NN_OP_FCL, FALSE); zero_bias_tensor = input_tensor->t; if (use_virtual_tensor) @@ -626,7 +627,7 @@ static vsi_bool op_setup else if ( p->local->use_hybrid || !p->local->use_projection_bias ) { input_tensor = vsi_nn_internal_create_zero_bias_tensor(self, &output_tensor->t->attr, - &inputs[LSTMUNIT_INPUT_WEIGHT_PROJ]->attr, FALSE); + &inputs[LSTMUNIT_INPUT_WEIGHT_PROJ]->attr, VSI_NN_OP_FCL, FALSE); zero_bias_tensor = input_tensor->t; } else @@ -636,7 +637,7 @@ static vsi_bool op_setup curr = vsi_nn_internal_new_node( self, VSI_NN_OP_FCL, 0, 0 ); curr->node->nn_param.fcl.axis = 0; - curr->node->nn_param.fcl.weights = inputs[LSTMUNIT_INPUT_WEIGHT_PROJ]->attr.size[1]; + curr->node->nn_param.fcl.weights = (uint32_t)inputs[LSTMUNIT_INPUT_WEIGHT_PROJ]->attr.size[1]; curr->inputs[0] = output_tensor->t; curr->inputs[1] = inputs[LSTMUNIT_INPUT_WEIGHT_PROJ]; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_moments.c b/src/tim/vx/internal/src/ops/vsi_nn_op_moments.c index 2a5a5db..bd24f7d 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_moments.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_moments.c @@ -110,13 +110,17 @@ static vsi_bool op_check int32_t i = 0; BEGIN_IO_TYPE_DECL(MOMENTS, 1, 2) - IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) - IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) - IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32) + IO_TYPE(D_I8|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F32, D_F32) + IO_TYPE(D_I16|Q_DFP, D_F16, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F32, D_F32) IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_F16, D_F16) - IO_TYPE(D_F32, D_F32, D_F32) - IO_TYPE(D_I32, D_F32, D_F32) + IO_TYPE(D_F16, D_F16, D_F16) + IO_TYPE(D_F16, D_F32, D_F32) + IO_TYPE(D_F32, D_F32, D_F32) + 
IO_TYPE(D_I32, D_F32, D_F32) END_IO_TYPE_DECL(MOMENTS) if (!VALIDATE_OP_IO_TYPES(MOMENTS, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_permute.c b/src/tim/vx/internal/src/ops/vsi_nn_op_permute.c index a1a825a..a5ce4f9 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_permute.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_permute.c @@ -169,6 +169,7 @@ static vsi_bool op_check IO_TYPE(D_F32, D_F16) IO_TYPE(D_BF16, D_F32) IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_I32, D_I32) END_IO_TYPE_DECL(PERMUTE) if (!VALIDATE_OP_IO_TYPES(PERMUTE, self, inputs, self->input.num, outputs, self->output.num)) { @@ -230,7 +231,7 @@ static vsi_status op_optimize ) { vsi_status status; - uint32_t shape[VSI_NN_MAX_DIM_NUM]; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM]; uint32_t i = 0; status = VSI_SUCCESS; @@ -253,8 +254,13 @@ static vsi_status op_optimize { if(NULL == inputs[0]->t && NULL != outputs[0]->t) { +#ifdef VSI_40BIT_VA_SUPPORT inputs[0]->t = vxReshapeTensor( outputs[0]->t, - (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num ); + inputs[0]->attr.size, inputs[0]->attr.dim_num ); +#else + inputs[0]->t = vxReshapeTensor( outputs[0]->t, + (int32_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num ); +#endif if( inputs[0]->t == NULL ) { status = VSI_FAILURE; @@ -268,7 +274,7 @@ static vsi_status op_optimize { vsi_bool ret; ret = vsi_nn_ReshapeTensor( self->graph, inputs[0], outputs[0], - shape, self->nn_param.permute.dim_num ); + shape, (vsi_size_t)self->nn_param.permute.dim_num ); if( ret == FALSE ) { status = VSI_FAILURE; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pool.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pool.c index d90d7a2..67e2113 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pool.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pool.c @@ -137,7 +137,7 @@ static vsi_status op_optimize { uint32_t dim = 0; vsi_nn_pool_lcl_data *local = NULL; - uint32_t shape[VSI_NN_MAX_DIM_NUM]; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM]; char tensor_name[128]; dim = inputs[0]->attr.dim_num; @@ -288,19 +288,37 @@ static vsi_bool op_setup ) { vsi_bool ret; + vsi_size_t ksize[_cnt_of_array(self->nn_param.pool.ksize)], i; + vsi_size_t pad[_cnt_of_array(self->nn_param.pool.pad)] = {0}; ret = TRUE; + for(i = 0; i < _cnt_of_array(self->nn_param.pool.ksize); i++) + { + ksize[i] = self->nn_param.pool.ksize[i]; + } + for(i = 0; i < _cnt_of_array(self->nn_param.pool.pad); i++) + { + pad[i] = self->nn_param.pool.pad[i]; + } if(_is_pool1d(self, inputs)) { vsi_nn_compute_padding_conv1d( inputs[0]->attr.size, - self->nn_param.pool.ksize, + ksize, self->nn_param.pool.stride, NULL, self->nn_param.pool.pad_type, - self->nn_param.pool.pad + pad ); + for(i = 0; i < _cnt_of_array(self->nn_param.pool.ksize); i++) + { + self->nn_param.pool.ksize[i] = (uint32_t)ksize[i]; + } + for(i = 0; i < _cnt_of_array(self->nn_param.pool.pad); i++) + { + self->nn_param.pool.pad[i] = (uint32_t)pad[i]; + } /* Pooling */ outputs[0]->attr.size[0] = vsi_nn_ComputeFilterSize @@ -320,12 +338,20 @@ static vsi_bool op_setup { vsi_nn_compute_padding( inputs[0]->attr.size, - self->nn_param.pool.ksize, + ksize, self->nn_param.pool.stride, NULL, self->nn_param.pool.pad_type, - self->nn_param.pool.pad + pad ); + for(i = 0; i < _cnt_of_array(self->nn_param.pool.ksize); i++) + { + self->nn_param.pool.ksize[i] = (uint32_t)ksize[i]; + } + for(i = 0; i < _cnt_of_array(self->nn_param.pool.pad); i++) + { + self->nn_param.pool.pad[i] = (uint32_t)pad[i]; + } /* 
Pooling */ outputs[0]->attr.size[0] = vsi_nn_ComputeFilterSize @@ -357,7 +383,7 @@ static vsi_bool op_setup { outputs[1]->attr.dim_num = outputs[0]->attr.dim_num; memcpy( outputs[1]->attr.size, outputs[0]->attr.size, - VSI_NN_MAX_DIM_NUM * sizeof( uint32_t ) ); + VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t) ); } return ret; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_poolwithargmax.c b/src/tim/vx/internal/src/ops/vsi_nn_op_poolwithargmax.c index bc8c3de..88edb90 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_poolwithargmax.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_poolwithargmax.c @@ -44,10 +44,10 @@ static vsi_bool vsi_nn_poolwithargmax_optimize_shape ( vsi_nn_node_t * self, - const int32_t* shape_in, const int32_t* shape_out0, - const int32_t* shape_out1, const size_t rank_in, - int32_t* out_shape_input, int32_t* out_shape_output0, - int32_t* out_shape_output1, uint32_t* out_rank_output + const vsi_ssize_t* shape_in, const vsi_ssize_t* shape_out0, + const vsi_ssize_t* shape_out1, const size_t rank_in, + vsi_ssize_t* out_shape_input, vsi_ssize_t* out_shape_output0, + vsi_ssize_t* out_shape_output1, uint32_t* out_rank_output ) { vsi_bool enable_image_2d = FALSE; @@ -134,7 +134,7 @@ static vsi_status op_compute { vsi_status status = VSI_FAILURE; vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; - int32_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; + vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; uint32_t new_rank = 0; vsi_bool ret; vsi_nn_kernel_param_t * param = NULL; @@ -153,9 +153,9 @@ static vsi_status op_compute param =vsi_nn_kernel_param_create(); ret = vsi_nn_poolwithargmax_optimize_shape(self, - (int32_t *)inputs[0]->attr.size, (int32_t *)outputs[0]->attr.size, - (int32_t *)outputs[1]->attr.size, inputs[0]->attr.dim_num, - shapes[0], shapes[1], shapes[2], &new_rank ); + (vsi_ssize_t*)inputs[0]->attr.size, (vsi_ssize_t*)outputs[0]->attr.size, + (vsi_ssize_t*)outputs[1]->attr.size, inputs[0]->attr.dim_num, + (vsi_ssize_t*)shapes[0], (vsi_ssize_t*)shapes[1], (vsi_ssize_t*)shapes[2], &new_rank ); vsi_nn_kernel_param_add_int32( param, "ksize_x", ksize_x ); vsi_nn_kernel_param_add_int32( param, "ksize_y", ksize_y ); @@ -168,11 +168,11 @@ static vsi_status op_compute { reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, - inputs[0], (uint32_t*)shapes[0], new_rank ); + inputs[0], shapes[0], new_rank ); reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, - outputs[0], (uint32_t*)shapes[1], new_rank ); + outputs[0], shapes[1], new_rank ); reshape_tensors[2] = vsi_nn_reshape_tensor( self->graph, - outputs[1], (uint32_t*)shapes[2], new_rank ); + outputs[1], shapes[2], new_rank ); self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "poolwithargmax", &reshape_tensors[0], _INPUT_NUM, &reshape_tensors[1], _OUTPUT_NUM, param ); @@ -244,15 +244,34 @@ static vsi_bool op_setup ) { vsi_bool ret = TRUE; + vsi_size_t ksize[_cnt_of_array(self->nn_param.pool.ksize)] = {0}; + vsi_size_t i = 0; + vsi_size_t pad[_cnt_of_array(self->nn_param.pool.pad)] = {0}; + for(i = 0; i < _cnt_of_array(self->nn_param.pool.ksize); i++) + { + ksize[i] = self->nn_param.pool.ksize[i]; + } + for(i = 0; i < _cnt_of_array(self->nn_param.pool.pad); i++) + { + pad[i] = self->nn_param.pool.pad[i]; + } vsi_nn_compute_padding( inputs[0]->attr.size, - self->nn_param.pool.ksize, + ksize, self->nn_param.pool.stride, NULL, self->nn_param.pool.pad_type, - self->nn_param.pool.pad + pad ); + for(i = 0; i < _cnt_of_array(self->nn_param.pool.ksize); i++) + { + self->nn_param.pool.ksize[i] = (uint32_t)ksize[i]; + } + 
for(i = 0; i < _cnt_of_array(self->nn_param.pool.pad); i++) + { + self->nn_param.pool.pad[i] = (uint32_t)pad[i]; + } if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c index 473b900..9f0a995 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c @@ -122,9 +122,9 @@ static vsi_bool op_setup attr.size[0] = p->output_attr.size[1]; attr.size[1] = p->output_attr.size[2]; attr.size[2] = p->output_attr.size[0]; - p->output_attr.size[0] = attr.size[0]; - p->output_attr.size[1] = attr.size[1]; - p->output_attr.size[2] = attr.size[2]; + p->output_attr.size[0] = (uint32_t)attr.size[0]; + p->output_attr.size[1] = (uint32_t)attr.size[1]; + p->output_attr.size[2] = (uint32_t)attr.size[2]; attr.vtl = use_virtual_tensor; attr.is_const = FALSE; @@ -301,6 +301,7 @@ static vsi_bool op_setup vsi_nn_internal_tensor_t* tmp_outputs[3] = { NULL }; vsi_nn_tensor_attr_t attr; float mean[3] = {0}; + vsi_size_t size_32bit[VSI_NN_MAX_DIM_NUM] = {0}; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); @@ -312,13 +313,20 @@ static vsi_bool op_setup } memcpy(&attr, &outputs[0]->attr, sizeof(vsi_nn_tensor_attr_t)); - memcpy(&attr.size, p->output_attr.size, p->output_attr.dim_num * sizeof(uint32_t)); + for(i = 0; i < p->output_attr.dim_num; i++) + { + attr.size[i] = (vsi_size_t)p->output_attr.size[i]; + } attr.size[axis] = 1; attr.vtl = TRUE; attr.is_const = FALSE; output_tensor_group[0] = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); output_tensor_group[1] = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); output_tensor_group[2] = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); + for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + size_32bit[i] = attr.size[i]; + } if (p->reverse_channel) { @@ -350,7 +358,7 @@ static vsi_bool op_setup curr->node->nn_param.pre_process_gray.rect.top = p->rect.top; curr->node->nn_param.pre_process_gray.rect.width = p->rect.width; curr->node->nn_param.pre_process_gray.rect.height = p->rect.height; - curr->node->nn_param.pre_process_gray.output_attr.size = attr.size; + curr->node->nn_param.pre_process_gray.output_attr.size = size_32bit; curr->node->nn_param.pre_process_gray.output_attr.dim_num = p->output_attr.dim_num; curr->inputs[0] = input_tensor_group[i]; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_bgra.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_bgra.c index 10b7260..c1be239 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_bgra.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_bgra.c @@ -155,8 +155,8 @@ static vsi_bool op_setup } } - p->local.scale_x = (p->rect.width << 15) / outputs[0]->attr.size[0]; - p->local.scale_y = (p->rect.height << 15) / outputs[0]->attr.size[1]; + p->local.scale_x = (int32_t)((p->rect.width << 15) / outputs[0]->attr.size[0]); + p->local.scale_y = (int32_t)((p->rect.height << 15) / outputs[0]->attr.size[1]); p->local.enable_copy = ((p->local.scale_x == p->local.scale_y) && (p->local.scale_x == (1 << 15))); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_gray.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_gray.c index ebebc54..31818ee 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_gray.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_gray.c @@ -157,8 +157,8 @@ static vsi_bool op_setup } } - p->local.scale_x = (p->rect.width << 15) / outputs[PRE_PROCESS_GRAY_OUTPUT]->attr.size[0]; - 
p->local.scale_y = (p->rect.height << 15) / outputs[PRE_PROCESS_GRAY_OUTPUT]->attr.size[1]; + p->local.scale_x = (int32_t)((p->rect.width << 15) / outputs[PRE_PROCESS_GRAY_OUTPUT]->attr.size[0]); + p->local.scale_y = (int32_t)((p->rect.height << 15) / outputs[PRE_PROCESS_GRAY_OUTPUT]->attr.size[1]); p->local.enable_copy = ((p->local.scale_x == p->local.scale_y) && (p->local.scale_x == (1 << 15))); return TRUE; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c index f11ed8e..176aabf 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_nv12.c @@ -156,8 +156,8 @@ static vsi_bool op_setup } } - p->local->scale_x = (p->rect.width << 15) / outputs[0]->attr.size[0]; - p->local->scale_y = (p->rect.height << 15) / outputs[0]->attr.size[1]; + p->local->scale_x = (int32_t)((p->rect.width << 15) / outputs[0]->attr.size[0]); + p->local->scale_y = (int32_t)((p->rect.height << 15) / outputs[0]->attr.size[1]); p->local->enable_copy = ((p->local->scale_x == p->local->scale_y) && (p->local->scale_x == (1 << 15))); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb.c index a8ce9be..b7f4f1d 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_rgb.c @@ -166,13 +166,13 @@ static vsi_bool op_setup if (self->nn_param.pre_process_rgb.local.enable_perm == FALSE) { - p->local.scale_x = (p->rect.width << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[0]; - p->local.scale_y = (p->rect.height << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[1]; + p->local.scale_x = (int32_t)((p->rect.width << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[0]); + p->local.scale_y = (int32_t)((p->rect.height << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[1]); } else { - p->local.scale_x = (p->rect.width << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[1]; - p->local.scale_y = (p->rect.height << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[2]; + p->local.scale_x = (int32_t)((p->rect.width << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[1]); + p->local.scale_y = (int32_t)((p->rect.height << 15) / outputs[PRE_PROCESS_RGB_OUTPUT]->attr.size[2]); } p->local.enable_copy = ((p->local.scale_x == p->local.scale_y) && (p->local.scale_x == (1 << 15))); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv420.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv420.c index 4a7eb22..d98910b 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv420.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv420.c @@ -157,8 +157,8 @@ static vsi_bool op_setup } } - p->local.scale_x = (p->rect.width << 15) / outputs[0]->attr.size[0]; - p->local.scale_y = (p->rect.height << 15) / outputs[0]->attr.size[1]; + p->local.scale_x = (int32_t)((p->rect.width << 15) / outputs[0]->attr.size[0]); + p->local.scale_y = (int32_t)((p->rect.height << 15) / outputs[0]->attr.size[1]); p->local.enable_copy = ((p->local.scale_x == p->local.scale_y) && (p->local.scale_x == (1 << 15))); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv444.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv444.c index 296e245..6a350d1 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv444.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process_yuv444.c @@ -157,8 +157,8 @@ static vsi_bool op_setup } } - 
p->local->scale_x = (p->rect.width << 15) / outputs[0]->attr.size[0]; - p->local->scale_y = (p->rect.height << 15) / outputs[0]->attr.size[1]; + p->local->scale_x = (int32_t)((p->rect.width << 15) / outputs[0]->attr.size[0]); + p->local->scale_y = (int32_t)((p->rect.height << 15) / outputs[0]->attr.size[1]); p->local->enable_copy = ((p->local->scale_x == p->local->scale_y) && (p->local->scale_x == (1 << 15))); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_prelu.c b/src/tim/vx/internal/src/ops/vsi_nn_op_prelu.c index 5464b4c..eacf99d 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_prelu.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_prelu.c @@ -43,7 +43,7 @@ static vsi_bool _is_one_rank_tensor ( vsi_nn_tensor_t * input, - uint32_t *shape + vsi_size_t *shape ) { uint32_t i = 0; @@ -78,11 +78,11 @@ static vsi_status _prelu_op_compute { vsi_status status = VSI_FAILURE; vsi_nn_prelu_param *prelu = &self->nn_param.prelu; - int32_t shapes[VSI_NN_MAX_DIM_NUM] = { 1 }; + vsi_ssize_t shapes[VSI_NN_MAX_DIM_NUM] = { 1 }; vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; vsi_bool one_rank = FALSE; vsi_bool is_per_channel_alpha = 0; - uint32_t alpha_shape = 1; + vsi_size_t alpha_shape = 1; uint32_t i = 0; vsi_nn_kernel_param_t * param = NULL; uint32_t dims = outputs[0]->attr.dim_num; @@ -110,25 +110,25 @@ static vsi_status _prelu_op_compute } else { - memcpy(shapes, inputs[1]->attr.size, inputs[1]->attr.dim_num * sizeof(int32_t)); + memcpy(shapes, inputs[1]->attr.size, inputs[1]->attr.dim_num * sizeof(vsi_size_t)); dims = inputs[1]->attr.dim_num; } reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, - inputs[1], (uint32_t*)shapes, dims ); + inputs[1], (vsi_size_t*)shapes, dims ); } else { - memcpy(shapes, inputs[1]->attr.size, inputs[1]->attr.dim_num * sizeof(int32_t)); + memcpy(shapes, inputs[1]->attr.size, inputs[1]->attr.dim_num * sizeof(vsi_size_t)); reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, - inputs[1], (uint32_t*)shapes, inputs[1]->attr.dim_num ); + inputs[1], (vsi_size_t*)shapes, inputs[1]->attr.dim_num ); } } else { dims = inputs[1]->attr.dim_num; - memcpy(shapes, inputs[1]->attr.size, inputs[1]->attr.dim_num * sizeof(int32_t)); + memcpy(shapes, inputs[1]->attr.size, inputs[1]->attr.dim_num * sizeof(vsi_size_t)); if (one_rank) { @@ -143,7 +143,7 @@ static vsi_status _prelu_op_compute } reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, - inputs[1], (uint32_t*)shapes, dims ); + inputs[1], (vsi_size_t*)shapes, dims ); } // Add params diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_proposal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_proposal.c index 17ac2bb..3ef8224 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_proposal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_proposal.c @@ -273,7 +273,7 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { - int32_t size[VSI_NN_MAX_DIM_NUM]; + vsi_size_t size[VSI_NN_MAX_DIM_NUM]; uint32_t dim; vx_tensor rois_tmp, score_tmp; @@ -292,7 +292,19 @@ static vsi_status op_optimize { size[2] = outputs[0]->attr.size[0]; size[3] = outputs[0]->attr.size[1]; +#ifdef VSI_40BIT_VA_SUPPORT rois_tmp = vxReshapeTensor(outputs[0]->t, size, dim); +#else + { + vsi_size_t i; + int32_t size_32bit[VSI_NN_MAX_DIM_NUM]; + for(i = 0; i< VSI_NN_MAX_DIM_NUM; i++) + { + size_32bit[i] = (int32_t)size[i]; + } + rois_tmp = vxReshapeTensor(outputs[0]->t, size_32bit, dim); + } +#endif if(NULL == rois_tmp) { goto error; @@ -305,7 +317,19 @@ static vsi_status op_optimize { size[2] = outputs[1]->attr.size[0]; size[3] = 
outputs[1]->attr.size[1]; +#ifdef VSI_40BIT_VA_SUPPORT score_tmp = vxReshapeTensor(outputs[1]->t, size, dim); +#else + { + vsi_size_t i; + int32_t size_32bit[VSI_NN_MAX_DIM_NUM]; + for(i = 0; i< VSI_NN_MAX_DIM_NUM; i++) + { + size_32bit[i] = (int32_t)size[i]; + } + score_tmp = vxReshapeTensor(outputs[1]->t, size_32bit, dim); + } +#endif if(NULL == score_tmp) { goto error; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c index 6c7bda0..e61d9f2 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c @@ -76,12 +76,12 @@ static vsi_status op_comput_reduce_mean(vsi_nn_node_t * self, } static vsi_bool caculate_reshape_size(uint32_t* dim_value, - uint32_t* re_sizes, uint32_t* re_sizes2, + vsi_size_t* re_sizes, vsi_size_t* re_sizes2, vx_int32 *resolved_dim, vx_int32 resolved_dim_count) { #define VSI_NN_MAX_IMAGE_WIDTH (65536) vsi_bool enable_reshape = TRUE; - uint32_t size_count = 1; + vsi_size_t size_count = 1; uint32_t i = 0; uint32_t dim_num = *dim_value; if (dim_num > 4) @@ -167,8 +167,8 @@ static vsi_status op_compute vx_int32 resolved_dim[4] = {-1, -1, -1, -1}; vx_int32 resolved_dim_count = 0; uint32_t i = 0; - uint32_t re_sizes[VSI_NN_MAX_DIM_NUM] = {1}; - uint32_t re_sizes2[VSI_NN_MAX_DIM_NUM] = {1}; + vsi_size_t re_sizes[VSI_NN_MAX_DIM_NUM] = {1}; + vsi_size_t re_sizes2[VSI_NN_MAX_DIM_NUM] = {1}; uint32_t dim_num; vsi_nn_tensor_t *mean_tmp_tensor = NULL; vsi_nn_tensor_t *reshaped_input1 = self->nn_param.reduce.local2->reshaped_input1; @@ -535,18 +535,18 @@ static void op_set_reduce_param_value(vsi_nn_nn_param_t *nn_param, } static vsi_bool optimzation_input_size( - const int32_t* shape_x, const size_t rank_x, - int32_t* out_shape_x, int32_t* out_rank_x, - const int32_t* resolved_dim, const int32_t resolved_dim_count, - int32_t* resolved_dim_out, int32_t* resolved_dim_out_count + const vsi_size_t* shape_x, const vsi_size_t rank_x, + vsi_size_t* out_shape_x, vsi_size_t* out_rank_x, + const vsi_size_t* resolved_dim, const vsi_size_t resolved_dim_count, + vsi_size_t* resolved_dim_out, vsi_size_t* resolved_dim_out_count ) { - int32_t i, j, k, out_i; + vsi_size_t i, j, k, out_i; vx_bool is_change = vx_false_e; - int32_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; - int32_t shape_out[VSI_NN_MAX_DIM_NUM] = { 0 }; - int32_t rank_out; - int32_t dim_out; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t shape_out[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t rank_out; + vsi_size_t dim_out; out_i = 0; for (i = 0; i < resolved_dim[0]; i++) @@ -576,7 +576,7 @@ static vsi_bool optimzation_input_size( if (is_change) { vsi_nn_kernel_optimize_element_shape( - shape, (size_t)j, + shape, j, shape_out, &rank_out ); if (2 == rank_out && 1 == shape_out[1]) { @@ -606,7 +606,7 @@ static vsi_bool optimzation_input_size( if (is_change) { vsi_nn_kernel_optimize_element_shape( - shape, (size_t)j, + shape, j, shape_out, &rank_out ); if (2 == rank_out && 1 == shape_out[1]) { @@ -624,7 +624,7 @@ static vsi_bool optimzation_input_size( out_shape_x[out_i++] = shape_x[resolved_dim[resolved_dim_count - 1]]; } - for (i = resolved_dim[resolved_dim_count - 1] + 1; i < (int32_t)rank_x; i++) + for (i = resolved_dim[resolved_dim_count - 1] + 1; i < rank_x; i++) { out_shape_x[out_i++] = shape_x[i]; } @@ -643,14 +643,14 @@ static vsi_bool optimzation_input_size( static vsi_bool op_set_reduce_axis( vsi_nn_node_t * self, vsi_nn_tensor_t ** inputs, - int32_t* out_shape_x, int32_t* out_rank_x + vsi_size_t* out_shape_x, 
vsi_size_t* out_rank_x ) { - uint32_t i = 0, j = 0; - int32_t resolved_dim[4] = {-1, -1, -1, -1}; - int32_t resolved_dim2[4] = {-1, -1, -1, -1}; - int32_t resolved_dim_count = 0; - int32_t resolved_dim_count2 = 0; + vsi_size_t i = 0, j = 0; + vsi_ssize_t resolved_dim[4] = {-1, -1, -1, -1}; + vsi_ssize_t resolved_dim2[4] = {-1, -1, -1, -1}; + vsi_size_t resolved_dim_count = 0; + vsi_size_t resolved_dim_count2 = 0; vsi_bool is_loop = TRUE; for (i = 0; i < self->nn_param.reduce.axis_num; i++) @@ -681,7 +681,7 @@ static vsi_bool op_set_reduce_axis( { if (resolved_dim[j] < resolved_dim[j - 1]) { - vx_int32 temp = 0; + vsi_ssize_t temp = 0; temp = resolved_dim[j]; resolved_dim[j] = resolved_dim[j - 1]; resolved_dim[j - 1] = temp; @@ -721,17 +721,17 @@ static vsi_bool op_set_reduce_axis( else { optimzation_input_size( - (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, - out_shape_x, out_rank_x, resolved_dim, resolved_dim_count, - resolved_dim2, &resolved_dim_count2 ); + inputs[0]->attr.size, inputs[0]->attr.dim_num, + out_shape_x, out_rank_x, (vsi_size_t*)resolved_dim, resolved_dim_count, + (vsi_size_t*)resolved_dim2, &resolved_dim_count2 ); } for (i = 0; i < (uint32_t)resolved_dim_count2; i++) { - self->nn_param.reduce.local2->axes[i] = resolved_dim2[i]; + self->nn_param.reduce.local2->axes[i] = (int32_t)resolved_dim2[i]; } - self->nn_param.reduce.local2->axes_num = resolved_dim_count2; + self->nn_param.reduce.local2->axes_num = (int32_t)resolved_dim_count2; return TRUE; } @@ -750,8 +750,8 @@ static vsi_bool op_set_reduce_internal vsi_nn_internal_node_t* curr = NULL; vsi_nn_internal_tensor_t* tmp_output_tensor[2] = {NULL, NULL}; vsi_bool use_virtual_tensor = TRUE; - uint32_t re_sizes[VSI_NN_MAX_DIM_NUM] = {1}; - uint32_t re_sizes2[VSI_NN_MAX_DIM_NUM] = {1}; + vsi_size_t re_sizes[VSI_NN_MAX_DIM_NUM] = {1}; + vsi_size_t re_sizes2[VSI_NN_MAX_DIM_NUM] = {1}; vsi_nn_tensor_t* new_output = NULL; uint32_t dim_num; vx_int32 resolved_dim_count = 0; @@ -979,8 +979,8 @@ static vsi_bool op_setup vsi_bool ret = TRUE; vsi_nn_tensor_t* reshape_in_t[1] = { NULL }; vsi_nn_tensor_t* reshape_out_t[1] = { NULL }; - int32_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; - int32_t new_rank = 0; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t new_rank = 0; int32_t j; if (self->nn_param.reduce.type != VSI_NN_REDUCE_MEAN && self->nn_param.reduce.type != VSI_NN_REDUCE_SUM && @@ -1052,7 +1052,7 @@ static vsi_bool op_setup return FALSE; } reshape_in_t[0] = vsi_nn_reshape_tensor( self->graph, - inputs[0], (uint32_t*)shape, new_rank ); + inputs[0], shape, new_rank ); self->nn_param.reduce.local2->reshaped_input1 = reshape_in_t[0]; for (j = 0; j < self->nn_param.reduce.local2->axes_num; j++) @@ -1061,7 +1061,7 @@ static vsi_bool op_setup } reshape_out_t[0] = vsi_nn_reshape_tensor( self->graph, - outputs[0], (uint32_t*)shape, new_rank ); + outputs[0], shape, new_rank ); self->nn_param.reduce.local2->reshaped_output1 = reshape_out_t[0]; if (self->nn_param.reduce.type == VSI_NN_REDUCE_SUM) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce_internal.c index 1bcc83f..081f287 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce_internal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce_internal.c @@ -51,7 +51,7 @@ static vsi_status _reduce_internal_op_compute { vsi_status status; vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; - int32_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 0 } }; + vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 0 } }; uint32_t 
rank_in = 0; uint32_t rank_out = 0; int32_t axis = 0; @@ -103,9 +103,9 @@ static vsi_status _reduce_internal_op_compute } ret = vsi_nn_kernel_optimize_reduce_shape( - (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, + inputs[0]->attr.size, inputs[0]->attr.dim_num, &axis, 1, - (int32_t *)outputs[0]->attr.size, outputs[0]->attr.dim_num, + outputs[0]->attr.size, outputs[0]->attr.dim_num, shapes[0], &rank_in, shapes[1], &rank_out, &new_axis, &axis_size); @@ -115,9 +115,9 @@ static vsi_status _reduce_internal_op_compute if( ret ) { reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, - inputs[0], (uint32_t*)shapes[0], rank_in ); + inputs[0], shapes[0], rank_in ); reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, - outputs[0], (uint32_t*)shapes[1], rank_out ); + outputs[0], shapes[1], rank_out ); self->n = (vx_node)vsi_nn_kernel_selector( self->graph, kernel_name, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reducesum_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reducesum_internal.c index a77d54e..dd41b6a 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reducesum_internal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reducesum_internal.c @@ -45,7 +45,7 @@ static vsi_status op_compute ) { vsi_status status = VSI_FAILURE; - int32_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 0 } }; + vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = { { 0 } }; uint32_t rank_in = 0; uint32_t rank_out = 0; int32_t new_axis[VSI_NN_MAX_DIM_NUM]; @@ -53,10 +53,10 @@ static vsi_status op_compute vsi_bool ret; ret = vsi_nn_kernel_optimize_reduce_shape( - (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, + inputs[0]->attr.size, inputs[0]->attr.dim_num, (int32_t *)(self->nn_param.reducesum_internal.axis), self->nn_param.reducesum_internal.axis_num, - (int32_t *)outputs[0]->attr.size, outputs[0]->attr.dim_num, + outputs[0]->attr.size, outputs[0]->attr.dim_num, shapes[0], &rank_in, shapes[1], &rank_out, new_axis, &axis_size); @@ -64,10 +64,10 @@ static vsi_status op_compute { self->nn_param.reducesum_internal.local->reshaped_input = vsi_nn_reshape_tensor( self->graph, - inputs[0], (uint32_t*)shapes[0], rank_in ); + inputs[0], shapes[0], rank_in ); self->nn_param.reducesum_internal.local->reshaped_output = vsi_nn_reshape_tensor( self->graph, - outputs[0], (uint32_t*)shapes[1], rank_out ); + outputs[0], shapes[1], rank_out ); self->n = vxTensorReduceSumNode( self->graph->g, self->nn_param.reducesum_internal.local->reshaped_input->t, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_relational_ops.c b/src/tim/vx/internal/src/ops/vsi_nn_op_relational_ops.c index 2f08f5e..211ab7d 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_relational_ops.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_relational_ops.c @@ -48,8 +48,8 @@ static vsi_status _comparisons_op_compute { vsi_status status; vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; - int32_t shapes[3][VSI_NN_MAX_DIM_NUM] = { { 0 } }; - uint32_t new_rank = 0; + vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = { { 0 } }; + vsi_size_t new_rank = 0; vsi_bool ret; vsi_nn_kernel_param_t * param = NULL; vsi_nn_relational_ops_type_t op_type = self->nn_param.relational_ops.op; @@ -63,19 +63,19 @@ static vsi_status _comparisons_op_compute // TODO: This optimzie is a hack for gpu path, // it should be moved to gpu kernel setup. 
ret = vsi_nn_kernel_optimize_eltwise_shape( - (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, - (int32_t *)inputs[1]->attr.size, inputs[1]->attr.dim_num, - (int32_t *)outputs[0]->attr.size, outputs[0]->attr.dim_num, + inputs[0]->attr.size, inputs[0]->attr.dim_num, + inputs[1]->attr.size, inputs[1]->attr.dim_num, + outputs[0]->attr.size, outputs[0]->attr.dim_num, shapes[0], shapes[1], shapes[2], &new_rank ); if( ret ) { // Add params reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, - inputs[0], (uint32_t*)shapes[0], new_rank ); + inputs[0], shapes[0], new_rank ); reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, - inputs[1], (uint32_t*)shapes[1], new_rank ); + inputs[1], shapes[1], new_rank ); reshape_tensors[2] = vsi_nn_reshape_tensor( self->graph, - outputs[0], (uint32_t*)shapes[2], new_rank ); + outputs[0], shapes[2], new_rank ); if (shapes[1][3] > shapes[0][3] && new_rank == 4) { @@ -180,8 +180,8 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { - uint32_t i, out_rank, in1_rank, in2_rank; - uint32_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t i, out_rank, in1_rank, in2_rank; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; vsi_bool ret = TRUE; in1_rank = inputs[0]->attr.dim_num; @@ -190,26 +190,26 @@ static vsi_bool op_setup for(i = 0; i < out_rank; i++) { - uint32_t sz0, sz1; + vsi_size_t sz0, sz1; sz0 = i < in1_rank ? inputs[0]->attr.size[i] : 1; sz1 = i < in2_rank ? inputs[1]->attr.size[i] : 1; shape[i] = vsi_nn_max( sz0, sz1 ); } if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { - outputs[0]->attr.dim_num = out_rank; - memcpy( outputs[0]->attr.size, shape, out_rank * sizeof(uint32_t) ); + outputs[0]->attr.dim_num = (uint32_t)out_rank; + memcpy( outputs[0]->attr.size, shape, out_rank * sizeof(vsi_size_t) ); } else { - uint32_t total_size_got; - uint32_t total_size_expected; + vsi_size_t total_size_got; + vsi_size_t total_size_expected; total_size_expected = vsi_nn_ShapeProduct( shape, out_rank ); total_size_got = vsi_nn_ShapeProduct( outputs[0]->attr.size, outputs[0]->attr.dim_num ); if( total_size_expected != total_size_got ) { - VSILOGW("Output size mismatch, expect %d, but got %d", + VSILOGW("Output size mismatch, expect %"VSI_SIZE_T_SPECIFIER", but got %"VSI_SIZE_T_SPECIFIER"", total_size_expected, total_size_got); ret = FALSE; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_relu.c b/src/tim/vx/internal/src/ops/vsi_nn_op_relu.c index 7045e61..fd44f36 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_relu.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_relu.c @@ -68,43 +68,11 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { - /* check inputs outputs data type */ - BEGIN_IO_TYPE_DECL(RELU, 1, 1) - /* IO_TYPE(INPUT, OUTPUT) */ - IO_TYPE(D_F32, D_F32) - IO_TYPE(D_F32, D_F16) - IO_TYPE(D_F32, D_BF16) + vsi_bool ret = FALSE; - IO_TYPE(D_F16, D_F32) - IO_TYPE(D_F16, D_F16) - IO_TYPE(D_F16, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_I8|Q_DFP) - IO_TYPE(D_F16, D_I16|Q_DFP) + ret = vsi_nn_OpCheck(VSI_NN_OP_RSQRT, self, inputs, outputs); - IO_TYPE(D_BF16, D_BF16) - IO_TYPE(D_BF16, D_F32) - - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_F16) - - IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) - IO_TYPE(D_I8|Q_ASYM, D_F16) - - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16) - - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_F16) - END_IO_TYPE_DECL(RELU) - if(!VALIDATE_OP_IO_TYPES(RELU, self, inputs, self->input.num, outputs, self->output.num)) { - char* desc = generate_op_io_types_desc(inputs, - self->input.num, outputs, 
self->output.num); - VSILOGE("Inputs/Outputs data type not support: %s", desc); - destroy_op_io_types_desc(desc); - return FALSE; - } - - return TRUE; + return ret; } /* op_check() */ #ifdef __cplusplus @@ -126,4 +94,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_relu_keras_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_relu_keras_internal.c index aa5e8f5..afdb226 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_relu_keras_internal.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_relu_keras_internal.c @@ -52,8 +52,8 @@ static vsi_status op_compute { vsi_status status = VSI_FAILURE; vsi_nn_tensor_t* reshape_tensors[2] = { NULL }; - int32_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; - int32_t new_rank = 0; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t new_rank = 0; vsi_bool ret; vsi_nn_kernel_param_t * param = NULL; vsi_nn_relu_keras_internal_param * p = NULL; @@ -73,7 +73,7 @@ static vsi_status op_compute param = vsi_nn_kernel_param_create(); ret = vsi_nn_kernel_optimize_element_shape( - (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, + inputs[0]->attr.size, inputs[0]->attr.dim_num, shape, &new_rank ); vsi_nn_kernel_param_add_float32( param, "alpha", alpha ); @@ -83,9 +83,9 @@ static vsi_status op_compute if( ret ) { reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, - inputs[0], (uint32_t*)shape, new_rank ); + inputs[0], shape, new_rank ); reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, - outputs[0], (uint32_t*)shape, new_rank ); + outputs[0], shape, new_rank ); self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "relu_keras", diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reorg.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reorg.c index 10731cd..37e1082 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reorg.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reorg.c @@ -67,31 +67,11 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { - BEGIN_IO_TYPE_DECL(REORG, 1, 1) - IO_TYPE(D_F16, D_F16) - IO_TYPE(D_F16, D_I8|Q_DFP) - IO_TYPE(D_I8|Q_DFP, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_F32, D_F32) - IO_TYPE(D_F16, D_F32) - IO_TYPE(D_F32, D_F16) - IO_TYPE(D_F32, D_BF16) - IO_TYPE(D_BF16, D_F32) + vsi_bool ret = FALSE; - /* HW 9.0 */ - IO_TYPE(D_BF16, D_BF16) - END_IO_TYPE_DECL(REORG) - if(!VALIDATE_OP_IO_TYPES(REORG, self, inputs, self->input.num, outputs, self->output.num)) { - char* desc = generate_op_io_types_desc(inputs, - self->input.num, outputs, self->output.num); - VSILOGE("Inputs/Outputs data type not support: %s", desc); - destroy_op_io_types_desc(desc); - return FALSE; - } + ret = vsi_nn_OpCheck(VSI_NN_OP_STRIDED_SLICE, self, inputs, outputs); - return TRUE; + return ret; } /* op_check() */ static vsi_bool op_setup @@ -107,7 +87,7 @@ static vsi_bool op_setup { outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; memcpy( outputs[0]->attr.size, inputs[0]->attr.size, - VSI_NN_MAX_DIM_NUM * sizeof( uint32_t ) ); + VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t) ); outputs[0]->attr.size[0] = inputs[0]->attr.size[0] / stride; outputs[0]->attr.size[1] = inputs[0]->attr.size[1] / stride; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_repeat.c b/src/tim/vx/internal/src/ops/vsi_nn_op_repeat.c index 3200fe5..a10cbe6 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_repeat.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_repeat.c @@ -51,7 +51,7 @@ static vsi_status _create_local_tensor int32_t* repeat_host = 
self->nn_param.repeat.repeat_host; int32_t axis = self->nn_param.repeat.axis; vsi_nn_repeat_lcl_data *local = self->nn_param.repeat.local; - uint32_t shape[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1}; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {1, 1, 1, 1}; uint32_t i = 0; if (axis == -1) @@ -75,7 +75,7 @@ static vsi_status _create_local_tensor if (repeat_host) { vsi_nn_tensor_attr_t attr; - int32_t len = 0; + vsi_ssize_t len = 0; if (self->nn_param.repeat.axis < 0) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c index 91fcc24..daeb768 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reshape.c @@ -83,9 +83,9 @@ static vsi_bool op_setup vsi_bool ret = TRUE; if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { - uint32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = {0}; memcpy(shape, self->nn_param.reshape.size, - sizeof(uint32_t) * self->nn_param.reshape.dim_num); + sizeof(vsi_size_t) * self->nn_param.reshape.dim_num); ret = vsi_nn_CalcReshapeTensor(inputs[0], outputs[0], shape, @@ -115,8 +115,13 @@ static vsi_status op_optimize { if(NULL == inputs[0]->t && NULL != outputs[0]->t) { +#ifdef VSI_40BIT_VA_SUPPORT inputs[0]->t = vxReshapeTensor( outputs[0]->t, - (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num ); + inputs[0]->attr.size, inputs[0]->attr.dim_num ); +#else + inputs[0]->t = vxReshapeTensor( outputs[0]->t, + (int32_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num ); +#endif if( inputs[0]->t == NULL ) { status = VSI_FAILURE; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c b/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c index 255388e..9454c42 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_resize.c @@ -54,7 +54,7 @@ static vsi_bool _is_same_shape ( vsi_nn_tensor_t * inputs, - uint32_t *sizes, + vsi_size_t *sizes, uint32_t dims ) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_resize_1d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_resize_1d.c index 235d0c3..c05ec67 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_resize_1d.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_resize_1d.c @@ -44,7 +44,7 @@ static vsi_bool _is_same_shape ( vsi_nn_tensor_t * inputs, - uint32_t *sizes, + vsi_size_t *sizes, uint32_t dims ) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_rnncell_ovxlib.c b/src/tim/vx/internal/src/ops/vsi_nn_op_rnncell_ovxlib.c index 721da3b..ec8b441 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_rnncell_ovxlib.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_rnncell_ovxlib.c @@ -80,8 +80,8 @@ static vsi_bool setup_op_shapes { vsi_nn_tensor_attr_t attr; vsi_nn_internal_tensor_t* output_tensor = NULL; - uint32_t output_size = 0; - uint32_t batch_size = 0; + vsi_size_t output_size = 0; + vsi_size_t batch_size = 0; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); batch_size = inputs[RNNCELL_INPUT_INPUT]->attr.size[1]; @@ -103,7 +103,7 @@ static vsi_bool setup_op_shapes if( !outputs[RNNCELL_OUTPUT_H_STATE] ) { - memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(uint32_t)); + memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); attr.dim_num = VSI_NN_DIM_AUTO; memcpy( &attr.dtype, &outputs[RNNCELL_OUTPUT_OUTPUT]->attr.dtype, sizeof( attr.dtype ) ); attr.vtl = TRUE; @@ -129,7 +129,7 @@ static vsi_bool setup_op_shapes outputs[RNNCELL_OUTPUT_OUTPUT]->attr.dim_num; memcpy( outputs[RNNCELL_OUTPUT_H_STATE]->attr.size, outputs[RNNCELL_OUTPUT_OUTPUT]->attr.size, - 
VSI_NN_MAX_DIM_NUM * sizeof( uint32_t ) ); + VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t) ); } return TRUE; } @@ -165,7 +165,7 @@ static vsi_bool op_setup memset(p->local, 0x00, sizeof(vsi_nn_rnncell_ovxlib_lcl_data_t)); memset(&attr, 0x00, sizeof(attr)); - p->local->multi_batch = (inputs[RNNCELL_INPUT_INPUT]->attr.size[1]); + p->local->multi_batch = (vsi_bool)(inputs[RNNCELL_INPUT_INPUT]->attr.size[1]); if( inputs[RNNCELL_INPUT_INPUT]->attr.dtype.qnt_type != inputs[RNNCELL_INPUT_WEIGHT_I]->attr.dtype.qnt_type) @@ -226,7 +226,7 @@ static vsi_bool op_setup { /* reshape and transpose input */ vsi_nn_rnn_find_best_kernel_size(p->local->multi_batch, - inputs[RNNCELL_INPUT_INPUT]->attr.size[0], + (uint32_t)inputs[RNNCELL_INPUT_INPUT]->attr.size[0], &kernel_h, &kernel_w); input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, inputs[RNNCELL_INPUT_INPUT], p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); @@ -245,7 +245,7 @@ static vsi_bool op_setup { /* reshape and transpose input */ vsi_nn_rnn_find_best_kernel_size(p->local->multi_batch, - inputs[RNNCELL_INPUT_AUX_INPUT]->attr.size[0], + (uint32_t)inputs[RNNCELL_INPUT_AUX_INPUT]->attr.size[0], &kernel_h, &kernel_w); input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, inputs[RNNCELL_INPUT_AUX_INPUT], @@ -278,7 +278,7 @@ static vsi_bool op_setup { /* reshape and transpose input */ vsi_nn_rnn_find_best_kernel_size(p->local->multi_batch, - inputs[RNNCELL_INPUT_H_STATE]->attr.size[0], &kernel_h, &kernel_w); + (uint32_t)inputs[RNNCELL_INPUT_H_STATE]->attr.size[0], &kernel_h, &kernel_w); hstate_input_tensor = vsi_nn_rnn_process_input_for_nn_fc(self, inputs[RNNCELL_INPUT_H_STATE], p->local->multi_batch, kernel_h, kernel_w, use_virtual_tensor); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_roi_pool.c b/src/tim/vx/internal/src/ops/vsi_nn_op_roi_pool.c index 9470ffc..7c39210 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_roi_pool.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_roi_pool.c @@ -151,7 +151,7 @@ static vsi_status op_optimize vsi_nn_opt_direction_e direction ) { - int32_t size[VSI_NN_MAX_DIM_NUM]; + vsi_size_t size[VSI_NN_MAX_DIM_NUM]; uint32_t dim; vx_tensor rois_tmp; @@ -170,7 +170,19 @@ static vsi_status op_optimize { size[2] = inputs[1]->attr.size[0]; size[3] = inputs[1]->attr.size[1]; +#ifdef VSI_40BIT_VA_SUPPORT rois_tmp = vxReshapeTensor(inputs[1]->t, size, dim); +#else + { + vsi_size_t i; + int32_t size_32bit[VSI_NN_MAX_DIM_NUM]; + for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++) + { + size_32bit[i] = (int32_t)size[i]; + } + rois_tmp = vxReshapeTensor(inputs[1]->t, size_32bit, dim); + } +#endif if(NULL == rois_tmp) { return VSI_FAILURE; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_rsqrt.c b/src/tim/vx/internal/src/ops/vsi_nn_op_rsqrt.c index 0a90083..90395c0 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_rsqrt.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_rsqrt.c @@ -69,22 +69,43 @@ static vsi_bool op_check ) { BEGIN_IO_TYPE_DECL(RSQRT, 1, 1) - IO_TYPE(D_F16, D_F16) - IO_TYPE(D_F16, D_I16|Q_DFP) - IO_TYPE(D_F16, D_I8|Q_DFP) - IO_TYPE(D_F16, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_I16|Q_DFP, D_F16) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_BF16, D_BF16) - IO_TYPE(D_BF16, D_F32) - IO_TYPE(D_F32, D_BF16) - IO_TYPE(D_F32, D_F32) - IO_TYPE(D_F32, D_U8|Q_ASYM) - IO_TYPE(D_U8|Q_ASYM, D_F32) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_U8|Q_ASYM) + 
IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_BF16, D_F32) + IO_TYPE(D_F32, D_BF16) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F32) + + /* HW 9.0.1 */ + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP) + + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_F32) + + IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_BF16) + IO_TYPE(D_I16|Q_DFP, D_F32) + + IO_TYPE(D_F16, D_BF16) + IO_TYPE(D_F16, D_F32) + END_IO_TYPE_DECL(RSQRT) if(!VALIDATE_OP_IO_TYPES(RSQRT, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd.c b/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd.c index 3c28b02..c04009a 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd.c @@ -51,12 +51,12 @@ static vsi_status op_compute uint32_t i = 0; uint32_t block_size = 1, coord_dim = 1; uint32_t idx_num = 1; - uint32_t *input_size = inputs[1]->attr.size; + vsi_size_t *input_size = inputs[1]->attr.size; uint32_t dims_num = inputs[1]->attr.dim_num; if(inputs[0]->attr.dim_num > 1) { - coord_dim = inputs[0]->attr.size[0]; + coord_dim = (uint32_t)inputs[0]->attr.size[0]; } if( coord_dim > 3 ) { @@ -65,7 +65,7 @@ static vsi_status op_compute } for(i = 0; i < inputs[0]->attr.dim_num; i++) { - idx_num *= inputs[0]->attr.size[i]; + idx_num *= (uint32_t)inputs[0]->attr.size[i]; } idx_num /= coord_dim; @@ -73,7 +73,7 @@ static vsi_status op_compute for(i = 0; i < dims_num; ++i) { - block_size *= input_size[i]; + block_size *= (uint32_t)input_size[i]; } block_size /= idx_num; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd_update.c b/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd_update.c index 190b04c..e2897e4 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd_update.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_scatter_nd_update.c @@ -51,12 +51,12 @@ static vsi_status op_compute uint32_t i = 0; uint32_t block_size = 1, coord_dim = 1; uint32_t idx_num = 1; - uint32_t *input_size = inputs[2]->attr.size; + vsi_size_t *input_size = inputs[2]->attr.size; uint32_t dims_num = inputs[2]->attr.dim_num; if (inputs[1]->attr.dim_num > 1) { - coord_dim = inputs[1]->attr.size[0]; + coord_dim = (uint32_t)inputs[1]->attr.size[0]; } if ( coord_dim > 4 && input_size[dims_num - 1] > 1) { @@ -65,7 +65,7 @@ static vsi_status op_compute } for(i = 0; i < inputs[1]->attr.dim_num; i++) { - idx_num *= inputs[1]->attr.size[i]; + idx_num *= (uint32_t)inputs[1]->attr.size[i]; } idx_num /= coord_dim; @@ -73,7 +73,7 @@ static vsi_status op_compute for(i = 0; i < dims_num; ++i) { - block_size *= input_size[i]; + block_size *= (uint32_t)input_size[i]; } block_size /= idx_num; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_select.c b/src/tim/vx/internal/src/ops/vsi_nn_op_select.c index a01b758..fd01b8a 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_select.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_select.c @@ -49,10 +49,10 @@ static vsi_status op_compute { vsi_status status = VSI_FAILURE; vsi_nn_tensor_t* 
reshape_tensors[_IO_NUM] = { NULL }; - int32_t shapes[_IO_NUM][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; - int32_t* shapes_ptr[_IO_NUM]; - int32_t* shapes_in[_INPUT_NUM]; - size_t rank_in[_INPUT_NUM]; + vsi_size_t shapes[_IO_NUM][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; + vsi_size_t* shapes_ptr[_IO_NUM]; + vsi_size_t* shapes_in[_INPUT_NUM]; + vsi_size_t rank_in[_INPUT_NUM]; uint32_t new_rank = 0; int32_t i = 0; vsi_bool ret; @@ -69,13 +69,13 @@ static vsi_status op_compute for (i = 0; i < _INPUT_NUM; i++) { - shapes_in[i] = (int32_t *)inputs[i]->attr.size; - rank_in[i] = (size_t)inputs[i]->attr.dim_num; + shapes_in[i] = inputs[i]->attr.size; + rank_in[i] = (vsi_size_t)inputs[i]->attr.dim_num; } ret = vsi_nn_kernel_optimize_broadcast_shape( - (const int32_t**)shapes_in, (const size_t*)rank_in, _INPUT_NUM, - (int32_t *)outputs[0]->attr.size, outputs[0]->attr.dim_num, + (const vsi_size_t**)shapes_in, rank_in, _INPUT_NUM, + outputs[0]->attr.size, outputs[0]->attr.dim_num, shapes_ptr, shapes[_INPUT_NUM], &new_rank); if( ret ) @@ -83,13 +83,13 @@ static vsi_status op_compute for (i = 0; i < _INPUT_NUM; i++) { reshape_tensors[i] = vsi_nn_reshape_tensor( self->graph, - inputs[i], (uint32_t*)shapes[i], new_rank ); + inputs[i], shapes[i], new_rank ); } for (i = 0; i < _OUTPUT_NUM; i++) { reshape_tensors[i + _INPUT_NUM] = vsi_nn_reshape_tensor( self->graph, - outputs[i], (uint32_t*)shapes[i + _INPUT_NUM], new_rank ); + outputs[i], shapes[i + _INPUT_NUM], new_rank ); } self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "select", @@ -152,8 +152,8 @@ static vsi_bool op_setup vsi_nn_tensor_t ** outputs ) { - uint32_t i, out_rank, in0_rank, in1_rank, in2_rank; - uint32_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t i, out_rank, in0_rank, in1_rank, in2_rank; + vsi_size_t shape[VSI_NN_MAX_DIM_NUM] = { 0 }; vsi_bool ret = TRUE; in0_rank = inputs[0]->attr.dim_num; @@ -163,7 +163,7 @@ static vsi_bool op_setup for(i = 0; i < out_rank; i++) { - uint32_t sz0, sz1, sz2; + vsi_size_t sz0, sz1, sz2; sz0 = i < in0_rank ? inputs[0]->attr.size[i] : 1; sz1 = i < in1_rank ? inputs[1]->attr.size[i] : 1; sz2 = i < in2_rank ? 
inputs[2]->attr.size[i] : 1; @@ -172,19 +172,19 @@ static vsi_bool op_setup if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { - outputs[0]->attr.dim_num = out_rank; - memcpy( outputs[0]->attr.size, shape, out_rank * sizeof(uint32_t) ); + outputs[0]->attr.dim_num = (uint32_t)out_rank; + memcpy( outputs[0]->attr.size, shape, out_rank * sizeof(vsi_size_t) ); } else { - uint32_t total_size_got; - uint32_t total_size_expected; + vsi_size_t total_size_got; + vsi_size_t total_size_expected; total_size_expected = vsi_nn_ShapeProduct( shape, out_rank ); total_size_got = vsi_nn_ShapeProduct( outputs[0]->attr.size, outputs[0]->attr.dim_num ); if( total_size_expected != total_size_got ) { - VSILOGW("Output size mismatch, expect %d, but got %d", + VSILOGW("Output size mismatch, expect %"VSI_SIZE_T_SPECIFIER", but got %"VSI_SIZE_T_SPECIFIER"", total_size_expected, total_size_got); ret = FALSE; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_shufflechannel.c b/src/tim/vx/internal/src/ops/vsi_nn_op_shufflechannel.c index b45f405..4d8d7fc 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_shufflechannel.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_shufflechannel.c @@ -130,7 +130,7 @@ static vsi_bool op_setup { outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; memcpy( outputs[0]->attr.size, inputs[0]->attr.size, - sizeof(uint32_t) * inputs[0]->attr.dim_num ); + sizeof(vsi_size_t) * inputs[0]->attr.dim_num ); } return TRUE; @@ -147,7 +147,7 @@ static vsi_bool op_check int32_t axis = 0; int32_t dims = (int32_t)inputs[0]->attr.dim_num; int32_t num_group = 0; - uint32_t *shape = inputs[0]->attr.size; + vsi_size_t *shape = inputs[0]->attr.size; p = &(self->nn_param.shufflechannel); axis = p->axis; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_signal_frame.c b/src/tim/vx/internal/src/ops/vsi_nn_op_signal_frame.c index 4cbe3f0..620eb8e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_signal_frame.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_signal_frame.c @@ -116,8 +116,8 @@ static vsi_bool op_setup uint32_t i = 0; vsi_bool ret = 0; uint32_t axis = 0; - uint32_t num_frames = 0; - uint32_t frame_axis = 0; + vsi_size_t num_frames = 0; + vsi_size_t frame_axis = 0; uint32_t frame_step = 0; uint32_t frame_length = 0; vsi_nn_signalframe_param *p = &self->nn_param.signalframe; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_softmax.c b/src/tim/vx/internal/src/ops/vsi_nn_op_softmax.c index b71c583..9d6d9d5 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_softmax.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_softmax.c @@ -40,7 +40,7 @@ static vsi_bool _is_same_shape ( vsi_nn_tensor_t * inputs, - uint32_t *sizes, + vsi_size_t *sizes, uint32_t dims ) { @@ -131,7 +131,7 @@ static vsi_status op_optimize vsi_nn_internal_node_t* curr = NULL; vsi_nn_softmax_param * p; uint32_t dim_num; - uint32_t sizes[VSI_NN_MAX_DIM_NUM] = {1}; + vsi_size_t sizes[VSI_NN_MAX_DIM_NUM] = {1}; uint32_t i = 0; int32_t axis = -1; vsi_nn_tensor_t* new_input = NULL; @@ -146,8 +146,8 @@ static vsi_status op_optimize axis = p->axis; if (axis != VSI_NN_SOFTMAX_DEFAULT_AXIS) { - uint32_t innerSize = 1; - uint32_t outerSize = 1; + vsi_size_t innerSize = 1; + vsi_size_t outerSize = 1; for (i = 0; i < (uint32_t)axis; i++) { sizes[i] = inputs[0]->attr.size[i]; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c b/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c index 4d9d80e..ce0b2e4 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c +++ 
b/src/tim/vx/internal/src/ops/vsi_nn_op_softmax_internal.c @@ -173,8 +173,9 @@ static vsi_status op_optimize { vsi_status status; vx_tensor in_view_tensor,out_view_tensor; - uint32_t start[VSI_NN_MAX_DIM_NUM],end[VSI_NN_MAX_DIM_NUM]; - uint32_t axis, batch_size; + vsi_size_t start[VSI_NN_MAX_DIM_NUM],end[VSI_NN_MAX_DIM_NUM]; + uint32_t axis; + vsi_size_t batch_size; in_view_tensor = NULL; out_view_tensor = NULL; @@ -200,8 +201,8 @@ static vsi_status op_optimize axis = 1; /* we only split 2D softmax, so the axis = batch dim */ batch_size = inputs[0]->attr.size[1]; - memset( start, 0, sizeof( uint32_t ) * VSI_NN_MAX_DIM_NUM ); - memset( end, 0, sizeof( uint32_t ) * VSI_NN_MAX_DIM_NUM ); + memset( start, 0, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM ); + memset( end, 0, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM ); end[0] = inputs[0]->attr.size[0]; end[1] = inputs[0]->attr.size[1]; end[2] = inputs[0]->attr.size[2]; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_space2batch.c b/src/tim/vx/internal/src/ops/vsi_nn_op_space2batch.c index 72be798..599c78a 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_space2batch.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_space2batch.c @@ -111,6 +111,8 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + vsi_bool ret = FALSE; + if (inputs[0]->attr.dim_num != 4) { VSILOGE("The input tensor shape must be 4-D!(space2batch)"); @@ -128,25 +130,9 @@ static vsi_bool op_check return FALSE; } - { - BEGIN_IO_TYPE_DECL(SPACE2DEPTH, 1, 1) - IO_TYPE(D_F16, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_F32, D_F32) - IO_TYPE(D_F32, D_BF16) - IO_TYPE(D_BF16, D_F32) - END_IO_TYPE_DECL(SPACE2DEPTH) - if (!VALIDATE_OP_IO_TYPES(SPACE2DEPTH, self, inputs, self->input.num, outputs, self->output.num)) { - char* desc = generate_op_io_types_desc(inputs, - self->input.num, outputs, self->output.num); - VSILOGE("Inputs/Outputs data type not support: %s", desc); - destroy_op_io_types_desc(desc); - return FALSE; - } - } - return TRUE; + ret = vsi_nn_OpCheck(VSI_NN_OP_STRIDED_SLICE, self, inputs, outputs); + + return ret; } /* op_add_check() */ static vsi_bool op_setup @@ -211,4 +197,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth.c b/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth.c index e0ef8c7..d6e201e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_space2depth.c @@ -120,6 +120,8 @@ static vsi_bool op_check vsi_nn_tensor_t ** outputs ) { + vsi_bool ret = FALSE; + if (self->nn_param.space2depth.block_size[0] < 0 || self->nn_param.space2depth.block_size[1] < 0) { @@ -127,29 +129,9 @@ static vsi_bool op_check return FALSE; } - { - BEGIN_IO_TYPE_DECL(SPACE2DEPTH, 1, 1) - IO_TYPE(D_F16, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_F32, D_F32) - IO_TYPE(D_F32, D_BF16) - IO_TYPE(D_BF16, D_F32) + ret = vsi_nn_OpCheck(VSI_NN_OP_STRIDED_SLICE, self, inputs, outputs); - /* HW 9.0 */ - IO_TYPE(D_BF16, D_BF16) - END_IO_TYPE_DECL(SPACE2DEPTH) - if (!VALIDATE_OP_IO_TYPES(SPACE2DEPTH, self, inputs, self->input.num, outputs, self->output.num)) { - char* desc = generate_op_io_types_desc(inputs, - self->input.num, outputs, self->output.num); - VSILOGE("Inputs/Outputs data type not support: %s", desc); - destroy_op_io_types_desc(desc); - return FALSE; - } - } - - return TRUE; + return ret; } /* op_check() */ static 
vsi_bool op_set_space2depth_internal diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_split.c b/src/tim/vx/internal/src/ops/vsi_nn_op_split.c index 831570f..a510217 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_split.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_split.c @@ -179,9 +179,10 @@ static vsi_bool op_setup ) { vsi_bool ret; - uint32_t i,num,average; - uint32_t start[VSI_NN_MAX_DIM_NUM] = { 0 }; - uint32_t end[VSI_NN_MAX_DIM_NUM] = { 0 }; + uint32_t i,num; + vsi_size_t average; + vsi_size_t start[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t end[VSI_NN_MAX_DIM_NUM] = { 0 }; uint32_t axis = self->nn_param.split.axis; const uint32_t *slices = self->nn_param.split.slices; uint32_t slices_num = self->nn_param.split.slices_num; @@ -230,8 +231,11 @@ static vsi_bool op_setup outputs[i]->attr.size[j] = inputs[0]->attr.size[j]; } outputs[i]->attr.size[axis] = end[axis] - start[axis]; - memcpy(p->lcl_data->begin_dims, start, VSI_NN_MAX_DIM_NUM * sizeof(uint32_t)); - memcpy(p->lcl_data->end_dims, end, VSI_NN_MAX_DIM_NUM * sizeof(uint32_t)); + for(j = 0; j < VSI_NN_MAX_DIM_NUM; j++) + { + p->lcl_data->begin_dims[j] = (int32_t)start[j]; + p->lcl_data->end_dims[j] = (int32_t)end[j]; + } curr = vsi_nn_internal_new_node( self, VSI_NN_OP_STRIDED_SLICE, 0, 0 ); curr->node->nn_param.strided_slice.begin_dims = p->lcl_data->begin_dims; curr->node->nn_param.strided_slice.begin_dims_num = inputs[0]->attr.dim_num; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_square.c b/src/tim/vx/internal/src/ops/vsi_nn_op_square.c index 1124ef7..1e0144c 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_square.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_square.c @@ -32,6 +32,7 @@ #include "vsi_nn_tensor.h" #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" static vsi_status op_compute @@ -41,22 +42,16 @@ static vsi_status op_compute vsi_nn_tensor_t ** outputs ) { - vsi_status status; - status = VSI_FAILURE; + vsi_status status = VSI_FAILURE; - self->n = vxActivationLayer( - self->graph->g, - inputs[0]->t, - VX_NN_ACTIVATION_SQUARE, - 0, - 0, - outputs[0]->t - ); + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "square", + inputs, 1, outputs, 1, NULL ); - if( NULL != self->n ) + if( self->n ) { status = VSI_SUCCESS; } + return status; } /* op_compute() */ @@ -93,4 +88,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_stack.c b/src/tim/vx/internal/src/ops/vsi_nn_op_stack.c index 7f47cfa..a514514 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_stack.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_stack.c @@ -76,11 +76,11 @@ static vsi_bool op_setup { vsi_nn_stack_param * p; uint32_t i, j; - uint32_t block_size = 1; - uint32_t block_num = 1; + vsi_size_t block_size = 1; + vsi_size_t block_num = 1; uint32_t axis; - uint32_t input_shape[2] = {1, 1}; - uint32_t output_shape[2] = {1, 1}; + vsi_size_t input_shape[2] = {1, 1}; + vsi_size_t output_shape[2] = {1, 1}; vsi_nn_internal_node_t* curr = NULL; vsi_nn_tensor_t *output_rs = NULL; vsi_nn_stack_lcl_data * data; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c b/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c index 0d84833..f743a52 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c @@ -131,14 +131,14 @@ static vsi_bool _get_stride_slice_start_stop_stride { int32_value = params->begin_dims[i]; - start[i] = get_slice_axis_value(int32_value, inputs[0]->attr.size[i]); + 
start[i] = get_slice_axis_value(int32_value, (uint32_t)inputs[0]->attr.size[i]); } for (i = 0; i < params->end_dims_num; ++i) { int32_value = params->end_dims[i]; - stop[i] = get_slice_axis_value(int32_value, inputs[0]->attr.size[i]); + stop[i] = get_slice_axis_value(int32_value, (uint32_t)inputs[0]->attr.size[i]); } /*if the ith bit of mask is set, the start or stop will be the fullest possible range in that dimension.*/ @@ -146,7 +146,7 @@ static vsi_bool _get_stride_slice_start_stop_stride { if (params->begin_mask & (1 << i)) { - start[i] = get_slice_mask_start_value(stride[i], inputs[0]->attr.size[i]); + start[i] = get_slice_mask_start_value(stride[i], (uint32_t)inputs[0]->attr.size[i]); } start[i] = vsi_nn_clamp(start[i], 0, (vx_int32)(inputs[0]->attr.size[i] - 1)); @@ -156,12 +156,12 @@ static vsi_bool _get_stride_slice_start_stop_stride stop[i] = start[i] + 1; } - if (p->end_mask & (1 << i)) + if (params->end_mask & (1 << i)) { - stop[i] = get_slice_mask_stop_value(stride[i], inputs[0]->attr.size[i]); + stop[i] = (int32_t)get_slice_mask_stop_value(stride[i], (uint32_t)inputs[0]->attr.size[i]); } - stop[i] = get_slice_clamp_stop(stride[i], stop[i], inputs[0]->attr.size[i]); + stop[i] = (int32_t)get_slice_clamp_stop(stride[i], stop[i], (uint32_t)inputs[0]->attr.size[i]); } /* reset start stop and stride when output size is 1*/ @@ -189,13 +189,13 @@ static vsi_bool _get_stride_slice_start_stop_stride static vsi_bool _check_is_same_shape( vsi_nn_tensor_t ** inputs, - int32_t *start, - int32_t *stop, - int32_t *stride + vsi_ssize_t *start, + vsi_ssize_t *stop, + vsi_ssize_t *stride ) { - int32_t i = 0; - int32_t dims = (int32_t)inputs[0]->attr.dim_num; + vsi_ssize_t i = 0; + vsi_ssize_t dims = (vsi_ssize_t)inputs[0]->attr.dim_num; for (i = dims - 1; i >= 0; i --) { @@ -210,7 +210,7 @@ static vsi_bool _check_is_same_shape( for (i = 0; i < dims - 1; i++) { - if (stride[i] != 1 || start[i] != 0 || stop[i] != (int32_t)inputs[0]->attr.size[i]) + if (stride[i] != 1 || start[i] != 0 || stop[i] != (vsi_ssize_t)inputs[0]->attr.size[i]) return FALSE; } @@ -254,7 +254,14 @@ static vsi_status copy_tensor_to_view data = self->nn_param.strided_slice.lcl2_data; data->src_tensor = src_tensor; if (dst_in->t) + { +#ifdef VSI_40BIT_VA_SUPPORT + data->dst_tensor = vxReshapeTensor(dst_in->t, dst_in->attr.size, dst_in->attr.dim_num); +#else data->dst_tensor = vxReshapeTensor(dst_in->t, (int32_t*)dst_in->attr.size, dst_in->attr.dim_num); +#endif + } + data->is_dataconvert_op = TRUE; return ret; @@ -302,7 +309,7 @@ static vsi_status op_compute } else { - uint32_t sizes[VSI_NN_MAX_DIM_NUM] = {1}; + vsi_size_t sizes[VSI_NN_MAX_DIM_NUM] = {1}; uint32_t dims = inputs[0]->attr.dim_num; int32_t shrink_axis_mask = params->shrink_axis_mask; @@ -427,40 +434,53 @@ static vsi_bool op_check ) { BEGIN_IO_TYPE_DECL(STRIDED_SLICE, 1, 1) - IO_TYPE(D_F16, D_F16) - IO_TYPE(D_F16, D_I8|Q_DFP) - IO_TYPE(D_F16, D_I16|Q_DFP) - IO_TYPE(D_F16, D_U8|Q_ASYM) - IO_TYPE(D_I8|Q_DFP, D_F16) - IO_TYPE(D_I16|Q_DFP, D_F16) - IO_TYPE(D_U8|Q_ASYM, D_F16) - IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) - IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) - IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) - IO_TYPE(D_F16, D_I8) - IO_TYPE(D_F16, D_I16) - IO_TYPE(D_F16, D_U8) - IO_TYPE(D_I8, D_F16) - IO_TYPE(D_I16, D_F16) - IO_TYPE(D_U8, D_F16) - IO_TYPE(D_I8, D_I8) - IO_TYPE(D_I16, D_I16) - IO_TYPE(D_U8, D_U8) - IO_TYPE(D_F32, D_F32) - IO_TYPE(D_F32, D_BF16) - IO_TYPE(D_BF16, D_F32) - IO_TYPE(D_I32, D_I32) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I16|Q_DFP) + 
IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_BF16, D_BF16) + IO_TYPE(D_F32, D_BF16) + IO_TYPE(D_BF16, D_F32) + IO_TYPE(D_I32, D_I32) + IO_TYPE(D_BOOL8, D_BOOL8) + + /* HW 9.0.1 */ + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP) + + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_BF16) + IO_TYPE(D_I8|Q_DFP, D_F32) + + IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_BF16) + IO_TYPE(D_I16|Q_DFP, D_F32) + + IO_TYPE(D_F16, D_BF16) + IO_TYPE(D_F16, D_F32) - /* HW 9.0 */ - IO_TYPE(D_BF16, D_BF16) END_IO_TYPE_DECL(STRIDED_SLICE) - if(!VALIDATE_OP_IO_TYPES(STRIDED_SLICE, self, inputs, self->input.num, outputs, self->output.num)) { + if (!VALIDATE_OP_IO_TYPES(STRIDED_SLICE, self, inputs, self->input.num, outputs, self->output.num)) + { char* desc = generate_op_io_types_desc(inputs, self->input.num, outputs, self->output.num); VSILOGE("Inputs/Outputs data type not support: %s", desc); destroy_op_io_types_desc(desc); return FALSE; } + return TRUE; } /* op_check() */ @@ -561,11 +581,12 @@ static vsi_bool op_setup if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { int32_t idx = 0; + uint32_t shape[VSI_NN_MAX_DIM_NUM] = {0}; for (i = 0; i < inputs[0]->attr.dim_num; i++) { vx_int32 begin = 0, end = 1, stride = 1; - vx_int32 input_size = inputs[0]->attr.size[i]; + vx_int32 input_size = (int32_t)inputs[0]->attr.size[i]; vx_int32 output_size = 0; vx_int32 j; @@ -591,7 +612,7 @@ static vsi_bool op_setup { output_size++; } - outputs[0]->attr.size[i] = output_size; + shape[i] = output_size; } outputs[0]->attr.dim_num = 0; for (idx = 0, i = 0; i < inputs[0]->attr.dim_num + params->num_add_axis; i++) @@ -611,7 +632,7 @@ static vsi_bool op_setup } outputs[0]->attr.size[outputs[0]-> - attr.dim_num] = outputs[0]->attr.size[idx ++]; + attr.dim_num] = shape[idx ++]; outputs[0]->attr.dim_num++; } @@ -634,11 +655,11 @@ static vsi_status op_optimize int32_t i = 0; vx_tensor in_view_tensor = NULL; vsi_nn_strided_slice_param *p = &(self->nn_param.strided_slice); - uint32_t start[VSI_NN_MAX_DIM_NUM] = { 0 }; - uint32_t end[VSI_NN_MAX_DIM_NUM] = { 0 }; - int32_t *start_dims = p->lcl2_data->begin_dims; - int32_t *stop_dims = p->lcl2_data->end_dims; - int32_t *stride_dims = p->lcl2_data->stride_dims; + vsi_size_t start[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_size_t end[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_ssize_t start_dims[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_ssize_t stop_dims[VSI_NN_MAX_DIM_NUM] = { 0 }; + vsi_ssize_t stride_dims[VSI_NN_MAX_DIM_NUM] = { 0 }; vsi_bool is_same_quant_type = FALSE; /* Only forward run stride_slice's optimize */ @@ -647,6 +668,13 @@ static vsi_status op_optimize return status; } + for(i = 0; i< VSI_NN_MAX_DIM_NUM; i++) + { + start_dims[i] = p->lcl2_data->begin_dims[i]; + stop_dims[i] = p->lcl2_data->end_dims[i]; + stride_dims[i] = p->lcl2_data->stride_dims[i]; + } + if (_check_is_same_shape(inputs, start_dims, stop_dims, stride_dims) == FALSE) return status; @@ -658,8 +686,8 @@ static vsi_status op_optimize } /* Create tensor from view */ - memcpy( start, (uint32_t*)start_dims, sizeof( uint32_t ) * VSI_NN_MAX_DIM_NUM ); - memcpy( end, (uint32_t*)stop_dims, sizeof( uint32_t ) * 
VSI_NN_MAX_DIM_NUM ); + memcpy( start, (vsi_size_t*)start_dims, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM ); + memcpy( end, (vsi_size_t*)stop_dims, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM ); in_view_tensor = vsi_nn_CreateViewTensor(self->graph, start, end, inputs[0]); if( NULL == in_view_tensor ) { @@ -673,7 +701,7 @@ static vsi_status op_optimize is_same_quant_type = _is_same_quant(inputs, outputs); if( NULL != outputs[0]->t || is_same_quant_type == FALSE) { - VSILOGW( "stride slice copy tensor."); + VSILOGI( "stride slice copy tensor."); // Copy old tensor values to the new address. status = copy_tensor_to_view( self, in_view_tensor, outputs[0]); if( VSI_FAILURE == status ) @@ -867,4 +895,3 @@ DEF_OP_REG #ifdef __cplusplus } #endif - diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_svdf.c b/src/tim/vx/internal/src/ops/vsi_nn_op_svdf.c index 642975e..b8b4c1e 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_svdf.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_svdf.c @@ -121,7 +121,7 @@ static vsi_status op_compute if (param.bias == NULL) { vsi_nn_tensor_attr_t attr; - int32_t count = inputs[2]->attr.size[1]; + vsi_size_t count = inputs[2]->attr.size[1]; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); attr.size[0] = count; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_tensorstackconcat.c b/src/tim/vx/internal/src/ops/vsi_nn_op_tensorstackconcat.c index 5c346bb..3098b6c 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_tensorstackconcat.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_tensorstackconcat.c @@ -42,17 +42,17 @@ static vsi_bool _get_stackconcat_shape ( - const int32_t* shape_x, const int32_t rank_x, - const int32_t* shape_output, const int32_t rank_output, + const vsi_ssize_t* shape_x, const int32_t rank_x, + const vsi_ssize_t* shape_output, const int32_t rank_output, const int32_t axis, - int32_t* out_shape_0, uint32_t* out_rank_0, - int32_t* out_shape_1, uint32_t* out_rank_1, - int32_t* out_shape_output, uint32_t* out_rank_output + vsi_ssize_t* out_shape_0, uint32_t* out_rank_0, + vsi_ssize_t* out_shape_1, uint32_t* out_rank_1, + vsi_ssize_t* out_shape_output, uint32_t* out_rank_output ) { int32_t i = 0; - uint32_t innerSize = 1; - uint32_t outerSize = 1; + vsi_size_t innerSize = 1; + vsi_size_t outerSize = 1; for ( i = 0; i < rank_x; i++) { @@ -97,21 +97,21 @@ static vsi_status op_compute { vsi_status status = VSI_FAILURE; vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; - int32_t shape[3][VSI_NN_MAX_DIM_NUM] = { { 0 } }; + vsi_size_t shape[3][VSI_NN_MAX_DIM_NUM] = { { 0 } }; uint32_t rank[3] = {0}; _get_stackconcat_shape( - (int32_t *)inputs[0]->attr.size, inputs[0]->attr.dim_num, - (int32_t *)outputs[0]->attr.size, outputs[0]->attr.dim_num, + (vsi_ssize_t*)inputs[0]->attr.size, inputs[0]->attr.dim_num, + (vsi_ssize_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num, self->nn_param.tensorstackconcat.axis, - shape[0], &rank[0], shape[1], &rank[1], shape[2], &rank[2] ); + (vsi_ssize_t*)shape[0], &rank[0], (vsi_ssize_t*)shape[1], &rank[1], (vsi_ssize_t*)shape[2], &rank[2] ); reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, - inputs[0], (uint32_t*)shape[0], rank[0] ); + inputs[0], shape[0], rank[0] ); reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, - inputs[1], (uint32_t*)shape[1], rank[1] ); + inputs[1], shape[1], rank[1] ); reshape_tensors[2] = vsi_nn_reshape_tensor( self->graph, - outputs[0], (uint32_t*)shape[2], rank[2] ); + outputs[0], shape[2], rank[2] ); self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "tensorstackconcat", diff --git 
a/src/tim/vx/internal/src/ops/vsi_nn_op_unidirectional_sequence_rnn.c b/src/tim/vx/internal/src/ops/vsi_nn_op_unidirectional_sequence_rnn.c index 311e433..fad9cbc 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_unidirectional_sequence_rnn.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_unidirectional_sequence_rnn.c @@ -50,9 +50,9 @@ static vsi_bool setup_op_shapes &self->nn_param.unidirectional_sequence_rnn; vsi_nn_tensor_attr_t attr; vsi_nn_internal_tensor_t* output_tensor = NULL; - uint32_t num_units = 0; - uint32_t output_size = 0; - uint32_t batch_size = 0; + vsi_size_t num_units = 0; + vsi_size_t output_size = 0; + vsi_size_t batch_size = 0; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); @@ -145,8 +145,8 @@ static vsi_bool op_setup vsi_nn_tensor_t* tensor = NULL; vsi_nn_tensor_t* input_tensor = NULL; vsi_bool use_virtual_tensor = TRUE; - uint32_t batch_size = 0; - uint32_t time_step = 0; + vsi_size_t batch_size = 0; + vsi_size_t time_step = 0; uint32_t i = 0; memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t)); @@ -183,9 +183,9 @@ static vsi_bool op_setup sizeof(vsi_nn_tensor_t **)); memset( rnncell_reshape_output_tensors, 0x00, time_step * sizeof(vsi_nn_tensor_t **)); - vsi_nn_rnn_split_input_tensor(self, input_tensor, split_output_tensors, time_step, use_virtual_tensor); + vsi_nn_rnn_split_input_tensor(self, input_tensor, split_output_tensors, (uint32_t)time_step, use_virtual_tensor); - vsi_nn_rnn_data_check_aligned(self, split_output_tensors, time_step, use_virtual_tensor); + vsi_nn_rnn_data_check_aligned(self, split_output_tensors, (uint32_t)time_step, use_virtual_tensor); last_step_h_state = inputs[RNN_INPUT_H_STATE]; for( i = 0; i < time_step; i++ ) @@ -196,7 +196,7 @@ static vsi_bool op_setup /* reshape for split output */ output_tensor = vsi_nn_rnn_reshape_split_output(self, - split_output_tensors[i], batch_size, use_virtual_tensor); + split_output_tensors[i], (uint32_t)batch_size, use_virtual_tensor); reshape_output = output_tensor->t; /* rnncell output */ @@ -232,7 +232,7 @@ static vsi_bool op_setup /* reshape output to 3-dims */ output_tensor = vsi_nn_rnn_reshape_cell_output(self, - rnncell_out0, batch_size, use_virtual_tensor); + rnncell_out0, (uint32_t)batch_size, use_virtual_tensor); rnncell_reshape_output_tensors[i] = output_tensor->t; } @@ -247,7 +247,7 @@ static vsi_bool op_setup } /* concat rnncell output, the rnn's output is 3-dims */ - curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, time_step, 1 ); + curr = vsi_nn_internal_new_node( self, VSI_NN_OP_CONCAT, (uint32_t)time_step, 1 ); curr->node->nn_param.concat.axis = 2; for( i = 0; i < time_step; i++ ) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_unstack.c b/src/tim/vx/internal/src/ops/vsi_nn_op_unstack.c index 499cdd7..f62ac51 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_unstack.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_unstack.c @@ -86,10 +86,10 @@ static vsi_bool op_setup vsi_nn_internal_tensor_t* input_tensor = NULL; vsi_nn_internal_tensor_t** output_tensors = NULL; vsi_nn_internal_node_t* curr = NULL; - uint32_t* reshape_input_size = NULL; + vsi_size_t* reshape_input_size = NULL; uint32_t *slices = NULL; - uint32_t block_size = 1; - uint32_t block_num = 1; + vsi_size_t block_size = 1; + vsi_size_t block_num = 1; uint32_t axis = 0; uint32_t i, j; @@ -169,8 +169,8 @@ static vsi_bool op_setup input_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f ); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE, 0, 0 ); - reshape_input_size = (uint32_t *)vsi_nn_internal_new_node_param(curr, 
- VSI_NN_MAX_DIM_NUM * sizeof(uint32_t)); + reshape_input_size = (vsi_size_t*)vsi_nn_internal_new_node_param(curr, + VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); reshape_input_size[0] = block_size; reshape_input_size[1] = tensor_num; reshape_input_size[2] = block_num; @@ -201,12 +201,12 @@ static vsi_bool op_setup for (i = 0; i < tensor_num; i++) { - uint32_t* output_size = NULL; + vsi_size_t* output_size = NULL; - output_size = (uint32_t *)vsi_nn_internal_new_node_param(curr, - VSI_NN_MAX_DIM_NUM * sizeof(uint32_t)); + output_size = (vsi_size_t *)vsi_nn_internal_new_node_param(curr, + VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); - memcpy(output_size, outputs[i]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(uint32_t)); + memcpy(output_size, outputs[i]->attr.size, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE, 0, 0 ); curr->node->nn_param.reshape.size = output_size; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_upsample.c b/src/tim/vx/internal/src/ops/vsi_nn_op_upsample.c index dbe5ff8..34becd3 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_upsample.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_upsample.c @@ -46,14 +46,14 @@ static vsi_bool vsi_nn_upsample_optimize_shape ( vsi_nn_node_t * self, - const int32_t* shape_in0, const int32_t* shape_in1, - const int32_t* shape_out, const size_t rank_in, - int32_t* out_shape_input0, int32_t* out_shape_input1, - int32_t* out_shape_output, uint32_t* out_rank_output + const vsi_ssize_t* shape_in0, const vsi_ssize_t* shape_in1, + const vsi_ssize_t* shape_out, const size_t rank_in, + vsi_ssize_t* out_shape_input0, vsi_ssize_t* out_shape_input1, + vsi_ssize_t* out_shape_output, uint32_t* out_rank_output ) { vsi_bool enable_image_2d = FALSE; - int32_t hwLitimLen = 65536; + vsi_ssize_t hwLitimLen = 65536; if ((2 == self->nn_param.upsample.scale[0]) && (2 == self->nn_param.upsample.scale[1])) @@ -142,7 +142,7 @@ static vsi_status op_compute { vsi_status status = VSI_FAILURE; vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; - int32_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; + vsi_size_t shapes[3][VSI_NN_MAX_DIM_NUM] = {{ 1 }}; uint32_t new_rank = 0; vsi_bool ret; vsi_nn_kernel_param_t * param = NULL; @@ -157,9 +157,9 @@ static vsi_status op_compute param =vsi_nn_kernel_param_create(); ret = vsi_nn_upsample_optimize_shape(self, - (int32_t *)inputs[0]->attr.size, (int32_t *)inputs[1]->attr.size, - (int32_t *)outputs[0]->attr.size, inputs[0]->attr.dim_num, - shapes[0], shapes[1], shapes[2], &new_rank ); + (vsi_ssize_t*)inputs[0]->attr.size, (vsi_ssize_t*)inputs[1]->attr.size, + (vsi_ssize_t*)outputs[0]->attr.size, inputs[0]->attr.dim_num, + (vsi_ssize_t*)shapes[0], (vsi_ssize_t*)shapes[1], (vsi_ssize_t*)shapes[2], &new_rank ); vsi_nn_kernel_param_add_int32( param, "scale_x", scale_x ); vsi_nn_kernel_param_add_int32( param, "scale_y", scale_y ); @@ -168,11 +168,11 @@ static vsi_status op_compute { reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, - inputs[0], (uint32_t*)shapes[0], new_rank ); + inputs[0], shapes[0], new_rank ); reshape_tensors[1] = vsi_nn_reshape_tensor( self->graph, - inputs[1], (uint32_t*)shapes[1], new_rank ); + inputs[1], shapes[1], new_rank ); reshape_tensors[2] = vsi_nn_reshape_tensor( self->graph, - outputs[0], (uint32_t*)shapes[2], new_rank ); + outputs[0], shapes[2], new_rank ); self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "upsample", &reshape_tensors[0], _INPUT_NUM, &reshape_tensors[2], _OUTPUT_NUM, param ); @@ -238,8 +238,8 @@ static vsi_bool op_setup vsi_nn_tensor_t 
** outputs ) { - uint32_t h; - uint32_t w; + vsi_size_t h; + vsi_size_t w; if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { @@ -247,13 +247,13 @@ static vsi_bool op_setup h = self->nn_param.upsample.size[1]; if (0 == self->nn_param.upsample.scale[0]) { - self->nn_param.upsample.scale[0] = self->nn_param.upsample.size[0] / - inputs[0]->attr.size[0]; + self->nn_param.upsample.scale[0] = (uint32_t)(self->nn_param.upsample.size[0] / + inputs[0]->attr.size[0]); } if (0 == self->nn_param.upsample.scale[1]) { - self->nn_param.upsample.scale[1] = self->nn_param.upsample.size[1] / - inputs[0]->attr.size[1]; + self->nn_param.upsample.scale[1] = (uint32_t)(self->nn_param.upsample.size[1] / + inputs[0]->attr.size[1]); } if ( 0 == self->nn_param.upsample.size[0] ) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_upsamplescale.c b/src/tim/vx/internal/src/ops/vsi_nn_op_upsamplescale.c index c79c373..6bb9175 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_upsamplescale.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_upsamplescale.c @@ -190,8 +190,8 @@ static vsi_bool op_setup curr->node->nn_param.resize.type = VSI_NN_INTERPOLATION_NEAREST_NEIGHBOR; curr->node->nn_param.resize.align_corners = FALSE; curr->node->nn_param.resize.half_pixel_centers = FALSE; - curr->node->nn_param.resize.size[0] = inputs[0]->attr.size[0] * stride; - curr->node->nn_param.resize.size[1] = inputs[0]->attr.size[1] * stride; + curr->node->nn_param.resize.size[0] = (int32_t)(inputs[0]->attr.size[0] * stride); + curr->node->nn_param.resize.size[1] = (int32_t)(inputs[0]->attr.size[1] * stride); curr->inputs[0] = inputs[0]; curr->outputs[0] = outputs[0]; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_variable.c b/src/tim/vx/internal/src/ops/vsi_nn_op_variable.c index a0d5e6f..8879471 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_variable.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_variable.c @@ -101,7 +101,11 @@ static vsi_status op_optimize vsi_nn_DtypeCompare(&inputs[0]->attr.dtype, &outputs[0]->attr.dtype)) { VSILOGD("Optimize %s, uid %u", vsi_nn_OpGetName(self->op), self->uid); - outputs[0]->t = vxReshapeTensor(inputs[0]->t, (int32_t *)outputs[0]->attr.size, outputs[0]->attr.dim_num); +#ifdef VSI_40BIT_VA_SUPPORT + outputs[0]->t = vxReshapeTensor(inputs[0]->t, outputs[0]->attr.size, outputs[0]->attr.dim_num); +#else + outputs[0]->t = vxReshapeTensor(inputs[0]->t, (int32_t*)outputs[0]->attr.size, outputs[0]->attr.dim_num); +#endif if( NULL == outputs[0]->t ) { VSILOGE("Call vxReshapeTensor fail"); diff --git a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c index 470b1a3..2e043b6 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c @@ -79,13 +79,13 @@ static void _try_pack_tensor_data ( vsi_nn_graph_t * graph, vsi_nn_tensor_t * tensor, - uint64_t * p_ofst, - uint64_t * p_sz + vsi_size_t * p_ofst, + vsi_size_t * p_sz ) { long ofst; size_t cnt; - uint32_t bytes; + vsi_size_t bytes; uint8_t * data; if( NULL == s_dfile_hndl || NULL == tensor @@ -102,7 +102,7 @@ static void _try_pack_tensor_data } else { - *p_ofst = (uint64_t)ofst; + *p_ofst = (vsi_size_t)ofst; data = vsi_nn_ConvertTensorToData( graph, tensor ); bytes = vsi_nn_GetTensorSize( tensor->attr.size, tensor->attr.dim_num, tensor->attr.dtype.vx_type ); @@ -111,7 +111,7 @@ static void _try_pack_tensor_data cnt = fwrite( data, (size_t)bytes, 1, s_dfile_hndl ); if( cnt != 1 ) { - VSILOGW( "Write tensor bytes(%zu/%d)", cnt, 1 ); + 
VSILOGW( "Write tensor bytes(%"VSI_SIZE_T_SPECIFIER"/%d)", (vsi_size_t)cnt, 1 ); } if( cnt > 0 ) { @@ -430,6 +430,11 @@ static _op_param_gen_t s_op_gen[] = /* GROUPED_CONV1D */ NULL, /* SCATTER_ND_UPDATE */ NULL, /* GELU */ NULL, + /* CONV2D_LSTM */ NULL, + /* CONV2D_LSTM_CELL */ NULL, + /* GRU */ NULL, + /* GRUCELL */ NULL, + /* GRUCELL_ACTIVATION */ NULL, }; _compiler_assert( _cnt_of_array(s_op_gen) == VSI_NN_OP_NUM, vsi_nn_code_generator_c ); @@ -442,8 +447,8 @@ void vsi_nn_GenGraphCCode { uint32_t i; uint32_t j; - uint64_t sz; - uint64_t ofst; + vsi_size_t sz; + vsi_size_t ofst; vsi_nn_node_t * node; vsi_nn_node_id_t node_id ; vsi_nn_node_id_t * sorted_nodes; @@ -495,7 +500,7 @@ void vsi_nn_GenGraphCCode tensor_id ); if( sz > 0 ) { - _write_code( "load_data_to_tensor( tensor[%u], %llu, %llu );", + _write_code( "load_data_to_tensor( tensor[%u], %"VSI_SIZE_T_SPECIFIER", %"VSI_SIZE_T_SPECIFIER" );", tensor_id, ofst, sz ); } } diff --git a/src/tim/vx/internal/src/utils/vsi_nn_constraint_check.c b/src/tim/vx/internal/src/utils/vsi_nn_constraint_check.c index 36060ea..2f6aec6 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_constraint_check.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_constraint_check.c @@ -164,7 +164,6 @@ vsi_bool validate_op_io_types node_io_signature_t* sig = _get_op_signature(inputs, inputs_num, outputs, outputs_num, op_constraint_reg); - VSILOGD("Validate [%s]", name); if(sig && op_constraint_reg && op_constraint_reg->types) { for(i = 0; i < op_constraint_reg->io_types_item_count; i++) { const uint8_t* curr = ((const uint8_t*)op_constraint_reg->types) \ diff --git a/src/tim/vx/internal/src/utils/vsi_nn_dtype.c b/src/tim/vx/internal/src/utils/vsi_nn_dtype.c index e80ef51..384981b 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_dtype.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_dtype.c @@ -168,7 +168,7 @@ DEF_DTYPE_CONVERT_QUANTIZE( asymm8, uint8_t, vsi_rtne, 0, UCHAR_MAX ) vsi_bool vsi_nn_dtype_convert_float_to_quantize_symm8_perchannel ( const float * buffer, size_t size, - const int32_t * shape, size_t rank, + const vsi_size_t * shape, size_t rank, const float * scale, size_t scale_size, const int32_t * zero_point, size_t zero_point_size, int32_t channel_dim, @@ -186,7 +186,7 @@ vsi_bool vsi_nn_dtype_convert_float_to_quantize_symm8_perchannel vsi_bool vsi_nn_dtype_convert_quantize_symm8_perchannel_to_float ( const int8_t * buffer, size_t size, - const int32_t * shape, size_t rank, + const vsi_size_t * shape, size_t rank, const float * scale, size_t scale_size, const int32_t * zero_point, size_t zero_point_size, int32_t channel_dim, @@ -317,7 +317,7 @@ vsi_bool vsi_nn_dtype_convert_float_to_quantize_symm_perchannel ( const float * buffer, size_t size, vsi_nn_kernel_dtype_e dtype, - const int32_t * shape, size_t rank, + const vsi_size_t * shape, size_t rank, const float * scale, size_t scale_size, const int32_t * zero_point, size_t zero_point_size, int32_t channel_dim, @@ -460,7 +460,7 @@ vsi_bool vsi_nn_dtype_convert_quantize_symm_perchannel_to_float ( const void * buffer, size_t size, vsi_nn_kernel_dtype_e dtype, - const int32_t * shape, size_t rank, + const vsi_size_t * shape, size_t rank, const float * scale, size_t scale_size, const int32_t * zero_point, size_t zero_point_size, int32_t channel_dim, diff --git a/src/tim/vx/internal/src/utils/vsi_nn_dtype_util.c b/src/tim/vx/internal/src/utils/vsi_nn_dtype_util.c index 4b827d4..6144845 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_dtype_util.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_dtype_util.c @@ -138,7 
+138,6 @@ uint16_t vsi_nn_Fp32ToBFp16 return fp32_to_bfp16(in); } /* vsi_nn_Fp32ToFp16() */ - vsi_status vsi_nn_IntegerConvert ( const void * src, @@ -237,24 +236,24 @@ vsi_status vsi_nn_Float32ToDtype return float32_to_dtype(src, dst, dst_dtype); } /* vsi_nn_Float32ToDtype() */ -int32_t vsi_nn_DtypeConvertRawData +vsi_size_t vsi_nn_DtypeConvertRawData ( uint8_t * src, - int32_t src_bytes, + vsi_size_t src_bytes, const vsi_nn_dtype_t * src_dtype, uint8_t * dst, - int32_t dst_bytes, + vsi_size_t dst_bytes, const vsi_nn_dtype_t * dst_dtype ) { uint8_t * src_iter; uint8_t * dst_iter; - int32_t count; - int32_t elements; - int32_t src_type_bytes; - int32_t dst_type_bytes; - int32_t target_bytes; - int32_t i; + vsi_size_t count; + vsi_size_t elements; + vsi_size_t src_type_bytes; + vsi_size_t dst_type_bytes; + vsi_size_t target_bytes; + vsi_size_t i; vsi_status status; count = 0; if( NULL == src || NULL == dst || NULL == src_dtype ) @@ -264,11 +263,12 @@ int32_t vsi_nn_DtypeConvertRawData src_type_bytes = vsi_nn_TypeGetBytes( src_dtype->vx_type ); dst_type_bytes = vsi_nn_TypeGetBytes( dst_dtype->vx_type ); - elements = (int32_t)( src_bytes / src_type_bytes ); + elements = src_bytes / src_type_bytes; target_bytes = dst_type_bytes * elements; if( dst_bytes < target_bytes ) { - VSILOGW("Wrong dest buffer size: %d, require: %d", dst_bytes, target_bytes); + VSILOGW("Wrong dest buffer size: %"VSI_SIZE_T_SPECIFIER", require: %"VSI_SIZE_T_SPECIFIER"", + dst_bytes, target_bytes); return count; } src_iter = src; @@ -287,13 +287,13 @@ int32_t vsi_nn_DtypeConvertRawData return count; } /* vsi_nn_DtypeConvertRawData() */ -int32_t vsi_nn_DtypeConvertRawDataToFloat32 +vsi_size_t vsi_nn_DtypeConvertRawDataToFloat32 ( uint8_t * src, - int32_t src_bytes, + vsi_size_t src_bytes, const vsi_nn_dtype_t * src_dtype, float * dst, - int32_t dst_size + vsi_size_t dst_size ) { vsi_nn_dtype_t dst_dtype; @@ -304,12 +304,12 @@ int32_t vsi_nn_DtypeConvertRawDataToFloat32 (uint8_t *)dst, dst_size * sizeof( float ), &dst_dtype ); } /*vsi_nn_DtypeConvertRawDataToFloat32()*/ -int32_t vsi_nn_DtypeConvertFloat32ToRawData +vsi_size_t vsi_nn_DtypeConvertFloat32ToRawData ( float * src, - int32_t src_size, + vsi_size_t src_size, uint8_t * dst, - int32_t dst_bytes, + vsi_size_t dst_bytes, const vsi_nn_dtype_t * dst_dtype ) { @@ -357,8 +357,9 @@ vsi_bool vsi_nn_QuantCheck weight_qnt_type = weight->attr.dtype.qnt_type; weight_dtype = weight->attr.dtype.vx_type; - //do not check quant parammeters if types of input/weight is hybrid combinaton - if(input_dtype != weight_dtype || input_qnt_type != weight_qnt_type) + //do not check quant parameters if the types of input/weight/bias are a hybrid combination + if( input_dtype != weight_dtype || input_qnt_type != weight_qnt_type || + (bias && bias->attr.dtype.qnt_type != input_qnt_type) ) { return ret; } @@ -458,12 +459,13 @@ vsi_status vsi_nn_vxConvertTensorToFloat32Data vx_tensor tensor, vsi_nn_tensor_attr_t *attr, float *f32_data, - uint32_t f32_data_sz + vsi_size_t f32_data_sz ) { vsi_status status; uint8_t *data; - uint32_t elements,stride; + vsi_size_t elements; + uint32_t stride; vsi_nn_tensor_attr_t tensor_attr, *_attr; data = NULL; @@ -488,7 +490,8 @@ vsi_status vsi_nn_vxConvertTensorToFloat32Data stride = vsi_nn_TypeGetBytes(_attr->dtype.vx_type); if(f32_data_sz != elements * sizeof(float)) { - VSILOGE("buffer sz %u != required sz %u", f32_data_sz, elements * sizeof(float)); + VSILOGE("buffer sz %"VSI_SIZE_T_SPECIFIER" != required sz %"VSI_SIZE_T_SPECIFIER"", + f32_data_sz, elements * 
sizeof(float)); return status; } data = vsi_nn_vxCopyTensorToData(context, tensor, _attr); @@ -516,12 +519,13 @@ vsi_status vsi_nn_vxConvertFloat32DataToTensor vx_tensor tensor, vsi_nn_tensor_attr_t *attr, float *f32_data, - uint32_t f32_data_sz + vsi_size_t f32_data_sz ) { vsi_status status; uint8_t *data; - uint32_t elements,stride; + vsi_size_t elements; + uint32_t stride; vsi_nn_tensor_attr_t tensor_attr, *_attr; data = NULL; @@ -546,7 +550,8 @@ vsi_status vsi_nn_vxConvertFloat32DataToTensor stride = vsi_nn_GetTypeBytes(_attr->dtype.vx_type); if(f32_data_sz != elements * sizeof(float)) { - VSILOGE("buffer sz %u != required sz %u", f32_data_sz, elements * sizeof(float)); + VSILOGE("buffer sz %"VSI_SIZE_T_SPECIFIER" != required sz %"VSI_SIZE_T_SPECIFIER"", + f32_data_sz, elements * sizeof(float)); return status; } @@ -568,4 +573,3 @@ final: } return status; } /* vsi_nn_vxConvertFloat32DataToTensor() */ - diff --git a/src/tim/vx/internal/src/utils/vsi_nn_math.c b/src/tim/vx/internal/src/utils/vsi_nn_math.c index 19350a8..65878b6 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_math.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_math.c @@ -33,20 +33,20 @@ static void _compute_stride ( - uint32_t * shape, - uint32_t dim_num, - uint32_t * stride + vsi_size_t * shape, + vsi_size_t dim_num, + vsi_size_t * stride ); static void _compute_stride ( - uint32_t * shape, - uint32_t dim_num, - uint32_t * stride + vsi_size_t * shape, + vsi_size_t dim_num, + vsi_size_t * stride ) { - int i; - uint32_t s; + vsi_ssize_t i; + vsi_size_t s; s = 1; for( i = dim_num - 1; i >= 0; i -- ) { @@ -59,21 +59,21 @@ void vsi_nn_Transpose ( uint8_t * dst, uint8_t * data, - uint32_t * shape, - uint32_t dim_num, - uint32_t * perm, + vsi_size_t * shape, + vsi_size_t dim_num, + vsi_size_t * perm, vsi_nn_type_e type ) { - uint32_t i; - uint32_t i_dst; - uint32_t i_org; - uint32_t i_t; - uint32_t size; + vsi_size_t i; + vsi_size_t i_dst; + vsi_size_t i_org; + vsi_size_t i_t; + vsi_size_t size; uint32_t unit_bytes; - uint32_t org_stride[VSI_NN_MAX_DIM_NUM]; - uint32_t dst_stride[VSI_NN_MAX_DIM_NUM]; - uint32_t dst_shape[VSI_NN_MAX_DIM_NUM]; + vsi_size_t org_stride[VSI_NN_MAX_DIM_NUM]; + vsi_size_t dst_stride[VSI_NN_MAX_DIM_NUM]; + vsi_size_t dst_shape[VSI_NN_MAX_DIM_NUM]; if( NULL == data || NULL == dst || NULL == shape || NULL == perm || 0 == dim_num || dim_num > VSI_NN_MAX_DIM_NUM ) @@ -116,24 +116,24 @@ void vsi_nn_Permute ( uint8_t * dst, uint8_t * data, - uint32_t * shape, - uint32_t dim_num, - uint32_t * perm, + vsi_size_t * shape, + vsi_size_t dim_num, + vsi_size_t * perm, vsi_nn_type_e type ) { uint32_t unit_bytes, i; - uint32_t org_stride[VSI_NN_MAX_DIM_NUM] = {0}; - uint32_t dst_stride[VSI_NN_MAX_DIM_NUM] = {0}; - uint32_t dst_shape[VSI_NN_MAX_DIM_NUM] = {0}; - uint32_t dim_stack[VSI_NN_MAX_DIM_NUM] = {0}; + vsi_size_t org_stride[VSI_NN_MAX_DIM_NUM] = {0}; + vsi_size_t dst_stride[VSI_NN_MAX_DIM_NUM] = {0}; + vsi_size_t dst_shape[VSI_NN_MAX_DIM_NUM] = {0}; + vsi_size_t dim_stack[VSI_NN_MAX_DIM_NUM] = {0}; uint8_t * in_addr_stack[VSI_NN_MAX_DIM_NUM] = {NULL}; uint8_t * out_addr_stack[VSI_NN_MAX_DIM_NUM] = {NULL}; uint8_t * in_addr_tmp = NULL; uint8_t * out_addr_tmp = NULL; uint32_t current = 0; vsi_bool back = FALSE; - uint32_t layer = dim_num - 1; + vsi_size_t layer = dim_num - 1; if( NULL == data || NULL == dst || NULL == shape || NULL == perm || 0 == dim_num || dim_num > VSI_NN_MAX_DIM_NUM ) @@ -215,14 +215,14 @@ void vsi_nn_Permute void vsi_nn_SqueezeShape ( - uint32_t * shape, - uint32_t * dim_num + vsi_size_t * 
shape, + vsi_size_t * dim_num ) { - int i; - int origin_count; - int count; - int start; + vsi_size_t i; + vsi_size_t origin_count; + vsi_size_t count; + vsi_size_t start; count = *dim_num; origin_count = count; if( 1 == count ) @@ -238,7 +238,7 @@ void vsi_nn_SqueezeShape } else if( i > start ) { - memmove( &shape[start], &shape[i], (count - i) * sizeof( uint32_t ) ); + memmove( &shape[start], &shape[i], (count - i) * sizeof(vsi_size_t) ); count -= i - start; start += i - start; } @@ -248,17 +248,17 @@ void vsi_nn_SqueezeShape } } *dim_num = count; - memset( &shape[count], 0, sizeof( uint32_t ) * ( origin_count - count ) ); + memset( &shape[count], 0, sizeof(vsi_size_t) * ( origin_count - count ) ); } /* vsi_nn_SqueezeShape() */ -uint32_t vsi_nn_ShapeProduct +vsi_size_t vsi_nn_ShapeProduct ( - uint32_t * shape, - uint32_t dim_num + vsi_size_t * shape, + vsi_size_t dim_num ) { uint32_t i; - uint32_t res; + vsi_size_t res; res = 1; for ( i = 0; i < dim_num; i++ ) { @@ -269,12 +269,12 @@ uint32_t vsi_nn_ShapeProduct void vsi_nn_InvertShape ( - uint32_t * in, - uint32_t dim_num, - uint32_t * out + vsi_size_t * in, + vsi_size_t dim_num, + vsi_size_t * out ) { - uint32_t i; + vsi_size_t i; for ( i = 0; i < dim_num; i++ ) { out[i] = in[dim_num - 1 - i]; @@ -283,12 +283,12 @@ void vsi_nn_InvertShape void vsi_nn_InvertPermuteShape ( - uint32_t * in, - uint32_t dim_num, - uint32_t * out + vsi_size_t * in, + vsi_size_t dim_num, + vsi_size_t * out ) { - uint32_t i; + vsi_size_t i; for ( i = 0; i < dim_num; i++ ) { out[in[i]] = i; diff --git a/src/tim/vx/internal/src/utils/vsi_nn_shape_util.c b/src/tim/vx/internal/src/utils/vsi_nn_shape_util.c index 6a4be8d..bd327ec 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_shape_util.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_shape_util.c @@ -25,12 +25,13 @@ #include #include "vsi_nn_log.h" #include "utils/vsi_nn_shape_util.h" +#include "vsi_nn_types.h" void vsi_nn_shape_get_stride ( - const int32_t * shape, - size_t rank, - size_t * out_stride + const vsi_size_t * shape, + vsi_size_t rank, + vsi_size_t * out_stride ) { uint32_t i; @@ -46,13 +47,13 @@ void vsi_nn_shape_get_stride } } /* vsi_nn_shape_get_stride() */ -size_t vsi_nn_shape_get_size +vsi_size_t vsi_nn_shape_get_size ( - const int32_t * shape, - size_t rank + const vsi_size_t * shape, + vsi_size_t rank ) { - size_t size = 0; + vsi_size_t size = 0; uint32_t i; if( !shape ) { diff --git a/src/tim/vx/internal/src/utils/vsi_nn_tensor_op.c b/src/tim/vx/internal/src/utils/vsi_nn_tensor_op.c index febd192..48babb3 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_tensor_op.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_tensor_op.c @@ -37,13 +37,13 @@ static void _compute_stride ( - uint32_t * shape, - uint32_t dim_num, - uint32_t * stride + vsi_size_t * shape, + vsi_size_t dim_num, + vsi_size_t * stride ) { - uint32_t i; - uint32_t s; + vsi_size_t i; + vsi_size_t s; s = 1; for( i = 0; i < dim_num; i ++ ) { @@ -64,11 +64,12 @@ vsi_nn_tensor_t* vsi_nn_Concat int32_t k; uint8_t* buffer = NULL; uint8_t* tmp = NULL; - size_t total_bytes = 0; - size_t tensor_size = 0; - size_t offset = 0, src = 0, dst = 0; - uint32_t* strides = NULL; - uint32_t* dst_strides = NULL; + vsi_size_t total_bytes = 0; + vsi_size_t tensor_size = 0; + vsi_size_t m; + vsi_size_t offset = 0, src = 0, dst = 0; + vsi_size_t* strides = NULL; + vsi_size_t* dst_strides = NULL; uint32_t type_bytes = 0; vsi_nn_tensor_attr_t output_attr; vsi_nn_tensor_t* tensor_out = NULL; @@ -87,7 +88,7 @@ vsi_nn_tensor_t* vsi_nn_Concat } memset( &output_attr, 
0, sizeof(vsi_nn_tensor_attr_t) ); memcpy( &output_attr.dtype, &tensors[0]->attr.dtype, sizeof(vsi_nn_dtype_t) ); - memcpy( output_attr.size, tensors[0]->attr.size, sizeof(uint32_t) * VSI_NN_MAX_DIM_NUM ); + memcpy( output_attr.size, tensors[0]->attr.size, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM ); output_attr.dim_num = tensors[0]->attr.dim_num; for( i = 1; i < tensor_num; i ++ ) @@ -116,8 +117,8 @@ vsi_nn_tensor_t* vsi_nn_Concat total_bytes = vsi_nn_GetTensorSize( output_attr.size, output_attr.dim_num, output_attr.dtype.vx_type ); buffer = (uint8_t*)malloc( total_bytes ); - strides = (uint32_t*)malloc( sizeof(uint32_t) * tensors[0]->attr.dim_num ); - dst_strides = (uint32_t*)malloc( sizeof(uint32_t) * tensors[0]->attr.dim_num ); + strides = (vsi_size_t*)malloc( sizeof(vsi_size_t) * tensors[0]->attr.dim_num ); + dst_strides = (vsi_size_t*)malloc( sizeof(vsi_size_t) * tensors[0]->attr.dim_num ); if (!buffer || !strides || !dst_strides) { VSILOGW("Out of memroy."); @@ -136,9 +137,9 @@ vsi_nn_tensor_t* vsi_nn_Concat goto concat_error; } _compute_stride(tensors[i]->attr.size, tensors[i]->attr.dim_num, strides); - for( j = 0; j < tensor_size; j ++ ) + for( m = 0; m < tensor_size; m ++ ) { - src = j; + src = m; dst = 0; for( k = tensors[0]->attr.dim_num - 1; k >= 0; k -- ) { @@ -146,7 +147,7 @@ vsi_nn_tensor_t* vsi_nn_Concat src %= strides[k]; } dst += offset; - src = j; + src = m; memcpy( &buffer[dst * type_bytes], &tmp[src * type_bytes], type_bytes ); } vsi_nn_safe_free( tmp ); @@ -178,10 +179,11 @@ vsi_nn_tensor_t* vsi_nn_ConvertTensorDtype ) { vsi_status status = VSI_SUCCESS; - uint32_t i = 0, src_stride = 0, dst_stride = 0; - uint32_t sz = 0; + vsi_size_t i = 0; + uint32_t src_stride = 0, dst_stride = 0; + vsi_size_t sz = 0; uint8_t* src_buf = NULL; - uint32_t dst_buf_sz = 0; + vsi_size_t dst_buf_sz = 0; uint8_t* dst_buf = NULL; vsi_nn_tensor_attr_t dst_attr; vsi_nn_tensor_t* dst_tensor = NULL; diff --git a/src/tim/vx/internal/src/utils/vsi_nn_util.c b/src/tim/vx/internal/src/utils/vsi_nn_util.c index 9ac442d..4b6fded 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_util.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_util.c @@ -103,10 +103,10 @@ _compiler_assert(VX_STATUS_MIN == -25, VX_STATUS_VALUE_CHANGED); static const int16_t vx_status_desc_cnt = _cnt_of_array( vx_status_desc ); -static uint32_t _compute_stride_rounding +static vsi_size_t _compute_stride_rounding ( - uint32_t out, - uint32_t stride, + vsi_size_t out, + vsi_size_t stride, vsi_nn_round_type_e rounding ) { @@ -121,17 +121,17 @@ static uint32_t _compute_stride_rounding return out; } -static uint32_t _compute_padding +static vsi_size_t _compute_padding ( - uint32_t in_size, - uint32_t ksize, - uint32_t stride, - uint32_t dilation_rate, - uint32_t out_size + vsi_size_t in_size, + vsi_size_t ksize, + vsi_size_t stride, + vsi_size_t dilation_rate, + vsi_size_t out_size ) { - uint32_t effective_ksize; - int32_t padding; + vsi_size_t effective_ksize; + vsi_ssize_t padding; effective_ksize = (ksize - 1) * dilation_rate + 1; padding = (out_size - 1) * stride + effective_ksize - in_size; return vsi_nn_max(padding, 0); @@ -140,12 +140,12 @@ static uint32_t _compute_padding uint8_t * vsi_nn_LoadBinaryData ( const char * filename, - uint32_t * sz + vsi_size_t * sz ) { uint8_t * data; - uint32_t fsize; - size_t cnt; + vsi_size_t fsize; + vsi_size_t cnt; FILE * fp; fp = fopen( filename, "rb" ); @@ -154,7 +154,7 @@ uint8_t * vsi_nn_LoadBinaryData return NULL; } fseek( fp, 0L, SEEK_END ); - fsize = (uint32_t)ftell( fp ); + fsize = 
(vsi_size_t)ftell( fp ); fseek( fp, 0L, SEEK_SET ); data = (uint8_t *)malloc( fsize ); cnt = 0; @@ -164,9 +164,9 @@ uint8_t * vsi_nn_LoadBinaryData } else { - while( (uint32_t)cnt < fsize ) + while( cnt < fsize ) { - cnt += fread( &data[cnt], 1, fsize, fp ); + cnt += (vsi_size_t)fread( &data[cnt], 1, fsize, fp ); if( cnt == 0 ) { break; @@ -177,15 +177,15 @@ uint8_t * vsi_nn_LoadBinaryData fclose( fp ); if( NULL != sz ) { - *sz = (uint32_t)cnt; + *sz = cnt; } return data; } /* vsi_nn_LoadBinaryData() */ -uint32_t vsi_nn_GetStrideSize +vsi_size_t vsi_nn_GetStrideSize ( vsi_nn_tensor_attr_t * attr, - uint32_t * stride + vsi_size_t * stride ) { @@ -197,16 +197,16 @@ uint32_t vsi_nn_GetStrideSize return vsi_nn_GetStrideSizeBySize(attr->size, attr->dim_num, attr->dtype.vx_type, stride); } /* vsi_nn_GetStrideSize() */ -uint32_t vsi_nn_GetStrideSizeBySize +vsi_size_t vsi_nn_GetStrideSizeBySize ( - uint32_t * size, - uint32_t dim_num, + vsi_size_t * size, + vsi_size_t dim_num, vsi_nn_type_e type, - uint32_t * stride + vsi_size_t * stride ) { - uint32_t total_bytes; - uint32_t i; + vsi_size_t total_bytes; + vsi_size_t i; if( NULL == size || NULL == stride ) { @@ -228,10 +228,10 @@ uint32_t vsi_nn_GetStrideSizeBySize return total_bytes; } /* vsi_nn_GetStrideSizeBySize() */ -uint32_t vsi_nn_GetTotalBytesBySize +vsi_size_t vsi_nn_GetTotalBytesBySize ( - uint32_t * size, - uint32_t dim_num, + vsi_size_t * size, + vsi_size_t dim_num, vsi_nn_type_e type ) { @@ -328,17 +328,17 @@ void vsi_nn_UpdateTensorDims } /* vsi_nn_UpdateTensorDims() */ -uint32_t vsi_nn_ComputeFilterSize +vsi_size_t vsi_nn_ComputeFilterSize ( - uint32_t i_size, - uint32_t ksize, + vsi_size_t i_size, + vsi_size_t ksize, uint32_t * pad, uint32_t stride, uint32_t dilation, vsi_nn_round_type_e rounding ) { - uint32_t out; + vsi_size_t out; if( 0 == stride ) { if (i_size == ksize) { @@ -358,16 +358,16 @@ uint32_t vsi_nn_ComputeFilterSize return out; } /* vsi_nn_ComputeFilterSize() */ -uint32_t vsi_nn_compute_filter_shape +vsi_size_t vsi_nn_compute_filter_shape ( vsi_nn_pad_e padding_type, - uint32_t image_size, - uint32_t ksize, + vsi_size_t image_size, + vsi_size_t ksize, uint32_t stride, uint32_t dilation_rate ) { - uint32_t effective_ksize; + vsi_size_t effective_ksize; effective_ksize = (ksize - 1) * dilation_rate + 1; switch (padding_type) { @@ -382,16 +382,16 @@ uint32_t vsi_nn_compute_filter_shape void vsi_nn_compute_padding ( - uint32_t * in_shape, - uint32_t * ksize, + vsi_size_t * in_shape, + vsi_size_t * ksize, uint32_t * stride, uint32_t * dilation, vsi_nn_pad_e pad_type, - uint32_t * out_pad + vsi_size_t * out_pad ) { - uint32_t out_w, out_h; - uint32_t pad_w, pad_h; + vsi_size_t out_w, out_h; + vsi_size_t pad_w, pad_h; uint32_t dilation_w, dilation_h; if (NULL == in_shape || NULL == ksize || NULL == stride || NULL == out_pad) @@ -425,13 +425,13 @@ void vsi_nn_compute_padding void vsi_nn_ComputePadWithPadType ( - uint32_t * in_shape, + vsi_size_t * in_shape, uint32_t in_dim_num, - uint32_t * ksize, + vsi_size_t * ksize, uint32_t * stride, vsi_nn_pad_e pad_type, vsi_nn_round_type_e rounding, - uint32_t * out_pad + vsi_size_t * out_pad ) { vsi_nn_compute_padding(in_shape, ksize, stride, NULL, pad_type, out_pad); @@ -439,16 +439,16 @@ void vsi_nn_ComputePadWithPadType void vsi_nn_compute_padding_conv1d ( - uint32_t * in_shape, - uint32_t * ksize, + vsi_size_t * in_shape, + vsi_size_t * ksize, uint32_t * stride, uint32_t * dilation, vsi_nn_pad_e pad_type, - uint32_t * out_pad + vsi_size_t * out_pad ) { - uint32_t out_h; - 
uint32_t pad_h; + vsi_size_t out_h; + vsi_size_t pad_h; uint32_t dilation_h; if (NULL == in_shape || NULL == ksize || NULL == stride || NULL == out_pad) @@ -476,13 +476,13 @@ void vsi_nn_compute_padding_conv1d void vsi_nn_ComputePadWithPadTypeForConv1D ( - uint32_t * in_shape, + vsi_size_t * in_shape, uint32_t in_dim_num, - uint32_t * ksize, + vsi_size_t * ksize, uint32_t * stride, vsi_nn_pad_e pad_type, vsi_nn_round_type_e rounding, - uint32_t * out_pad + vsi_size_t * out_pad ) { vsi_nn_compute_padding_conv1d(in_shape, ksize, stride, NULL, pad_type, out_pad); @@ -536,10 +536,10 @@ vsi_bool vsi_nn_CreateTensorGroup ) { vsi_bool ret; - uint32_t sz; + vsi_size_t sz; uint32_t i; - uint32_t start[VSI_NN_MAX_DIM_NUM]; - uint32_t end[VSI_NN_MAX_DIM_NUM]; + vsi_size_t start[VSI_NN_MAX_DIM_NUM]; + vsi_size_t end[VSI_NN_MAX_DIM_NUM]; vsi_nn_tensor_attr_t attr; if( NULL == graph || NULL == in_tensor @@ -561,7 +561,7 @@ vsi_bool vsi_nn_CreateTensorGroup memcpy( &attr, &in_tensor->attr, sizeof( attr ) ); attr.size[axis] = sz; - memset( start, 0, sizeof( uint32_t ) * VSI_NN_MAX_DIM_NUM ); + memset( start, 0, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM ); end[0] = in_tensor->attr.size[0]; end[1] = in_tensor->attr.size[1]; end[2] = in_tensor->attr.size[2]; @@ -576,9 +576,9 @@ vsi_bool vsi_nn_CreateTensorGroup if ( attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_SYMMETRIC ) { attr.dtype.scales = in_tensor->attr.dtype.scales + sz * i; - attr.dtype.scale_dim = sz; + attr.dtype.scale_dim = (int32_t)sz; attr.dtype.zero_points = in_tensor->attr.dtype.zero_points + sz * i; - attr.dtype.zero_points_dim = sz; + attr.dtype.zero_points_dim = (int32_t)sz; } #endif out_tensors[i] = vsi_nn_CreateTensor( graph, &attr ); @@ -605,8 +605,8 @@ vsi_bool vsi_nn_CreateTensorGroup uint32_t vsi_nn_ShapeToString ( - uint32_t * shape, - uint32_t dim_num, + vsi_size_t * shape, + vsi_size_t dim_num, char * buf, uint32_t buf_sz, vsi_bool for_print @@ -614,7 +614,7 @@ uint32_t vsi_nn_ShapeToString { #define _PRINT_FMT (0) #define _NOT_PRINT_FMT (1) - uint32_t s; + vsi_size_t s; uint32_t count; const char * all_fmt[] = {" %d,", "%d_" }; const char * fmt; @@ -710,26 +710,35 @@ vsi_bool vsi_nn_CheckFilePath return FALSE; } /* vsi_nn_CheckFilePath() */ +/* + * AlignedBuffer is laid out as below: + * | margin start at raw_addr | aligned_header | begin_guard | + *   data start at align_addr | end_guard | +*/ +#define BEGIN_GUARD_SIZE 64 +#define END_GUARD_SIZE 64 typedef struct { uint8_t* raw_addr; + uint8_t begin_guard[BEGIN_GUARD_SIZE]; } aligned_header; uint8_t * vsi_nn_MallocAlignedBuffer ( - uint32_t mem_size, - uint32_t align_start_size, - uint32_t align_block_size + vsi_size_t mem_size, + vsi_size_t align_start_size, + vsi_size_t align_block_size ) { - uint32_t sz; + vsi_size_t sz; uintptr_t temp; uint8_t* raw_addr; uint8_t* p; uint8_t* align_addr; aligned_header* header; - sz = sizeof(aligned_header) + mem_size + align_start_size + align_block_size; + sz = sizeof(aligned_header) + mem_size + + align_start_size + align_block_size + END_GUARD_SIZE; raw_addr = (uint8_t *)malloc( sz * sizeof( uint8_t ) ); memset(raw_addr, 0, sizeof( uint8_t ) * sz); p = raw_addr + sizeof(aligned_header); @@ -761,7 +770,7 @@ void vsi_nn_FreeAlignedBuffer vsi_bool vsi_nn_IsBufferAligned ( uint8_t * buf, - uint32_t align_start_size + vsi_size_t align_start_size ) { uintptr_t temp; @@ -778,7 +787,7 @@ void vsi_nn_FormatToString ( vsi_nn_tensor_t *tensor, char *buf, - uint32_t buf_sz + vsi_size_t buf_sz ) { switch(tensor->attr.dtype.vx_type) @@ -858,250 
+867,7 @@ int32_t vsi_nn_partition return low; } - -/* Greatest Common Divisor*/ -static vsi_bool vsi_nn_GetDataDivisors - ( - vx_uint32 input_value, - vx_uint32 *divisors, - vx_uint32 gcd - ) -{ - vx_uint32 i = 0; -#define VSI_NN_MAX_IMAGE_WIDTH (65536) - for (i = vsi_nn_min(input_value, VSI_NN_MAX_IMAGE_WIDTH - 1); i > 0; i --) - { - if ((i % gcd == 0) && (input_value % i == 0)) - { - *divisors = i; - - return TRUE; - } - } -#undef VSI_NN_MAX_IMAGE_WIDTH - return FALSE; -} - -void vsi_nn_OptimizedEltOPShape - ( - vsi_nn_tensor_t * input, - uint32_t sizes[VSI_NN_MAX_DIM_NUM], - uint32_t * num_of_dims - ) -{ - uint32_t element_count = 0; - uint32_t i = 0; -#define VSI_NN_MAX_IMAGE_WIDTH (65536) - element_count = vsi_nn_GetElementNum(input); - - for (i = 0; i < VSI_NN_MAX_DIM_NUM; i++) - { - sizes[i] = 1; - } - - if (element_count < VSI_NN_MAX_IMAGE_WIDTH) - { - sizes[0] = element_count; - *num_of_dims = 2; - } - else - { - vx_uint32 divisors = 1; - for (i = 0; i < 2; i++) - { - divisors = 1; - vsi_nn_GetDataDivisors(element_count, &divisors, 1); - if (1 == divisors) - { - divisors = element_count; - } - sizes[i] = divisors; - element_count = element_count / divisors; - } - - sizes[2] = element_count; - if (1 == sizes[2]) - { - *num_of_dims = 2; - } - else - { - *num_of_dims = 3; - } - } -#undef VSI_NN_MAX_IMAGE_WIDTH -} - -vsi_bool vsi_nn_OptimizedEltWiseOPShape - ( - vsi_nn_tensor_t * input0, - vsi_nn_tensor_t * input1, - vsi_nn_tensor_t * output, - uint32_t sizes0[VSI_NN_MAX_DIM_NUM], - uint32_t sizes1[VSI_NN_MAX_DIM_NUM], - uint32_t sizes2[VSI_NN_MAX_DIM_NUM], - uint32_t * dim_num - ) -{ - vsi_bool status = TRUE; - uint32_t i = 0; - uint32_t cnt = 0; - uint32_t dims = 0; - uint32_t element_count0 = 0; - uint32_t element_count1 = 0; - vsi_bool enable_broadcast = FALSE; - vsi_bool enable_broadcast1 = FALSE; - uint32_t broadcast_Bits = 0; - - element_count0 = vsi_nn_GetElementNum(input0); - element_count1 = vsi_nn_GetElementNum(input1); - - if (element_count0 == 1 || element_count1 == 1) - { - enable_broadcast1 = TRUE; - } - - /*************step 1:init tensor shape*****************/ - for (i = 0; i < VSI_NN_MAX_DIM_NUM; i++) - { - sizes0[i] = 1; - sizes1[i] = 1; - sizes2[i] = 1; - } - - /*************step 2:squeeze tensor shape*****************/ - for (i = 0; i < output->attr.dim_num; i++) - { - uint32_t sz0 = input0->attr.dim_num > i ? input0->attr.size[i] : 1; - uint32_t sz1 = input1->attr.dim_num > i ? input1->attr.size[i] : 1; - uint32_t sz2 = output->attr.dim_num > i ? 
output->attr.size[i] : 1; - - if (sz0 == sz1 && sz0 == 1) - { - continue; - } - else - { - sizes0[cnt] = sz0; - sizes1[cnt] = sz1; - sizes2[cnt] = sz2; - - cnt ++; - dims ++; - } - } - - for (i = 0; i < dims; i++) - { - uint32_t sz0 = sizes0[i]; - uint32_t sz1 = sizes1[i]; - - if (sz0 != sz1) - { - enable_broadcast = TRUE; - broadcast_Bits |= (1 << i); - } - } - - /*************step 3:reshape tensor shape*****************/ - if (enable_broadcast == FALSE || enable_broadcast1) - { - vsi_nn_OptimizedEltOPShape(input0, sizes0, &dims); - vsi_nn_OptimizedEltOPShape(input1, sizes1, &dims); - vsi_nn_OptimizedEltOPShape(output, sizes2, &dims); - } - else - { -#define VSI_NN_MAX_IMAGE_WIDTH (65536) - switch (broadcast_Bits) - { - case VSI_NN_BROAD_CAST_BITS_0: - { - vx_uint32 element_count = 1; - vx_uint32 divisors = 1; - - for (i = 1; i < dims; i++) - { - element_count *= sizes0[i]; - } - - divisors = 1; - vsi_nn_GetDataDivisors(element_count, &divisors, 1); - - sizes0[1] = divisors; - sizes1[1] = divisors; - sizes2[1] = divisors; - sizes0[2] = element_count / divisors; - sizes1[2] = element_count / divisors; - sizes2[2] = element_count / divisors; - dims = 3; - - break; - } - case VSI_NN_BROAD_CAST_BITS_0 | VSI_NN_BROAD_CAST_BITS_1: - case VSI_NN_BROAD_CAST_BITS_0 | VSI_NN_BROAD_CAST_BITS_1 | VSI_NN_BROAD_CAST_BITS_2: - { - vx_uint32 w0 = sizes0[0] * sizes0[1]; - vx_uint32 w1 = sizes1[0] * sizes1[1]; - vx_uint32 w = sizes2[0] * sizes2[1]; - vx_uint32 h = sizes0[2]; - - if (h < VSI_NN_MAX_IMAGE_WIDTH && (w0 == 1 || w1 == 1) - && w < VSI_NN_MAX_IMAGE_WIDTH) - { - sizes0[0] = w0; - sizes1[0] = w1; - sizes2[0] = w; - sizes0[1] = sizes0[2]; - sizes1[1] = sizes1[2]; - sizes2[1] = sizes2[2]; - sizes0[2] = 1; - sizes1[2] = 1; - sizes2[2] = 1; - } - - break; - } - case VSI_NN_BROAD_CAST_BITS_2: - { - vx_uint32 w = sizes0[0] * sizes0[1]; - - if (w < VSI_NN_MAX_IMAGE_WIDTH) - { - sizes0[0] = w; - sizes1[0] = w; - sizes2[0] = w; - sizes0[1] = sizes0[2]; - sizes1[1] = sizes1[2]; - sizes2[1] = sizes2[2]; - sizes0[2] = 1; - sizes1[2] = 1; - sizes2[2] = 1; - } - - break; - } - default: - if (dims == output->attr.dim_num) - status = FALSE; - break; - } - } - -#undef VSI_NN_MAX_IMAGE_WIDTH - - if (status == TRUE) - *dim_num = vsi_nn_max(dims, 2); - - if (dims > 2 && sizes2[2] != 1) - { - status = FALSE; - } - - return status; -} - -void vsi_nn_print_int_array( int32_t* array, size_t size ) +void vsi_nn_print_size_array( vsi_size_t* array, size_t size ) { size_t i; size_t n; @@ -1110,7 +876,7 @@ void vsi_nn_print_int_array( int32_t* array, size_t size ) n = 0; for( i = 0; i < size; i ++ ) { - n += snprintf( &buf[n], _MSG_SIZE - n, "%d, ", array[i] ); + n += snprintf( &buf[n], _MSG_SIZE - n, "%"VSI_SIZE_T_SPECIFIER", ", array[i] ); if( n >= _MSG_SIZE ) { break; @@ -1118,7 +884,7 @@ void vsi_nn_print_int_array( int32_t* array, size_t size ) } VSILOGD( "%s", buf ); #undef _MSG_SIZE -} /* vsi_nn_print_int_array() */ +} /* vsi_nn_print_size_array() */ vsi_bool vsi_nn_IsEVISFeatureAvaiable ( diff --git a/src/tim/vx/internal/src/vsi_nn_graph.c b/src/tim/vx/internal/src/vsi_nn_graph.c index 57ab550..b721265 100644 --- a/src/tim/vx/internal/src/vsi_nn_graph.c +++ b/src/tim/vx/internal/src/vsi_nn_graph.c @@ -739,7 +739,8 @@ vsi_status vsi_nn_SetupGraph goto final; } - /* Set all of tensor attribute in graph to high precision */ + /* set tensor's precision before compute_node + so that internal tensor can know the precision information*/ status = set_graph_precision(graph, nodes_list); if(VSI_SUCCESS != status) { @@ -753,6 
+754,13 @@ vsi_status vsi_nn_SetupGraph goto final; } + /* set precision again to make sure any tensor created by compute_node has the correct precision info*/ + status = set_graph_precision(graph, nodes_list); + if(VSI_SUCCESS != status) + { + goto final; + } + /* Try setup graph complete signal node. */ status = vsi_nn_TrySetupCompleteSignalNode( graph ); TEST_CHECK_STATUS( status, final ); diff --git a/src/tim/vx/internal/src/vsi_nn_graph_optimization.c b/src/tim/vx/internal/src/vsi_nn_graph_optimization.c index 4cdbd82..c1551a6 100644 --- a/src/tim/vx/internal/src/vsi_nn_graph_optimization.c +++ b/src/tim/vx/internal/src/vsi_nn_graph_optimization.c @@ -540,9 +540,9 @@ static vx_tensor _create_const_raw_tensor #ifdef VSI_PERCHANNEL_QUANTIZATION_SUPPORT // This is a hack that driver doesn't support const scale scales = (float *)malloc(sizeof(float) * attr.dtype.scale_dim); - zeroPoints = (int32_t *)malloc(sizeof(int32_t) * attr.dtype.zero_points_dim); + zeroPoints = (int32_t *)malloc(sizeof(attr.dtype.zero_points[0]) * attr.dtype.zero_points_dim); memcpy(scales, attr.dtype.scales, attr.dtype.scale_dim * sizeof(float)); - memcpy(zeroPoints, attr.dtype.zero_points, attr.dtype.zero_points_dim * sizeof(float)); + memcpy(zeroPoints, attr.dtype.zero_points, attr.dtype.zero_points_dim * sizeof(attr.dtype.zero_points[0])); params.quant_data.affinePerChannel.channelDim = attr.dtype.channel_dim; params.quant_data.affinePerChannel.scaleCount = attr.dtype.scale_dim; params.quant_data.affinePerChannel.scales = scales; @@ -559,14 +559,14 @@ static vx_tensor _create_const_raw_tensor if( TRUE == attr.is_created_from_handle ) { vx_tensor_addressing addr; - uint32_t stride_size[VSI_NN_MAX_DIM_NUM]; - uint32_t buf_sz; + vsi_size_t stride_size[VSI_NN_MAX_DIM_NUM]; + vsi_size_t buf_sz; buf_sz = vsi_nn_GetStrideSize( &attr, stride_size ); if( buf_sz > 0 ) { - uint32_t align_start_size = graph->handle_manager.align_start_size; - uint32_t align_block_size = graph->handle_manager.align_block_size; + vsi_size_t align_start_size = graph->handle_manager.align_start_size; + vsi_size_t align_block_size = graph->handle_manager.align_block_size; if (data == NULL) { data = vsi_nn_MallocAlignedBuffer(buf_sz, align_start_size, @@ -592,8 +592,26 @@ static vx_tensor _create_const_raw_tensor } if( data ) { +#ifdef VSI_40BIT_VA_SUPPORT addr = vxCreateTensorAddressing(graph->ctx->c, - attr.size, stride_size, (vx_uint8)attr.dim_num); + attr.size, stride_size, (vsi_size_t)attr.dim_num); +#else + { + vsi_size_t i; + uint32_t size_32bit[_cnt_of_array(attr.size)] = {0}; + uint32_t stride_size_32bit[_cnt_of_array(stride_size)] = {0}; + for(i = 0; i < _cnt_of_array(attr.size); i++) + { + size_32bit[i] = (uint32_t)attr.size[i]; + } + for(i = 0; i < _cnt_of_array(stride_size); i++) + { + stride_size_32bit[i] = (uint32_t)stride_size[i]; + } + addr = vxCreateTensorAddressing(graph->ctx->c, + size_32bit, stride_size_32bit, (vx_uint8)attr.dim_num); + } +#endif #ifdef VX_13_NN_COMPATIBLITY tensor = vxCreateTensorFromHandle2(graph->ctx->c, &params, sizeof(vx_tensor_create_params_t), @@ -678,8 +696,8 @@ static void _convert_const_I8toU8 uint8_t * data = NULL; vsi_nn_tensor_t * tensor = vsi_nn_GetTensor(graph, id); vsi_nn_tensor_attr_t *attr = &tensor->attr; - uint32_t sz = 0; - uint32_t i = 0; + vsi_size_t sz = 0; + vsi_size_t i = 0; sz = vsi_nn_GetElementNum( tensor ); diff --git a/src/tim/vx/internal/src/vsi_nn_internal_node.c b/src/tim/vx/internal/src/vsi_nn_internal_node.c index 7d9a623..50b9d62 100644 --- 
a/src/tim/vx/internal/src/vsi_nn_internal_node.c +++ b/src/tim/vx/internal/src/vsi_nn_internal_node.c @@ -56,8 +56,8 @@ static vsi_nn_internal_node_t* vsi_nn_internal_create_node ( vsi_nn_graph_t* graph, vsi_nn_op_t op, - uint32_t input_num, - uint32_t output_num + vsi_size_t input_num, + vsi_size_t output_num ) { vsi_nn_internal_node_t* node = NULL; @@ -160,6 +160,7 @@ vsi_nn_internal_tensor_t* vsi_nn_internal_create_zero_bias_tensor vsi_nn_node_t* node, vsi_nn_tensor_attr_t* input_attr, vsi_nn_tensor_attr_t* weight_attr, + vsi_nn_op_t op, vsi_bool use_virtual_tensor ) { @@ -170,7 +171,25 @@ vsi_nn_internal_tensor_t* vsi_nn_internal_create_zero_bias_tensor memset(&attr, 0x0, sizeof(vsi_nn_tensor_attr_t)); /* create zero bias for NN/TP */ - attr.size[0] = weight_attr->size[1]; + switch(op) + { + case VSI_NN_OP_FCL: + case VSI_NN_OP_FCL2: + case VSI_NN_OP_FCL_RELU: + attr.size[0] = weight_attr->size[1]; + break; + case VSI_NN_OP_CONV2D: + case VSI_NN_OP_CONV_RELU: + case VSI_NN_OP_CONV_RELU_POOL: + case VSI_NN_OP_GROUPED_CONV2D: + attr.size[0] = weight_attr->size[3]; + break; + default: + attr.size[0] = weight_attr->size[1]; // default is FC + VSILOGW("Ovxlib only auto fill bias for conv2d and fc, but current op is %s\n", + vsi_nn_OpGetName(op)); + break; + } attr.dim_num = 1; attr.vtl = use_virtual_tensor; attr.is_const = !use_virtual_tensor; @@ -422,7 +441,7 @@ void vsi_nn_internal_init_tensor_attr { memset(attr, 0x00, sizeof(vsi_nn_tensor_attr_t)); - //memset(attr->size, 0, VSI_NN_MAX_DIM_NUM * sizeof(uint32_t)); + //memset(attr->size, 0, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t)); attr->dim_num = VSI_NN_DIM_AUTO; attr->vtl = use_virtual_tensor; attr->is_const = FALSE; @@ -445,8 +464,8 @@ vsi_nn_internal_node_t* vsi_nn_internal_new_node ( vsi_nn_node_t* node, vsi_nn_op_t op, - uint32_t input_num, - uint32_t output_num + vsi_size_t input_num, + vsi_size_t output_num ) { vsi_nn_internal_node_t* inode = NULL; @@ -589,6 +608,41 @@ vsi_bool vsi_nn_internal_setup_node return retn; } /* vsi_nn_internal_setup_node() */ +static vsi_status _set_reference_tensor_name + ( + vsi_nn_graph_t *graph, + vsi_nn_node_t* node, + vsi_nn_node_t* sub_node, + vsi_nn_tensor_t ** outputs + ) +{ +#define _NODE_ID_LEN 64 + vsi_status status; + vsi_nn_tensor_t *tensor; + uint32_t i; + char name[_NODE_ID_LEN]; + if (NULL == node || NULL == graph) + { + return VSI_FAILURE; + } + + status = VSI_SUCCESS; + for (i = 0; i < sub_node->output.num; i++) + { + memset(name, 0, sizeof(char) * _NODE_ID_LEN); + snprintf(name, sizeof(char) * _NODE_ID_LEN, "uid_%u_sub_uid_%u_out_%u", node->uid, sub_node->uid, i); + tensor = outputs[i]; + if (tensor && tensor->t) + { + status = vxSetReferenceName((vx_reference)tensor->t, name); + TEST_CHECK_STATUS(status, final); + } + } + +final: + return status; +} /* _set_reference_tensor_name() */ + vsi_status vsi_nn_internal_compute_node ( vsi_nn_node_t* node @@ -610,6 +664,13 @@ vsi_status vsi_nn_internal_compute_node VSILOGD("Compute node uid[%u] sub_uid[%u] op[%s]", node->uid, curr->node->uid, vsi_nn_OpGetName(curr->node->op)); + + status = _set_reference_tensor_name(node->graph, node, curr->node, curr->outputs); + if ( VSI_SUCCESS != status ) + { + VSILOGW("Set reference node[%d] sub_uid[%u] %s output tensor name fail", + node->uid, curr->node->uid, vsi_nn_OpGetName(curr->node->op)); + } status = vsi_nn_OpCompute( curr->node->op, curr->node, curr->inputs, curr->outputs ); if( VSI_SUCCESS != status ) { diff --git a/src/tim/vx/internal/src/vsi_nn_node.c b/src/tim/vx/internal/src/vsi_nn_node.c 
index 98f05a7..0d5fbc8 100644 --- a/src/tim/vx/internal/src/vsi_nn_node.c +++ b/src/tim/vx/internal/src/vsi_nn_node.c @@ -38,8 +38,8 @@ vsi_nn_node_t * vsi_nn_NewNode ( vsi_nn_graph_t * graph, vsi_nn_op_t op, - uint32_t input_num, - uint32_t output_num + vsi_size_t input_num, + vsi_size_t output_num ) { vsi_nn_node_t * node; @@ -70,16 +70,16 @@ vsi_nn_node_t * vsi_nn_NewNode } /* init output struct */ - node->output.num = output_num; + node->output.num = (uint32_t)output_num; node->output.tensors = (vsi_nn_tensor_id_t *) malloc( output_num * sizeof( vsi_nn_tensor_id_t ) ); - vsi_nn_InitTensorsId( node->output.tensors, output_num ); + vsi_nn_InitTensorsId( node->output.tensors, (uint32_t)output_num ); /* init input struct */ - node->input.num = input_num; + node->input.num = (uint32_t)input_num; node->input.tensors = (vsi_nn_tensor_id_t *) malloc( input_num * sizeof( vsi_nn_tensor_id_t ) ); - vsi_nn_InitTensorsId( node->input.tensors, input_num ); + vsi_nn_InitTensorsId( node->input.tensors, (uint32_t)input_num ); node->attr.const_tensor_preload_type = VSI_NN_NODE_PRELOAD_NONE; node->attr.enable_op_constraint_check = TRUE; } diff --git a/src/tim/vx/internal/src/vsi_nn_node_attr_template.c b/src/tim/vx/internal/src/vsi_nn_node_attr_template.c index e0a5bd6..1f46c3f 100644 --- a/src/tim/vx/internal/src/vsi_nn_node_attr_template.c +++ b/src/tim/vx/internal/src/vsi_nn_node_attr_template.c @@ -190,6 +190,11 @@ static _node_template s_template[] = /* SEQUENCE_MASK */ NULL, /* REPEAT */ NULL, /* SCATTER_ND_UPDATE */ NULL, + /* CONV2D_LSTM */ NULL, + /* CONV2D_LSTM_CELL */ NULL, + /* GRU */ NULL, + /* GRUCELL */ NULL, + /* GRUCELL_ACTIVATION */ NULL, }; //_compiler_assert( _cnt_of_array(s_template) == VSI_NN_OP_NUM, vsi_nn_node_attr_template_c ); diff --git a/src/tim/vx/internal/src/vsi_nn_ops.c b/src/tim/vx/internal/src/vsi_nn_ops.c index 84cdfb1..0214680 100644 --- a/src/tim/vx/internal/src/vsi_nn_ops.c +++ b/src/tim/vx/internal/src/vsi_nn_ops.c @@ -284,8 +284,8 @@ void vsi_nn_OpGetIoNum ( vsi_nn_op_t op, vsi_nn_node_t * node, - uint32_t * input_num, - uint32_t * output_num + vsi_size_t * input_num, + vsi_size_t * output_num ) { const vsi_nn_op_proc_t * proc; @@ -294,11 +294,11 @@ void vsi_nn_OpGetIoNum { if( NULL != input_num ) { - *input_num = proc->input_num; + *input_num = (uint32_t)proc->input_num; } if( NULL != output_num ) { - *output_num = proc->output_num; + *output_num = (uint32_t)proc->output_num; } } } /* vsi_nn_OpGetIoNum() */ diff --git a/src/tim/vx/internal/src/vsi_nn_pre_post_process.c b/src/tim/vx/internal/src/vsi_nn_pre_post_process.c index a97bd7f..fe05e1e 100644 --- a/src/tim/vx/internal/src/vsi_nn_pre_post_process.c +++ b/src/tim/vx/internal/src/vsi_nn_pre_post_process.c @@ -37,8 +37,8 @@ static void _create_yuv_norm_tensors vsi_nn_tensor_id_t* yuv_tensors ) { - int w = 0; - int h = 0; + vsi_size_t w = 0; + vsi_size_t h = 0; vsi_nn_tensor_attr_t y_input_attr; vsi_nn_tensor_attr_t uv_input_attr; @@ -118,7 +118,7 @@ static void _set_preproc_node_rect_params ( vsi_nn_node_t* node, vsi_nn_preprocess_crop_t* crop, - vsi_nn_tensor_attr_t attr, + vsi_nn_tensor_attr_t* attr, vsi_nn_preprocess_source_layout_e* source_layout ) { @@ -133,12 +133,12 @@ static void _set_preproc_node_rect_params { node->nn_param.pre_process.rect.left = 0; node->nn_param.pre_process.rect.top = 0; - node->nn_param.pre_process.rect.width = attr.size[0]; - node->nn_param.pre_process.rect.height = attr.size[1]; + node->nn_param.pre_process.rect.width = (uint32_t)attr->size[0]; + 
node->nn_param.pre_process.rect.height = (uint32_t)attr->size[1]; if(*source_layout == VSI_NN_SOURCE_LAYOUT_NHWC) { - node->nn_param.pre_process.rect.width = attr.size[1]; - node->nn_param.pre_process.rect.height = attr.size[2]; + node->nn_param.pre_process.rect.width = (uint32_t)attr->size[1]; + node->nn_param.pre_process.rect.height = (uint32_t)attr->size[2]; } } } /* _set_preproc_node_rect_params() */ @@ -147,7 +147,7 @@ static void _set_preproc_node_norm_params ( vsi_nn_node_t* node, vsi_nn_preprocess_mean_and_scale_t* mean_and_scale, - vsi_nn_tensor_attr_t attr + vsi_nn_tensor_attr_t* attr ) { int32_t i = 0; @@ -161,7 +161,7 @@ static void _set_preproc_node_norm_params } else { - for(i = 0; i < (int32_t)attr.dim_num - 1; i++) + for(i = 0; i < (int32_t)attr->dim_num - 1; i++) { node->nn_param.pre_process.norm.mean[i] = 0; } @@ -173,13 +173,13 @@ static void _set_preproc_node_out_attr ( vsi_nn_node_t* node, vsi_nn_preprocess_image_resize_t* image_resize, - vsi_nn_tensor_t* org_norm_tensor, + vsi_nn_tensor_attr_t* attr, vsi_nn_preprocess_source_layout_e* source_layout ) { - node->nn_param.pre_process.dim_num = org_norm_tensor->attr.dim_num; - node->nn_param.pre_process.output_attr.dim_num = org_norm_tensor->attr.dim_num; - node->nn_param.pre_process.output_attr.size = org_norm_tensor->attr.size; + node->nn_param.pre_process.dim_num = attr->dim_num; + node->nn_param.pre_process.output_attr.dim_num = attr->dim_num; + node->nn_param.pre_process.output_attr.size = attr->size; if(image_resize != NULL) { node->nn_param.pre_process.output_attr.size[0] = image_resize->w; @@ -197,14 +197,14 @@ static void _set_preproc_node_out_attr static void _set_preproc_node_input_attr ( vsi_nn_tensor_attr_t* input_attr, - vsi_nn_tensor_t* org_norm_tensor, + vsi_nn_tensor_attr_t* attr, vsi_nn_preprocess_image_size_t* input_size, vsi_nn_preprocess_source_format_e* source_format, vsi_nn_preprocess_source_layout_e* source_layout ) { - *input_attr = org_norm_tensor->attr; - input_attr->dim_num = org_norm_tensor->attr.dim_num; + *input_attr = *attr; + input_attr->dim_num = attr->dim_num; if(input_size != NULL) { input_attr->size[0] = input_size->w; @@ -271,11 +271,11 @@ static void _set_preproc_node_input_attr static void _set_preproc_node_output_attr ( vsi_nn_tensor_attr_t* output_attr, - vsi_nn_tensor_t* org_norm_tensor, + vsi_nn_tensor_attr_t* attr, vsi_nn_preprocess_dtype_convert_t* data_convert ) { - *output_attr = org_norm_tensor->attr; + *output_attr = *attr; if(data_convert != NULL) { output_attr->dtype = data_convert->dtype; @@ -289,13 +289,13 @@ static void _set_preproc_node_output_attr static void _set_postproc_node_output_attr ( vsi_nn_tensor_attr_t* output_attr, - vsi_nn_tensor_t* org_norm_tensor, + vsi_nn_tensor_attr_t* attr, vsi_nn_postprocess_permute_t* permute, vsi_nn_postprocess_dtype_convert_t* dtype_convert ) { int32_t i = 0; - output_attr->dim_num = org_norm_tensor->attr.dim_num; + output_attr->dim_num = attr->dim_num; output_attr->is_const = FALSE; output_attr->vtl = FALSE; if(dtype_convert != NULL) @@ -304,28 +304,103 @@ static void _set_postproc_node_output_attr } else { - output_attr->dtype = org_norm_tensor->attr.dtype; + output_attr->dtype = attr->dtype; } if(permute != NULL) { for(i = 0; i < permute->dim; i++) { - output_attr->size[i] = org_norm_tensor->attr.size[permute->perm[i]]; + output_attr->size[i] = attr->size[permute->perm[i]]; } } else { - for(i = 0; i < (int32_t)org_norm_tensor->attr.dim_num; i++) + for(i = 0; i < (int32_t)attr->dim_num; i++) { - output_attr->size[i] = 
org_norm_tensor->attr.size[i]; + output_attr->size[i] = attr->size[i]; } } } /* _set_postproc_node_output_attr() */ +static void _reconnect_graph_inputs + ( + vsi_nn_graph_t* graph, + vsi_nn_tensor_id_t org_input, + uint32_t input_idx, + vsi_nn_tensor_id_t* inputs, + uint32_t inputs_num + ) +{ + vsi_nn_tensor_id_t cur_input; + uint32_t i; + uint32_t final_idx; + + final_idx = input_idx; + /* get the new input idx */ + for(i = input_idx; i < graph->input.num; i++) + { + cur_input = graph->input.tensors[i]; + if(cur_input == org_input) + { + final_idx = i; + break; + } + } + /* move next inputs to save space for new inputs*/ + for(i = graph->input.num-1; i > final_idx + inputs_num - 1; i--) + { + graph->input.tensors[i] = graph->input.tensors[i - inputs_num + 1]; + } + + /* connect new inputs */ + for(i = 0; i < inputs_num; i++) + { + graph->input.tensors[final_idx + i] = inputs[i]; + } +}/* _reconnect_graph_inputs() */ + +static void _get_org_graph_inputs + ( + vsi_nn_graph_t* graph, + vsi_nn_tensor_id_t* graph_inputs + ) +{ + uint32_t i; + uint32_t idx = 0; + vsi_nn_tensor_id_t cur_input; + uint32_t nodes_count = 0; + vsi_nn_node_t* nodes[1] = {NULL}; + for(i = 0; i < graph->input.num; i++) + { + cur_input = graph->input.tensors[i]; + vsi_nn_get_tensor_consumers(graph, cur_input, NULL, &nodes_count); + if(nodes_count == 1) + { + vsi_nn_get_tensor_consumers(graph, cur_input, nodes, NULL); + if(nodes[0]->op == VSI_NN_OP_PRE_PROCESS) + { + if(nodes[0]->nn_param.pre_process.type == VSI_NN_SOURCE_FORMAT_IMAGE_YUV420 || + nodes[0]->nn_param.pre_process.type == VSI_NN_SOURCE_FORMAT_IMAGE_YUV444 ) + { + i += 2 ; + } + else if(nodes[0]->nn_param.pre_process.type == VSI_NN_SOURCE_FORMAT_IMAGE_NV12) + { + i += 1; + } + } + } + + graph_inputs[idx] = cur_input; + idx += 1; + } +}/* _get_org_graph_inputs() */ + vsi_status vsi_nn_add_single_preproc_node ( vsi_nn_graph_t* graph, uint32_t input_idx, + vsi_nn_tensor_id_t org_input, vsi_nn_node_t** first_node, uint32_t nodes_count, vsi_nn_preprocess_base_t* preprocess, @@ -344,10 +419,8 @@ vsi_status vsi_nn_add_single_preproc_node vsi_nn_preprocess_dtype_convert_t* data_convert = NULL; vsi_nn_tensor_attr_t input_attr; vsi_nn_tensor_attr_t output_attr; - vsi_nn_tensor_attr_t attr; - vsi_nn_tensor_id_t preproc_input; + vsi_nn_tensor_id_t preproc_inputs[3] = {0}; vsi_nn_tensor_id_t preproc_output; - vsi_nn_tensor_id_t yuv_inputs[3]; vsi_nn_tensor_t* org_norm_tensor = NULL; uint32_t node_input_num = 1; int32_t reverse_channel = 0; @@ -355,8 +428,7 @@ vsi_status vsi_nn_add_single_preproc_node uint32_t j = 0; uint32_t idx =0; - org_norm_tensor = vsi_nn_GetTensor(graph,graph->input.tensors[input_idx]); - attr = org_norm_tensor->attr; + org_norm_tensor = vsi_nn_GetTensor(graph, org_input); /* Get preprocess configurations*/ for(idx = 0; idx < proc_count; idx++) @@ -424,12 +496,12 @@ vsi_status vsi_nn_add_single_preproc_node status = _set_preproc_node_type(node, source_format); TEST_CHECK_STATUS(status, final); - _set_preproc_node_rect_params(node, crop, attr, source_layout); - _set_preproc_node_norm_params(node, mean_and_scale, attr); + _set_preproc_node_rect_params(node, crop, &org_norm_tensor->attr, source_layout); + _set_preproc_node_norm_params(node, mean_and_scale, &org_norm_tensor->attr); if(permute != NULL) { - if((uint32_t)permute->dim != attr.dim_num) + if((uint32_t)permute->dim != org_norm_tensor->attr.dim_num) { VSILOGE("Preprocess permute dim dosen't match input dim"); status = VSI_FAILURE; @@ -439,27 +511,33 @@ vsi_status vsi_nn_add_single_preproc_node 
     }
 
     if(reverse_channel)
+    {
         node->nn_param.pre_process.reverse_channel = TRUE;
+    }
     else
+    {
         node->nn_param.pre_process.reverse_channel = FALSE;
+    }
 
-    _set_preproc_node_out_attr(node, image_resize, org_norm_tensor, source_layout);
+    _set_preproc_node_out_attr(node, image_resize, &org_norm_tensor->attr, source_layout);
 
     /* Set input tensor attr */
-    _set_preproc_node_input_attr(&input_attr, org_norm_tensor, input_size, source_format, source_layout);
+    _set_preproc_node_input_attr(&input_attr, &org_norm_tensor->attr, input_size, source_format, source_layout);
 
     /* Set output tensor attr */
-    _set_preproc_node_output_attr(&output_attr, org_norm_tensor, data_convert);
+    _set_preproc_node_output_attr(&output_attr, &org_norm_tensor->attr, data_convert);
 
     /* Create new norm and virtual tensors */
     if (*source_format == VSI_NN_SOURCE_FORMAT_IMAGE_YUV420 ||
         *source_format == VSI_NN_SOURCE_FORMAT_IMAGE_NV12 ||
        *source_format == VSI_NN_SOURCE_FORMAT_IMAGE_YUV444)
     {
-        _create_yuv_norm_tensors(graph, &input_attr, source_layout, source_format, yuv_inputs);
+        _create_yuv_norm_tensors(graph, &input_attr, source_layout, source_format, preproc_inputs);
+    }
+    else
+    {
+        preproc_inputs[0] = vsi_nn_AddTensor(graph, VSI_NN_TENSOR_ID_AUTO, &input_attr, NULL);
     }
-
-    preproc_input = vsi_nn_AddTensor(graph, VSI_NN_TENSOR_ID_AUTO, &input_attr, NULL);
     preproc_output = vsi_nn_AddTensor(graph, VSI_NN_TENSOR_ID_AUTO, &output_attr, NULL);
 
     /* Reconnect node tensors */
@@ -467,7 +545,7 @@ vsi_status vsi_nn_add_single_preproc_node
     {
         for(j = 0; j < first_node[i]->input.num; j++)
         {
-            if(first_node[i]->input.tensors[j] == graph->input.tensors[input_idx])
+            if(first_node[i]->input.tensors[j] == org_input)
             {
                 first_node[i]->input.tensors[j] = preproc_output;
                 break;
@@ -475,21 +553,12 @@ vsi_status vsi_nn_add_single_preproc_node
         }
     }
 
-    if (*source_format == VSI_NN_SOURCE_FORMAT_IMAGE_YUV420 ||
-        *source_format == VSI_NN_SOURCE_FORMAT_IMAGE_NV12 ||
-        *source_format == VSI_NN_SOURCE_FORMAT_IMAGE_YUV444)
+    for(i = 0; i < node_input_num; i++)
     {
-        for (i = 0; i < node_input_num; i++)
-        {
-            node->input.tensors[i] = yuv_inputs[i];
-            graph->input.tensors[input_idx*node_input_num+i] = yuv_inputs[i];
-        }
-    }
-    else
-    {
-        node->input.tensors[0] = preproc_input;
-        graph->input.tensors[input_idx] = preproc_input;
+        node->input.tensors[i] = preproc_inputs[i];
     }
+    _reconnect_graph_inputs(graph, org_input, input_idx, preproc_inputs, node_input_num);
+
     node->output.tensors[0] = preproc_output;
 
     status = VSI_SUCCESS;
@@ -558,7 +627,7 @@ vsi_status vsi_nn_add_single_postproc_node
     input_attr.vtl = TRUE;
 
     /* Set output tensor attr */
-    _set_postproc_node_output_attr(&output_attr, org_norm_tensor, permute, dtype_convert);
+    _set_postproc_node_output_attr(&output_attr, &org_norm_tensor->attr, permute, dtype_convert);
 
     /* Create new norm and virtual tensor */
     postproc_input = vsi_nn_AddTensor(graph, VSI_NN_TENSOR_ID_AUTO, &input_attr, NULL);
@@ -618,20 +687,29 @@ vsi_status vsi_nn_AddGraphPreProcess
     vsi_nn_tensor_id_t input;
     uint32_t nodes_count = 0;
     vsi_nn_node_t** nodes = NULL;
+    vsi_nn_tensor_id_t* graph_inputs=NULL;
 
-    input = graph->input.tensors[input_idx];
+    graph_inputs = (vsi_nn_tensor_id_t*)malloc(sizeof(vsi_nn_tensor_id_t)*graph->input.num);
+    _get_org_graph_inputs(graph, graph_inputs);
+    input = graph_inputs[input_idx];
     vsi_nn_get_tensor_consumers(graph, input, NULL, &nodes_count);
     if(nodes_count != 0)
     {
        nodes = (vsi_nn_node_t**)malloc(sizeof(vsi_nn_node_t*)*nodes_count);
        vsi_nn_get_tensor_consumers(graph, input, nodes, NULL);
-       status = vsi_nn_add_single_preproc_node(graph, input_idx, nodes, nodes_count, preprocess, count);
+       status = vsi_nn_add_single_preproc_node(graph, input_idx, input, nodes, nodes_count, preprocess, count);
     }
+
     if(nodes) { free(nodes); nodes = NULL; }
+    if(graph_inputs)
+    {
+        free(graph_inputs);
+        graph_inputs = NULL;
+    }
     return status;
 } /* vsi_nn_AddGraphPreProcess() */
diff --git a/src/tim/vx/internal/src/vsi_nn_rnn.c b/src/tim/vx/internal/src/vsi_nn_rnn.c
index 991817d..2a3baab 100644
--- a/src/tim/vx/internal/src/vsi_nn_rnn.c
+++ b/src/tim/vx/internal/src/vsi_nn_rnn.c
@@ -48,10 +48,10 @@ static vsi_status internal_buffer_init
     )
 {
     vsi_status status = VSI_FAILURE;
-    uint32_t element_num = 0;
-    uint32_t i = 0;
+    vsi_size_t element_num = 0;
+    vsi_size_t i = 0;
     uint32_t stride = 0;
-    uint32_t data_size = 0;
+    vsi_size_t data_size = 0;
     uint8_t* data = NULL;
 
     if( TRUE == tensor->attr.vtl )
@@ -126,7 +126,7 @@ static vsi_status internal_buffer_copy_to_tensor
     )
 {
     vsi_status status = VSI_FAILURE;
-    uint32_t request_data_size = 0;
+    vsi_size_t request_data_size = 0;
     vsi_nn_tensor_t* tensor = NULL;
 
     if( NULL == buffer )
@@ -156,7 +156,7 @@ static vsi_status internal_buffer_copy_from_tensor
     )
 {
     vsi_status status = VSI_FAILURE;
-    uint32_t request_data_size = 0;
+    vsi_size_t request_data_size = 0;
     uint8_t* data = NULL;
     vsi_nn_tensor_t* tensor = NULL;
diff --git a/src/tim/vx/internal/src/vsi_nn_rnn_helper.c b/src/tim/vx/internal/src/vsi_nn_rnn_helper.c
index b389bc2..9ba72ad 100644
--- a/src/tim/vx/internal/src/vsi_nn_rnn_helper.c
+++ b/src/tim/vx/internal/src/vsi_nn_rnn_helper.c
@@ -114,7 +114,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_process_input_for_nn_fc
     vsi_nn_tensor_attr_t attr;
     vsi_nn_internal_tensor_t* tensor1 = NULL;
     vsi_nn_internal_tensor_t* tensor2 = NULL;
-    uint32_t* reshape_in_size = NULL;
+    vsi_size_t* reshape_in_size = NULL;
     uint32_t* permute_in_perm = NULL;
     vsi_nn_internal_node_t* tmp_inode = NULL;
 
@@ -123,7 +123,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_process_input_for_nn_fc
     tensor1 = vsi_nn_internal_new_tensor(self, &attr, 0.0f);
 
     tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_RESHAPE, 0, 0 );
-    reshape_in_size = (uint32_t *)vsi_nn_internal_new_node_param(tmp_inode, 4 * sizeof(uint32_t));
+    reshape_in_size = (vsi_size_t*)vsi_nn_internal_new_node_param(tmp_inode, 4 * sizeof(vsi_size_t));
 
     reshape_in_size[3] = input->attr.size[1];
     reshape_in_size[2] = input->attr.size[0] / (kernel_h * kernel_w);
@@ -138,8 +138,8 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_process_input_for_nn_fc
 
     if( multi_batch )
     {
-        uint32_t reshape_size[4] = { 0 };
-        uint32_t c = 0, h = 0;
+        vsi_size_t reshape_size[4] = { 0 };
+        vsi_size_t c = 0, h = 0;
         vsi_nn_internal_tensor_t* tensor0 = NULL;
         h = tensor1->t->attr.size[2];
         c = tensor1->t->attr.size[1];
@@ -186,7 +186,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_process_output_for_nn_fc
     vsi_nn_tensor_attr_t attr;
     vsi_nn_internal_tensor_t* tensor1 = NULL;
     vsi_nn_internal_tensor_t* tensor2 = NULL;
-    uint32_t* reshape_in_size = NULL;
+    vsi_size_t* reshape_in_size = NULL;
     uint32_t* permute_in_perm = NULL;
     vsi_nn_internal_node_t* tmp_inode = NULL;
     vsi_nn_tensor_t* tensor = input;
@@ -196,8 +196,8 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_process_output_for_nn_fc
 
     if( multi_batch )
     {
-        uint32_t reshape_size[4] = { 0 };
-        uint32_t c = 0, h = 0;
+        vsi_size_t reshape_size[4] = { 0 };
+        vsi_size_t c = 0, h = 0;
         vsi_nn_internal_tensor_t* tensor0 = NULL;
         h = tensor->attr.size[2];
         c = tensor->attr.size[1];
@@ -232,7 +232,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_process_output_for_nn_fc
     tensor2 = vsi_nn_internal_new_tensor(self, &attr, 0.0f);
 
     tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_RESHAPE, 0, 0 );
-    reshape_in_size = (uint32_t *)vsi_nn_internal_new_node_param(tmp_inode, 4 * sizeof(uint32_t));
+    reshape_in_size = (vsi_size_t *)vsi_nn_internal_new_node_param(tmp_inode, 4 * sizeof(vsi_size_t));
 
     reshape_in_size[1] = tensor->attr.size[3];
     reshape_in_size[0] = tensor->attr.size[2];
@@ -259,7 +259,7 @@ vsi_bool vsi_nn_rnn_process_output_for_nn_fc2
 {
     vsi_nn_tensor_attr_t attr;
     vsi_nn_internal_tensor_t* tensor1 = NULL;
-    uint32_t* reshape_in_size = NULL;
+    vsi_size_t* reshape_in_size = NULL;
     uint32_t* permute_in_perm = NULL;
     vsi_nn_internal_node_t* tmp_inode = NULL;
     vsi_nn_tensor_t* tensor = input;
@@ -269,8 +269,8 @@ vsi_bool vsi_nn_rnn_process_output_for_nn_fc2
 
     if( multi_batch )
     {
-        uint32_t reshape_size[4] = { 0 };
-        uint32_t c = 0, h = 0;
+        vsi_size_t reshape_size[4] = { 0 };
+        vsi_size_t c = 0, h = 0;
         vsi_nn_internal_tensor_t* tensor0 = NULL;
         h = tensor->attr.size[2];
         c = tensor->attr.size[1];
@@ -304,7 +304,7 @@ vsi_bool vsi_nn_rnn_process_output_for_nn_fc2
     }
 
     tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_RESHAPE, 0, 0 );
-    reshape_in_size = (uint32_t *)vsi_nn_internal_new_node_param(tmp_inode, 4 * sizeof(uint32_t));
+    reshape_in_size = (vsi_size_t*)vsi_nn_internal_new_node_param(tmp_inode, 4 * sizeof(vsi_size_t));
 
     reshape_in_size[1] = tensor->attr.size[3];
     reshape_in_size[0] = tensor->attr.size[2];
@@ -339,7 +339,8 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_tp_fc
     if( !bias )
     {
         /* create zero bias for NN/TP */
-        tensor1 = vsi_nn_internal_create_zero_bias_tensor(self, &input->attr, &weight->attr, FALSE);
+        tensor1 = vsi_nn_internal_create_zero_bias_tensor(
+            self, &input->attr, &weight->attr, VSI_NN_OP_FCL, FALSE);
         tensor = tensor1->t;
     }
     vsi_nn_internal_init_tensor_attr(&attr, output_dtype, use_virtual_tensor);
@@ -347,7 +348,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_tp_fc
 
     tmp_inode = vsi_nn_internal_new_node(self, VSI_NN_OP_FCL, 0, 0 );
     tmp_inode->node->nn_param.fcl.axis = 0;
-    tmp_inode->node->nn_param.fcl.weights = weight->attr.size[1];
+    tmp_inode->node->nn_param.fcl.weights = (uint32_t)weight->attr.size[1];
 
     tmp_inode->inputs[0] = input;
     tmp_inode->inputs[1] = weight;
@@ -382,7 +383,8 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_nn_fc
     if( !bias )
     {
         /* create zero bias for NN/TP */
-        tensor1 = vsi_nn_internal_create_zero_bias_tensor(self, &input->attr, &weight->attr, FALSE);
+        tensor1 = vsi_nn_internal_create_zero_bias_tensor(
+            self, &input->attr, &weight->attr, VSI_NN_OP_FCL, FALSE);
         tensor = tensor1->t;
     }
 
@@ -403,7 +405,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_nn_fc
     tmp_inode->node->nn_param.conv2d.group = 1;
     tmp_inode->node->nn_param.conv2d.dilation[0] = 1;
     tmp_inode->node->nn_param.conv2d.dilation[1] = 1;
-    tmp_inode->node->nn_param.conv2d.weights = weight->attr.size[1];
+    tmp_inode->node->nn_param.conv2d.weights = (uint32_t)weight->attr.size[1];
 
     tmp_inode->inputs[0] = input;
     tmp_inode->inputs[1] = reshaped_weight_tensor->t;
@@ -424,14 +426,14 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_prepare_weight_for_nn_fc
 {
     vsi_nn_tensor_attr_t attr;
     vsi_nn_internal_tensor_t* reshaped_weight_tensor = NULL;
-    uint32_t reshaped_weight_shape[VSI_NN_MAX_DIM_NUM] = { 0 };
+    vsi_size_t reshaped_weight_shape[VSI_NN_MAX_DIM_NUM] = { 0 };
 
     reshaped_weight_shape[3] = weight->attr.size[1];
     reshaped_weight_shape[2] = weight->attr.size[0] / ( kernel_h * kernel_w );
     reshaped_weight_shape[1] = kernel_h;
     reshaped_weight_shape[0] = kernel_w;
 
-    memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(uint32_t));
+    memset( attr.size, 0, VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t));
     attr.dim_num = VSI_NN_DIM_AUTO;
     attr.vtl = weight->attr.vtl;
     attr.is_const = FALSE;
@@ -475,7 +477,8 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_nn_fc_relu
     if( !bias )
     {
         /* create zero bias for NN/TP */
-        tensor1 = vsi_nn_internal_create_zero_bias_tensor(self, &input->attr, &weight->attr, FALSE);
+        tensor1 = vsi_nn_internal_create_zero_bias_tensor(
+            self, &input->attr, &weight->attr, VSI_NN_OP_FCL, FALSE);
         tensor = tensor1->t;
     }
 
@@ -496,7 +499,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_nn_fc_relu
     tmp_inode->node->nn_param.conv2d.group = 1;
     tmp_inode->node->nn_param.conv2d.dilation[0] = 1;
     tmp_inode->node->nn_param.conv2d.dilation[1] = 1;
-    tmp_inode->node->nn_param.conv2d.weights = weight->attr.size[1];
+    tmp_inode->node->nn_param.conv2d.weights = (uint32_t)weight->attr.size[1];
     tmp_inode->node->vx_param.overflow_policy = VX_CONVERT_POLICY_WRAP;
     tmp_inode->node->vx_param.rounding_policy = VX_ROUND_POLICY_TO_ZERO;
     tmp_inode->node->vx_param.has_relu = has_relu;
@@ -684,11 +687,11 @@ void vsi_nn_rnn_data_check_aligned
     vsi_nn_tensor_attr_t attr;
     vsi_nn_internal_tensor_t* output_tensor = NULL;
     uint32_t i = 0;
-    uint32_t ofst = 0;
+    vsi_size_t ofst = 0;
 
     ofst = 0;
     for( i = 0; i < time_step; i++ )
     {
-        uint32_t tensor_size = vsi_nn_GetTensorSize( input[i]->attr.size,
+        vsi_size_t tensor_size = vsi_nn_GetTensorSize( input[i]->attr.size,
             input[i]->attr.dim_num, input[i]->attr.dtype.vx_type );
 
         if( ofst & 0x3f )
@@ -719,7 +722,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_reshape_split_output
     vsi_nn_internal_node_t* curr = NULL;
     vsi_nn_tensor_attr_t attr;
     vsi_nn_internal_tensor_t* output_tensor = NULL;
-    uint32_t *reshape_split_size = NULL;
+    vsi_size_t *reshape_split_size = NULL;
 
     memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
     /* reshape for split output */
@@ -727,8 +730,8 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_reshape_split_output
     output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
 
     curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE, 0, 0 );
-    reshape_split_size = (uint32_t *)vsi_nn_internal_new_node_param(curr,
-        VSI_NN_MAX_DIM_NUM * sizeof(uint32_t));
+    reshape_split_size = (vsi_size_t *)vsi_nn_internal_new_node_param(curr,
+        VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t));
     reshape_split_size[0] = -1;
     reshape_split_size[1] = batch_size;
@@ -752,7 +755,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_reshape_cell_output
     vsi_nn_internal_node_t* curr = NULL;
     vsi_nn_tensor_attr_t attr;
     vsi_nn_internal_tensor_t* output_tensor = NULL;
-    uint32_t* reshape_grucell_output_size = NULL;
+    vsi_size_t* reshape_grucell_output_size = NULL;
 
     memset(&attr, 0, sizeof(vsi_nn_tensor_attr_t));
 
@@ -761,8 +764,8 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_reshape_cell_output
     output_tensor = vsi_nn_internal_new_tensor( self, &attr, 0.0f );
 
     curr = vsi_nn_internal_new_node( self, VSI_NN_OP_RESHAPE, 0, 0 );
-    reshape_grucell_output_size = (uint32_t *)vsi_nn_internal_new_node_param(curr,
-        VSI_NN_MAX_DIM_NUM * sizeof(uint32_t));
+    reshape_grucell_output_size = (vsi_size_t*)vsi_nn_internal_new_node_param(curr,
+        VSI_NN_MAX_DIM_NUM * sizeof(vsi_size_t));
     reshape_grucell_output_size[0] = -1;
     reshape_grucell_output_size[1] = batch_size;
     reshape_grucell_output_size[2] = 1;
@@ -880,7 +883,7 @@ vsi_nn_internal_tensor_t** vsi_nn_create_split
     if(!slices)
     {
         slices = (uint32_t *)vsi_nn_internal_new_node_param(curr, slices_num * sizeof(uint32_t));
-        num_per_output = tensor->attr.size[axis] / slices_num;
+        num_per_output = (uint32_t)(tensor->attr.size[axis] / slices_num);
         for( i = 0; i < slices_num; i++ )
         {
             slices[i] = num_per_output;
@@ -910,21 +913,21 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_reshape
     vsi_nn_node_t* self,
     vsi_nn_tensor_t* input_tensor,
     vsi_nn_tensor_t* output_tensor,
-    uint32_t* size,
-    uint32_t dim_num,
+    vsi_size_t* size,
+    vsi_size_t dim_num,
     vsi_bool use_virtual_tensor
     )
 {
     vsi_nn_internal_node_t* curr = NULL;
     vsi_nn_internal_tensor_t* tensor0 = NULL;
-    uint32_t* reshape_in_size = NULL;
+    vsi_size_t* reshape_in_size = NULL;
 
     curr = vsi_nn_internal_new_node(self, VSI_NN_OP_RESHAPE, 0, 0 );
-    reshape_in_size = (uint32_t *)vsi_nn_internal_new_node_param(curr, dim_num * sizeof(uint32_t));
-    memcpy(reshape_in_size, size, dim_num * sizeof(uint32_t));
+    reshape_in_size = (vsi_size_t*)vsi_nn_internal_new_node_param(curr, dim_num * sizeof(vsi_size_t));
+    memcpy(reshape_in_size, size, dim_num * sizeof(vsi_size_t));
     curr->node->nn_param.reshape.size = reshape_in_size;
-    curr->node->nn_param.reshape.dim_num = dim_num;
+    curr->node->nn_param.reshape.dim_num = (uint32_t)dim_num;
     curr->inputs[0] = input_tensor;
     curr->outputs[0] = output_tensor;
@@ -950,8 +953,8 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_permute
     vsi_nn_node_t* self,
     vsi_nn_tensor_t* input_tensor,
     vsi_nn_tensor_t* output_tensor,
-    uint32_t* perm,
-    uint32_t dim_num,
+    vsi_size_t* perm,
+    vsi_size_t dim_num,
     vsi_bool use_virtual_tensor
     )
 {
@@ -964,7 +967,7 @@ vsi_nn_internal_tensor_t* vsi_nn_rnn_create_permute
         dim_num * sizeof(uint32_t));
     memcpy(permute_in_perm, perm, dim_num * sizeof(uint32_t));
     curr->node->nn_param.permute.perm = permute_in_perm;
-    curr->node->nn_param.permute.dim_num = dim_num;
+    curr->node->nn_param.permute.dim_num = (uint32_t)dim_num;
     curr->inputs[0] = input_tensor;
 
     if(output_tensor)
diff --git a/src/tim/vx/internal/src/vsi_nn_tensor.c b/src/tim/vx/internal/src/vsi_nn_tensor.c
index 263d26f..3f662b6 100644
--- a/src/tim/vx/internal/src/vsi_nn_tensor.c
+++ b/src/tim/vx/internal/src/vsi_nn_tensor.c
@@ -45,10 +45,10 @@ static vsi_bool _try_set_const_tensor
 
 static vsi_bool _auto_cal_shape
     (
-    uint32_t * input_shape,
-    uint32_t input_dim,
-    uint32_t * shape,
-    uint32_t * dim_num
+    vsi_size_t * input_shape,
+    vsi_size_t input_dim,
+    vsi_size_t * shape,
+    vsi_size_t * dim_num
     );
 
 static vsi_bool _init_tensor
@@ -65,21 +65,21 @@ static vsi_nn_tensor_t * _create_tensor
     vsi_nn_tensor_attr_t * attr
     );
 
-static uint32_t get_tensor_elements_num
+static vsi_size_t get_tensor_elements_num
     (
-    const uint32_t * shape,
-    uint32_t dim_num,
+    const vsi_size_t * shape,
+    vsi_size_t dim_num,
     vsi_nn_type_e type
     )
 {
-    uint32_t num;
-    uint32_t sz;
+    vsi_size_t num;
+    vsi_size_t sz;
     uint32_t dsize;
 
     sz = vsi_nn_GetTensorSize( shape, dim_num, type );
     dsize = vsi_nn_GetTypeBytes( type );
-    num = (uint32_t)(sz / dsize);
+    num = sz / dsize;
     return num;
 } /* get_tensor_elements_num() */
 
@@ -239,16 +239,16 @@ static vsi_bool _try_set_const_tensor
 
 static vsi_bool _auto_cal_shape
     (
-    uint32_t * input_shape,
-    uint32_t input_dim,
-    uint32_t * shape,
-    uint32_t * dim_num
+    vsi_size_t * input_shape,
+    vsi_size_t input_dim,
+    vsi_size_t * shape,
+    vsi_size_t * dim_num
     )
 {
     vsi_bool ret;
-    int32_t neg_idx;
-    uint32_t i;
-    uint32_t total_size;
+    vsi_ssize_t neg_idx;
+    vsi_size_t i;
+    vsi_size_t total_size;
 
     ret = TRUE;
     neg_idx = -1;
@@ -262,13 +262,13 @@ static vsi_bool _auto_cal_shape
 
     for( i = 0; i < *dim_num; i ++ )
     {
-        if( -1 != (int32_t)shape[i] )
+        if( -1 != (vsi_ssize_t)shape[i] )
         {
             if (0 == shape[i])
             {
                 if (i >= input_dim)
                 {
-                    VSILOGE( "Wrong shape '%d' ", (int32_t)shape[i] );
+                    VSILOGE( "Wrong shape '%"VSI_SSIZE_T_SPECIFIER"' ", (vsi_ssize_t)shape[i] );
                     ret = FALSE;
                     break;
                 }
@@ -282,7 +282,7 @@ static vsi_bool _auto_cal_shape
             }
             else
             {
-                VSILOGE( "Wrong shape '%d' ", (int32_t)shape[i] );
+                VSILOGE( "Wrong shape '%"VSI_SSIZE_T_SPECIFIER"' ", (vsi_ssize_t)shape[i] );
                 ret = FALSE;
                 break;
             }
@@ -293,7 +293,7 @@ static vsi_bool _auto_cal_shape
     }
     else if(neg_idx != -1)
    {
-        shape[neg_idx] = total_size;
+        shape[neg_idx] = (uint32_t)total_size;
     }
     return ret;
 } /* _auto_cal_shape() */
@@ -309,7 +309,6 @@ static vsi_bool _init_tensor
     vx_tensor_create_params_t params;
     float * scales = NULL;
     int32_t * null_zp = NULL;
-
     ret = TRUE;
     memset( &params, 0, sizeof( vx_tensor_create_params_t ) );
@@ -363,14 +362,14 @@ static vsi_bool _init_tensor
     if( TRUE == tensor->attr.is_created_from_handle )
     {
         vx_tensor_addressing addr;
-        uint32_t stride_size[VSI_NN_MAX_DIM_NUM];
-        uint32_t buf_sz;
+        vsi_size_t stride_size[VSI_NN_MAX_DIM_NUM];
+        vsi_size_t buf_sz;
 
         buf_sz = vsi_nn_GetStrideSize( &tensor->attr, stride_size );
         if( buf_sz > 0 )
         {
-            uint32_t align_start_size = graph->handle_manager.align_start_size;
-            uint32_t align_block_size = graph->handle_manager.align_block_size;
+            vsi_size_t align_start_size = graph->handle_manager.align_start_size;
+            vsi_size_t align_block_size = graph->handle_manager.align_block_size;
             if (data == NULL)
             {
                 data = vsi_nn_MallocAlignedBuffer(buf_sz, align_start_size,
@@ -400,8 +399,25 @@ static vsi_bool _init_tensor
         }
         if( data )
         {
+#ifdef VSI_40BIT_VA_SUPPORT
             addr = vxCreateTensorAddressing(graph->ctx->c,
-                tensor->attr.size, stride_size, (uint8_t)tensor->attr.dim_num);
+                tensor->attr.size, stride_size, (vsi_size_t)tensor->attr.dim_num);
+#else
+            {
+                uint32_t i, size_32bit[_cnt_of_array(tensor->attr.size)] = {0};
+                uint32_t stride_size_32bit[_cnt_of_array(stride_size)] = {0};
+                for(i = 0; i < _cnt_of_array(tensor->attr.size); i++)
+                {
+                    size_32bit[i] = (uint32_t)tensor->attr.size[i];
+                }
+                for(i = 0; i < _cnt_of_array(stride_size); i++)
+                {
+                    stride_size_32bit[i] = (uint32_t)stride_size[i];
+                }
+                addr = vxCreateTensorAddressing(graph->ctx->c,
+                    size_32bit, stride_size_32bit, (uint8_t)tensor->attr.dim_num);
+            }
+#endif
 #ifdef VX_CREATE_TENSOR_SUPPORT_PHYSICAL
 #ifdef VX_13_NN_COMPATIBLITY
             tensor->t = vxCreateTensorFromHandle2(graph->ctx->c,
@@ -567,16 +583,16 @@ vsi_nn_tensor_t * vsi_nn_CreateTensorWithDefault
     vsi_nn_tensor_t* t = vsi_nn_CreateTensor( graph, attr );
 
     if( t )
     {
-        uint32_t size = 0;
-        uint32_t stride[VSI_NN_MAX_DIM_NUM] = { 0 };
+        vsi_size_t size = 0;
+        vsi_size_t stride[VSI_NN_MAX_DIM_NUM] = { 0 };
         uint8_t* data = NULL;
 
         size = vsi_nn_GetStrideSize( &t->attr, stride );
         data = (uint8_t *)malloc( size );
         if( data )
        {
-            uint32_t i = 0, j = 0;
-            uint32_t elements = size / stride[0];
+            vsi_size_t i = 0, j = 0;
+            vsi_size_t elements = size / stride[0];
             vsi_status status = VSI_FAILURE;
 
             status = vsi_nn_Float32ToDtype( defualt_value, &data[0], &t->attr.dtype );
@@ -618,16 +634,16 @@ vsi_status vsi_nn_FillTensorWithValue
 
     if( tensor )
     {
-        uint32_t size = 0;
-        uint32_t stride[VSI_NN_MAX_DIM_NUM] = { 0 };
+        vsi_size_t size = 0;
+        vsi_size_t stride[VSI_NN_MAX_DIM_NUM] = { 0 };
         uint8_t* data = NULL;
 
         size = vsi_nn_GetStrideSize( &tensor->attr, stride );
         data = (uint8_t *)malloc( size );
         if( data )
         {
-            uint32_t i = 0, j = 0;
-            uint32_t elements = size / stride[0];
+            vsi_size_t i = 0, j = 0;
+            vsi_size_t elements = size / stride[0];
 
             status = vsi_nn_Float32ToDtype( value, &data[0], &tensor->attr.dtype );
             if(stride[0] == 1)
@@ -775,15 +791,15 @@ vsi_status vsi_nn_QueryTensorAttr
     return status;
 } /* vsi_nn_QueryTensorAttr() */
 
-uint32_t vsi_nn_CopyTensorToBuffer
+vsi_size_t vsi_nn_CopyTensorToBuffer
     (
     vsi_nn_graph_t * graph,
     vsi_nn_tensor_t * tensor,
     void * buffer
     )
 {
-    uint32_t sz;
-    uint32_t stride_size[VSI_NN_MAX_DIM_NUM];
+    vsi_size_t sz;
+    vsi_size_t stride_size[VSI_NN_MAX_DIM_NUM];
     vsi_status status;
     if( NULL == tensor || NULL == buffer )
     {
@@ -808,8 +824,9 @@ float * vsi_nn_ConvertTensorToFloat32Data
 {
     vsi_status status;
     uint8_t *tensor_data = NULL;
-    uint32_t elements;
-    uint32_t i,stride;
+    vsi_size_t elements;
+    vsi_size_t i;
+    uint32_t stride;
     float *data;
 
     if(NULL == graph || NULL == tensor)
@@ -866,8 +883,8 @@ uint8_t * vsi_nn_ConvertTensorToData
     )
 {
     uint8_t * data;
-    uint32_t buf_sz;
-    uint32_t stride_size[VSI_NN_MAX_DIM_NUM];
+    vsi_size_t buf_sz;
+    vsi_size_t stride_size[VSI_NN_MAX_DIM_NUM];
     vsi_status status;
     if( NULL == tensor )
     {
@@ -925,16 +942,16 @@ uint8_t * vsi_nn_ConvertRawTensorToData
     (
     vx_context context,
     vx_tensor tensor,
-    uint32_t * dim,
+    vsi_size_t * dim,
     vx_enum * data_format,
-    uint32_t * size,
-    uint32_t * stride_size,
+    vsi_size_t * size,
+    vsi_size_t * stride_size,
     vx_tensor_addressing * addr,
     vx_enum accessor
     )
 {
     uint8_t * data;
-    uint32_t buf_sz;
+    vsi_size_t buf_sz;
     vsi_status status;
     vsi_nn_tensor_attr_t attr;
     if( NULL == tensor || NULL == context )
@@ -945,11 +962,11 @@ uint8_t * vsi_nn_ConvertRawTensorToData
     status = VSI_FAILURE;
     data = NULL;
 
-    status = vxQueryTensor(tensor, VX_TENSOR_NUM_OF_DIMS, dim, sizeof(uint32_t));
-    status = vxQueryTensor(tensor, VX_TENSOR_DIMS, size, sizeof(uint32_t) * (*dim));
+    status = vxQueryTensor(tensor, VX_TENSOR_NUM_OF_DIMS, dim, sizeof(vsi_size_t));
+    status = vxQueryTensor(tensor, VX_TENSOR_DIMS, size, sizeof(vsi_size_t) * (*dim));
     status = vxQueryTensor(tensor, VX_TENSOR_DATA_TYPE, data_format, sizeof(vsi_enum));
-    attr.dim_num = *dim;
-    memcpy(attr.size, size, sizeof(uint32_t) * attr.dim_num);
+    attr.dim_num = (uint32_t)(*dim);
+    memcpy(attr.size, size, sizeof(vsi_size_t) * attr.dim_num);
     buf_sz = vsi_nn_GetStrideSizeBySize(size, *dim, *data_format, stride_size);
     // TODO: Fix this to use copy tensor to buffer
@@ -984,13 +1001,13 @@ uint8_t * vsi_nn_ConvertRawTensorToData2
     vx_context context,
     vx_tensor tensor,
     vsi_nn_tensor_attr_t * attr,
-    uint32_t * stride_size,
+    vsi_size_t * stride_size,
     vx_tensor_addressing * addr,
     vx_enum accessor
     )
 {
     uint8_t * data;
-    uint32_t buf_sz;
+    vsi_size_t buf_sz;
     vsi_status status;
 
     if( NULL == tensor || NULL == context )
@@ -1002,9 +1019,9 @@ uint8_t * vsi_nn_ConvertRawTensorToData2
     data = NULL;
 
     status = vxQueryTensor(tensor, VX_TENSOR_NUM_OF_DIMS,
-        &(attr->dim_num), sizeof(uint32_t));
+        &(attr->dim_num), sizeof(attr->dim_num));
     status = vxQueryTensor(tensor, VX_TENSOR_DIMS,
-        attr->size, sizeof(uint32_t) * (attr->dim_num));
+        attr->size, sizeof(attr->size[0]) * (attr->dim_num));
     status = vxQueryTensor(tensor, VX_TENSOR_DATA_TYPE,
         &(attr->dtype.vx_type), sizeof(vsi_enum));
     status = vxQueryTensor(tensor, VX_TENSOR_QUANT_FORMAT,
@@ -1064,8 +1081,8 @@ void vsi_nn_SaveTensorToTextByFp32
     uint8_t buf[_TENSOR_TMPBUF_SZ];
     FILE * fp;
     float write_data;
-    uint32_t sz;
-    uint32_t i;
+    vsi_size_t sz;
+    vsi_size_t i;
     uint32_t count;
 
     if( NULL == graph || NULL == tensor || NULL == filename )
@@ -1122,7 +1139,7 @@ void vsi_nn_SaveTensorToText
     )
 {
     uint8_t * data;
-    uint32_t sz;
+    vsi_size_t sz;
 
     if( NULL == graph || NULL == tensor || NULL == filename )
     {
@@ -1146,7 +1163,7 @@ void vsi_nn_SaveDataToText
     (
     const char * filename,
     uint8_t * data,
-    uint32_t data_size,
+    vsi_size_t data_size,
     vsi_nn_type_e type,
     char * seperator
     )
@@ -1157,7 +1174,7 @@ void vsi_nn_SaveDataToText
     FILE * fp;
     float write_data;
     uint32_t type_bytes;
-    uint32_t i;
+    vsi_size_t i;
     uint32_t count;
 
     if( NULL == filename )
@@ -1216,7 +1233,7 @@ void vsi_nn_SaveTensorToBinary
 {
     uint8_t * data;
     FILE * fp;
-    uint32_t sz;
+    vsi_size_t sz;
     uint32_t i;
 
     if( NULL == graph || NULL == tensor || NULL == filename )
@@ -1237,7 +1254,7 @@ void vsi_nn_SaveTensorToBinary
         VSILOGW( "Write file %s fail. Please check...", filename );
         return;
     }
-    sz = vsi_nn_GetTypeBytes( tensor->attr.dtype.vx_type );
+    sz = (vsi_size_t)vsi_nn_GetTypeBytes( tensor->attr.dtype.vx_type );
     for( i = 0; i < tensor->attr.dim_num; i ++ )
     {
         sz *= tensor->attr.size[i];
@@ -1354,9 +1371,9 @@ vsi_status vsi_nn_CopyRawDataToTensor
     )
 {
     vsi_status status = VSI_FAILURE;
-    uint32_t src_data_sz = 0;
+    vsi_size_t src_data_sz = 0;
     uint8_t* buffer = NULL;
-    uint32_t target_tensor_size = 0; /* in bytes */
+    vsi_size_t target_tensor_size = 0; /* in bytes */
 
     src_data_sz = vsi_nn_GetElementNum(tensor) * vsi_nn_GetTypeBytes(src_dtype->vx_type);
     target_tensor_size = vsi_nn_GetTensorSize( tensor->attr.size, tensor->attr.dim_num, tensor->attr.dtype.vx_type );
@@ -1377,14 +1394,14 @@ vsi_bool vsi_nn_CalcReshapeTensor
     (
     vsi_nn_tensor_t * input,
     vsi_nn_tensor_t * output,
-    uint32_t * shape,
-    uint32_t dim_num
+    vsi_size_t * shape,
+    vsi_size_t dim_num
     )
 {
     vsi_bool ret;
     uint32_t i;
-    uint32_t total_size;
-    uint32_t dst_size;
+    vsi_size_t total_size;
+    vsi_size_t dst_size;
 
     if( NULL == input || NULL == output
         || NULL == shape || 0 == dim_num )
@@ -1404,7 +1421,7 @@ vsi_bool vsi_nn_CalcReshapeTensor
     dst_size = vsi_nn_ShapeProduct( shape, dim_num );
     if( total_size != dst_size )
     {
-        VSILOGE( "Cannot calculate the reshape tensor %u to %u.",
+        VSILOGE( "Cannot calculate the reshape tensor %"VSI_SIZE_T_SPECIFIER" to %"VSI_SIZE_T_SPECIFIER".",
             total_size, dst_size );
         return FALSE;
     }
@@ -1417,7 +1434,7 @@ vsi_bool vsi_nn_CalcReshapeTensor
         {
             output->attr.size[i] = shape[i];
         }
-        output->attr.dim_num = dim_num;
+        output->attr.dim_num = (uint32_t)dim_num;
     }
 }
 
@@ -1432,8 +1449,8 @@ vsi_nn_tensor_t *vsi_nn_reshape_tensor
     (
     vsi_nn_graph_t * graph,
     vsi_nn_tensor_t * input,
-    uint32_t * shape,
-    uint32_t dim_num
+    vsi_size_t * shape,
+    vsi_size_t dim_num
     )
 {
     vsi_bool ret;
@@ -1470,13 +1487,13 @@ vsi_bool vsi_nn_ReshapeTensor
     vsi_nn_graph_t * graph,
     vsi_nn_tensor_t * input,
     vsi_nn_tensor_t * output,
-    const uint32_t * shape,
-    uint32_t dim_num
+    const vsi_size_t * shape,
+    vsi_size_t dim_num
     )
 {
     vsi_bool ret;
-    uint32_t new_shape[VSI_NN_MAX_DIM_NUM] = {0};
-    memcpy(new_shape, shape, sizeof(uint32_t) * dim_num);
+    vsi_size_t new_shape[VSI_NN_MAX_DIM_NUM] = {0};
+    memcpy(new_shape, shape, sizeof(vsi_size_t) * dim_num);
 
     ret = TRUE;
     ret = vsi_nn_CalcReshapeTensor(input, output, new_shape, dim_num);
@@ -1498,7 +1515,18 @@ vsi_bool vsi_nn_ReshapeTensor
     }
 
     /* Create reshape tensor */
-    output->t = vxReshapeTensor( input->t, (int32_t *)new_shape, dim_num );
+#ifdef VSI_40BIT_VA_SUPPORT
+    output->t = vxReshapeTensor( input->t, new_shape, dim_num );
+#else
+    {
+        uint32_t i, new_shape_32bit[VSI_NN_MAX_DIM_NUM] = {0};
+        for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++)
+        {
+            new_shape_32bit[i] = (uint32_t)new_shape[i];
+        }
+        output->t = vxReshapeTensor( input->t, (int32_t *)new_shape_32bit, (uint32_t)dim_num );
+    }
+#endif
     if( NULL == output->t )
     {
         ret = FALSE;
@@ -1516,16 +1544,16 @@ void vsi_nn_TransposeTensor
     (
     vsi_nn_graph_t * graph,
     vsi_nn_tensor_t * tensor,
-    uint32_t * perm,
-    uint32_t dim_num,
-    uint32_t * as_shape
+    vsi_size_t * perm,
+    vsi_size_t dim_num,
+    vsi_size_t * as_shape
     )
 {
     uint8_t * buf;
     uint8_t * dst;
-    uint32_t buf_sz;
-    uint32_t tensor_sz;
-    uint32_t * shape_ptr;
+    vsi_size_t buf_sz;
+    vsi_size_t tensor_sz;
+    vsi_size_t * shape_ptr;
     vsi_status status;
 
     if( NULL == tensor || NULL == perm || 0 == dim_num )
@@ -1572,15 +1600,15 @@ void vsi_nn_PermuteTensor
     (
     vsi_nn_graph_t * graph,
     vsi_nn_tensor_t * tensor,
-    uint32_t * perm,
-    uint32_t dim_num
+    vsi_size_t * perm,
+    vsi_size_t dim_num
     )
 {
     uint8_t * buf = NULL;
     uint8_t * dst = NULL;
-    uint32_t tensor_sz;
-    uint32_t * shape_ptr;
-    uint32_t dst_shape[VSI_NN_MAX_DIM_NUM] = {0};
+    vsi_size_t tensor_sz;
+    vsi_size_t * shape_ptr;
+    vsi_size_t dst_shape[VSI_NN_MAX_DIM_NUM] = {0};
     uint32_t i;
     vsi_status status;
 
@@ -1621,7 +1649,11 @@ void vsi_nn_PermuteTensor
     }
     vsi_nn_Permute( dst, buf, shape_ptr, dim_num, perm, tensor->attr.dtype.vx_type );
     memcpy(tensor->attr.size, dst_shape, sizeof(dst_shape));
-    tensor->t = vxReshapeTensor(tensor->t, (int32_t *)tensor->attr.size, tensor->attr.dim_num);
+#ifdef VSI_40BIT_VA_SUPPORT
+    tensor->t = vxReshapeTensor(tensor->t, tensor->attr.size, tensor->attr.dim_num);
+#else
+    tensor->t = vxReshapeTensor(tensor->t, (int32_t*)tensor->attr.size, tensor->attr.dim_num);
+#endif
     status = vsi_nn_CopyDataToTensor( graph, tensor, dst );
     if( VSI_SUCCESS != status )
     {
@@ -1632,7 +1664,7 @@ void vsi_nn_PermuteTensor
     if( dst ) { free(dst); dst = NULL; }
 } /* vsi_nn_PermuteTensor() */
 
-uint32_t vsi_nn_GetElementNum
+vsi_size_t vsi_nn_GetElementNum
     (
     const vsi_nn_tensor_t * tensor
     )
@@ -1646,15 +1678,15 @@ uint32_t vsi_nn_GetElementNum
         tensor->attr.dim_num, tensor->attr.dtype.vx_type);
 } /* vsi_nn_GetElementNum() */
 
-uint32_t vsi_nn_GetTensorSize
+vsi_size_t vsi_nn_GetTensorSize
     (
-    const uint32_t * shape,
-    uint32_t dim_num,
+    const vsi_size_t * shape,
+    vsi_size_t dim_num,
     vsi_nn_type_e type
     )
 {
-    uint32_t sz;
-    uint32_t i;
+    vsi_size_t sz;
+    vsi_size_t i;
     sz = 0;
     if( NULL == shape || 0 == dim_num )
     {
@@ -1769,8 +1801,8 @@ void vsi_nn_PrintTensor
 vx_tensor vsi_nn_CreateViewTensor
     (
     vsi_nn_graph_t *graph,
-    uint32_t *start,
-    uint32_t *end,
+    vsi_size_t *start,
+    vsi_size_t *end,
     vsi_nn_tensor_t *tensor
     )
 {
@@ -1867,7 +1899,7 @@ vsi_nn_tensor_rel_t *vsi_nn_CreateTensorRelevance
     vsi_nn_tensor_rel_t *tensor_ref;
     vsi_nn_node_t *node;
 
-#define _MAX_TENSOR_IO 32
+#define _MAX_TENSOR_IO 128
     max_io = _MAX_TENSOR_IO;
     tensor_num = graph->tensor_num;
     tensor_ref = _init_tensor_rel_buffer(graph, max_io);
@@ -1927,8 +1959,8 @@ vsi_status vsi_nn_SwapTensorHandle
     vsi_nn_tensor_t * tensor1
     )
 {
-    uint32_t stride_size[VSI_NN_MAX_DIM_NUM];
-    uint32_t buf_sz0, buf_sz1;
+    vsi_size_t stride_size[VSI_NN_MAX_DIM_NUM];
+    vsi_size_t buf_sz0, buf_sz1;
     vsi_status status = VSI_FAILURE;
 
     if( NULL == tensor0 || NULL == tensor1 )
@@ -1962,7 +1994,7 @@ vsi_status vsi_nn_SwapTensorHandle
     return status;
 } /* vsi_nn_SwapTensorHandle() */
 
-uint32_t vsi_nn_vxGetTensorElementNum
+vsi_size_t vsi_nn_vxGetTensorElementNum
     (
     vsi_nn_tensor_attr_t *attr
     )
@@ -1990,10 +2022,10 @@ vsi_status vsi_nn_vxGetTensorAttr
     }
 
     status = vxQueryTensor(tensor, VX_TENSOR_NUM_OF_DIMS,
-        &(attr->dim_num), sizeof(uint32_t));
+        &(attr->dim_num), sizeof(attr->dim_num));
     TEST_CHECK_STATUS( status, final );
     status = vxQueryTensor(tensor, VX_TENSOR_DIMS,
-        attr->size, sizeof(uint32_t) * (attr->dim_num));
+        attr->size, sizeof(attr->size[0]) * (attr->dim_num));
     TEST_CHECK_STATUS( status, final );
     status = vxQueryTensor(tensor, VX_TENSOR_DATA_TYPE,
         &(attr->dtype.vx_type), sizeof(vsi_enum));
@@ -2033,10 +2065,10 @@ uint8_t *vsi_nn_vxCopyTensorToData
 {
     uint8_t *data;
     vsi_status status;
-    uint32_t buf_sz;
-    uint32_t stride_size[VSI_NN_MAX_DIM_NUM];
+    vsi_size_t buf_sz;
+    vsi_size_t stride_size[VSI_NN_MAX_DIM_NUM];
 
-    memset(stride_size, 0, sizeof(uint32_t) * VSI_NN_MAX_DIM_NUM);
+    memset(stride_size, 0, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM);
     if(NULL == tensor || NULL == context || NULL == attr)
     {
         return NULL;
@@ -2073,7 +2105,7 @@ vsi_status vsi_nn_vxCopyDataToTensor
     )
 {
     vsi_status status;
-    uint32_t stride_size[VSI_NN_MAX_DIM_NUM];
+    vsi_size_t stride_size[VSI_NN_MAX_DIM_NUM];
 
     status = VSI_FAILURE;
     if(NULL == tensor || NULL == attr ||
@@ -2082,7 +2114,7 @@ vsi_status vsi_nn_vxCopyDataToTensor
         return status;
     }
 
-    memset(stride_size, 0, sizeof(uint32_t) * VSI_NN_MAX_DIM_NUM);
+    memset(stride_size, 0, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM);
     vsi_nn_GetStrideSize(attr, stride_size);
     status = vsi_nn_copy_tensor_patch(tensor, attr, data, VX_WRITE_ONLY);
     if(VSI_SUCCESS != status)
@@ -2097,9 +2129,9 @@ vsi_status vsi_nn_copy_tensor_veiw_patch
     vx_tensor tensor,
     vsi_nn_tensor_attr_t *attr,
     void *user_ptr,
-    uint32_t *start,
-    uint32_t *end,
-    uint32_t *stride,
+    vsi_size_t *start,
+    vsi_size_t *end,
+    vsi_size_t *stride,
     vsi_enum usage,
     vsi_enum user_memory_type
     )
@@ -2127,7 +2159,7 @@ vsi_status vsi_nn_copy_tensor_veiw_patch
     {
         vx_context context = NULL;
         vx_tensor_addressing addr = NULL;
-        uint32_t stride_size[VSI_NN_MAX_DIM_NUM];
+        size_t stride_size[VSI_NN_MAX_DIM_NUM];
         vsi_nn_tensor_attr_t t;
 
         memset(vstart, 0, sizeof(size_t) * VSI_NN_MAX_DIM_NUM);
@@ -2142,7 +2174,7 @@ vsi_status vsi_nn_copy_tensor_veiw_patch
             return status;
         }
         addr = vxCreateTensorAddressing( context, attr->size,
-            stride_size, attr->dim_num );
+            (vx_uint32*)stride_size, attr->dim_num );
        if( NULL == addr )
        {
            VSILOGE("Call vxCreateTensorAddressing fail");
@@ -2174,28 +2206,32 @@ vsi_status vsi_nn_copy_tensor_patch
     vsi_enum usage
     )
 {
-    uint32_t start[VSI_NN_MAX_DIM_NUM],end[VSI_NN_MAX_DIM_NUM],stride[VSI_NN_MAX_DIM_NUM];
+    vsi_size_t start[VSI_NN_MAX_DIM_NUM],end[VSI_NN_MAX_DIM_NUM],stride[VSI_NN_MAX_DIM_NUM];
     vsi_status status = VSI_FAILURE;
+    uint32_t i;
 
     if(NULL == tensor || NULL == user_ptr)
     {
         VSILOGE("Invalid parameter");
         return status;
     }
     vsi_nn_GetStrideSize(attr, stride);
-    memset(start, 0, sizeof(uint32_t) * VSI_NN_MAX_DIM_NUM);
-    memcpy(end, attr->size, sizeof(uint32_t) * VSI_NN_MAX_DIM_NUM);
+    memset(start, 0, sizeof(vsi_size_t) * VSI_NN_MAX_DIM_NUM);
+    for(i = 0; i < VSI_NN_MAX_DIM_NUM; i++)
+    {
+        end[i] = attr->size[i];
+    }
     status = vsi_nn_copy_tensor_veiw_patch(tensor, attr, user_ptr, start, end, stride, usage, 0);
     return status;
 } /* vsi_nn_copy_tensor_patch() */
 
-uint32_t vsi_nn_GetOffsetByCoords
+vsi_size_t vsi_nn_GetOffsetByCoords
     (
     vsi_nn_tensor_attr_t *attr,
     uint32_t *coords
     )
 {
-    uint32_t i, res = 0, strides = 1;
-    for (i = 0; i < attr->dim_num; i++)
+    vsi_size_t i, res = 0, strides = 1;
+    for (i = 0; i < (vsi_size_t)attr->dim_num; i++)
     {
         res += coords[i] * strides;
         strides *= attr->size[i];
@@ -2209,15 +2245,15 @@ void vsi_nn_reshuffle_weight_data
     vsi_nn_tensor_t * weights
     )
 {
-    int32_t b, sy, sx, c, h, w;
+    vsi_ssize_t b, sy, sx, c, h, w;
     uint8_t* weight_data = NULL;
     uint8_t* reshuffled_weights = NULL;
     uint8_t* buffer = NULL;
-    int32_t kernel_size_x = weights->attr.size[0];
-    int32_t kernel_size_y = weights->attr.size[1];
-    int32_t weight_size_c = weights->attr.size[2];
-    int32_t weight_size_b = weights->attr.size[3];
-    int32_t slice_size = kernel_size_x * kernel_size_y;
+    vsi_ssize_t kernel_size_x = weights->attr.size[0];
+    vsi_ssize_t kernel_size_y = weights->attr.size[1];
+    vsi_ssize_t weight_size_c = weights->attr.size[2];
+    vsi_ssize_t weight_size_b = weights->attr.size[3];
+    vsi_ssize_t slice_size = kernel_size_x * kernel_size_y;
     int32_t item_size = vsi_nn_TypeGetBytes(weights->attr.dtype.vx_type);
 
     weight_data = vsi_nn_ConvertTensorToData(graph, weights);
@@ -2254,7 +2290,7 @@ void vsi_nn_reshuffle_weight_data
                 for (w = 0; w < kernel_size_x; w++)
                 {
                     uint8_t* reshuffled_output = weight_output + (h * kernel_size_x + w) * item_size;
-                    int32_t input_index = ((kernel_size_y - 1 - h) + sy) * kernel_size_x +
+                    vsi_ssize_t input_index = ((kernel_size_y - 1 - h) + sy) * kernel_size_x +
                         ((kernel_size_x - 1 - w) + sx);
                     memcpy(reshuffled_output, data + input_index * item_size, item_size);
@@ -2365,10 +2401,10 @@ vsi_bool vsi_nn_ConvertTensor
 {
     vsi_bool ret = TRUE;
     uint8_t* src_buf = NULL;
-    uint32_t sz = 0;
+    vsi_size_t sz = 0;
     uint32_t src_stride = 0;
     uint32_t dst_stride = 0;
-    uint32_t dst_buf_sz = 0;
+    vsi_size_t dst_buf_sz = 0;
     uint8_t* dst_buf = NULL;
 
     if( NULL == graph || NULL == input || NULL == output )
@@ -2391,7 +2427,7 @@ vsi_bool vsi_nn_ConvertTensor
 
     if ( dst_buf )
     {
-        uint32_t i = 0;
+        vsi_size_t i = 0;
         vsi_status status = VSI_SUCCESS;
 
         for ( i = 0; i < sz; i ++ )
@@ -2418,4 +2454,4 @@ vsi_bool vsi_nn_ConvertTensor
 
     vsi_nn_safe_free( dst_buf );
     return ret;
-}
\ No newline at end of file
+}
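The hunks above migrate ovxlib shape and rank parameters from uint32_t/int32_t to vsi_size_t. As a minimal illustration of what a caller sees after this change, the sketch below reshapes an existing tensor through the updated vsi_nn_reshape_tensor() signature. It is not part of the patch: the graph and input tensor are assumed to exist already, the helper name is hypothetical, and the -1 wildcard follows the convention the patch itself uses (e.g. reshape_split_size[0] = -1), which _auto_cal_shape() resolves on the reshape path.

/* Illustrative sketch only -- not part of the patch above.
 * Assumes `graph` and `input` are a valid vsi_nn_graph_t* / vsi_nn_tensor_t*
 * created elsewhere, with the usual ovxlib headers included. */
static vsi_nn_tensor_t* reshape_to_2d_sketch
    (
    vsi_nn_graph_t * graph,
    vsi_nn_tensor_t * input
    )
{
    vsi_size_t shape[2];

    /* Keep the innermost dimension, fold everything else into one. */
    shape[0] = input->attr.size[0];
    shape[1] = (vsi_size_t)-1;  /* wildcard entry, as used by the patch */

    /* Shape pointer and rank are vsi_size_t under the new signature. */
    return vsi_nn_reshape_tensor( graph, input, shape, 2 );
}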