Added general Float16 support (#631)
Added Float16 type definition from third-party Refine float16 bias handlling in conv2d Refine float16 case in conv2d Caution: Headers of float16 only be included when build unit_test Type: New Feature Signed-off-by: Feiyue Chen <Feiyue.Chen@verisilicon.com>
This commit is contained in:
parent
35e50d7692
commit
af50cc5e3f
2
BUILD
2
BUILD
|
|
@ -134,8 +134,10 @@ cc_binary(
|
||||||
cc_test (
|
cc_test (
|
||||||
name = "unit_test",
|
name = "unit_test",
|
||||||
copts = ["-std=c++14", "-Werror"],
|
copts = ["-std=c++14", "-Werror"],
|
||||||
|
includes = ["third_party/half"],
|
||||||
srcs = [
|
srcs = [
|
||||||
"src/tim/vx/test_utils.h",
|
"src/tim/vx/test_utils.h",
|
||||||
|
"third_party/half/half.hpp"
|
||||||
] + glob(["src/tim/**/*_test.cc"]),
|
] + glob(["src/tim/**/*_test.cc"]),
|
||||||
deps = [
|
deps = [
|
||||||
"@gtest//:gtest",
|
"@gtest//:gtest",
|
||||||
|
|
|
||||||
|
|
@ -98,6 +98,8 @@ if(TIM_VX_ENABLE_TEST)
|
||||||
FetchContent_Populate(googletest)
|
FetchContent_Populate(googletest)
|
||||||
add_subdirectory(${googletest_SOURCE_DIR} ${googletest_BINARY_DIR})
|
add_subdirectory(${googletest_SOURCE_DIR} ${googletest_BINARY_DIR})
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
include_directories(third_party/half)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if(TIM_VX_ENABLE_GRPC)
|
if(TIM_VX_ENABLE_GRPC)
|
||||||
|
|
|
||||||
|
|
@ -99,12 +99,9 @@ class Conv2d : public BuiltinOp {
|
||||||
const int32_t multiplier_;
|
const int32_t multiplier_;
|
||||||
const DataLayout kernel_layout_;
|
const DataLayout kernel_layout_;
|
||||||
|
|
||||||
#if defined(__clang__) && (__clang_major__ >= 15)
|
|
||||||
#define TIM_VX_OPS_CONV2D_WITH_F16BIAS 1
|
|
||||||
private:
|
private:
|
||||||
void OnBindInputPostProc(const std::shared_ptr<Tensor>& tensor,
|
void OnBindInputPostProc(const std::shared_ptr<Tensor>& tensor,
|
||||||
int32_t input_idx) override;
|
int32_t input_idx) override;
|
||||||
#endif
|
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace ops
|
} // namespace ops
|
||||||
|
|
|
||||||
|
|
@ -42,8 +42,8 @@ Conv2d::Conv2d(Graph* graph, const std::array<uint32_t, 4> pad,
|
||||||
const std::array<uint32_t, 2>& stride,
|
const std::array<uint32_t, 2>& stride,
|
||||||
const std::array<uint32_t, 2>& dilation, int32_t multiplier,
|
const std::array<uint32_t, 2>& dilation, int32_t multiplier,
|
||||||
DataLayout input_layout, DataLayout kernel_layout)
|
DataLayout input_layout, DataLayout kernel_layout)
|
||||||
: Conv2d(graph, 0, PadType::AUTO, {0, 0}, stride, dilation, pad,
|
: Conv2d(graph, 0, PadType::AUTO, {0, 0}, stride, dilation, pad, multiplier,
|
||||||
multiplier, input_layout, kernel_layout) {}
|
input_layout, kernel_layout) {}
|
||||||
|
|
||||||
Conv2d::Conv2d(Graph* graph, int32_t weights, PadType padding,
|
Conv2d::Conv2d(Graph* graph, int32_t weights, PadType padding,
|
||||||
const std::array<uint32_t, 2>& ksize,
|
const std::array<uint32_t, 2>& ksize,
|
||||||
|
|
@ -88,41 +88,33 @@ std::shared_ptr<Operation> Conv2d::Clone(std::shared_ptr<Graph>& graph) const {
|
||||||
this->kernel_layout_);
|
this->kernel_layout_);
|
||||||
}
|
}
|
||||||
|
|
||||||
const std::vector<std::shared_ptr<Tensor>> Conv2d::ConstantInputsTensor() const {
|
const std::vector<std::shared_ptr<Tensor>> Conv2d::ConstantInputsTensor()
|
||||||
if (this->IsAllInputsConst()) {
|
const {
|
||||||
|
if (this->IsAllInputsConst()) {
|
||||||
return {this->impl_->inputs_tensor_[0]};
|
return {this->impl_->inputs_tensor_[0]};
|
||||||
} else {
|
} else {
|
||||||
return {};
|
return {};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Handle float16 bias if clang compiler is no less than 15.0.0 version
|
// Handle float16 bias
|
||||||
#ifdef TIM_VX_OPS_CONV2D_WITH_F16BIAS
|
|
||||||
void Conv2d::OnBindInputPostProc(const std::shared_ptr<Tensor>& tensor,
|
void Conv2d::OnBindInputPostProc(const std::shared_ptr<Tensor>& tensor,
|
||||||
int32_t input_idx) {
|
int32_t input_idx) {
|
||||||
if (tensor->GetDataType() == vx::DataType::FLOAT16 &&
|
if (tensor->GetDataType() == vx::DataType::FLOAT16 &&
|
||||||
tensor->IsConstTensor() && impl_->inputs_tensor_.size() == 3) {
|
tensor->IsConstTensor() && impl_->inputs_tensor_.size() == 3) {
|
||||||
uint32_t bias_size = 1;
|
float* float32_bias = tensor->ConvertTensorToFloat32Data();
|
||||||
for (auto i : tensor->GetShape()) {
|
|
||||||
bias_size *= i;
|
|
||||||
}
|
|
||||||
std::vector<_Float16> in(bias_size);
|
|
||||||
tensor->CopyDataFromTensor(in.data());
|
|
||||||
|
|
||||||
std::vector<float> out(bias_size);
|
|
||||||
for (uint i = 0; i < bias_size; i++) {
|
|
||||||
out[i] = static_cast<float>(in[i]);
|
|
||||||
}
|
|
||||||
TensorSpec fp32bias_spec(tim::vx::DataType::FLOAT32, tensor->GetShape(),
|
TensorSpec fp32bias_spec(tim::vx::DataType::FLOAT32, tensor->GetShape(),
|
||||||
tim::vx::TensorAttribute::CONSTANT);
|
tim::vx::TensorAttribute::CONSTANT);
|
||||||
auto out_tensor = impl_->graph_->CreateTensor(fp32bias_spec, out.data());
|
|
||||||
|
auto out_tensor = impl_->graph_->CreateTensor(fp32bias_spec, float32_bias);
|
||||||
|
vsi_nn_Free(float32_bias);
|
||||||
|
|
||||||
impl_->inputs_tensor_[2] = out_tensor;
|
impl_->inputs_tensor_[2] = out_tensor;
|
||||||
impl_->node()->input.tensors[input_idx] = out_tensor->GetId();
|
impl_->node()->input.tensors[input_idx] = out_tensor->GetId();
|
||||||
impl_->graph_->RenewTensorConsumersMap(tensor, out_tensor, this);
|
impl_->graph_->RenewTensorConsumersMap(tensor, out_tensor, this);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
|
||||||
} // namespace ops
|
} // namespace ops
|
||||||
} // namespace vx
|
} // namespace vx
|
||||||
|
|
|
||||||
|
|
@ -28,11 +28,12 @@
|
||||||
#include "tim/vx/context.h"
|
#include "tim/vx/context.h"
|
||||||
#include "tim/vx/graph.h"
|
#include "tim/vx/graph.h"
|
||||||
#include "tim/vx/types.h"
|
#include "tim/vx/types.h"
|
||||||
|
#include "third_party/half/half.hpp"
|
||||||
|
|
||||||
#ifdef TIM_VX_OPS_CONV2D_WITH_F16BIAS
|
|
||||||
TEST(Conv2d, shape_4_2_1_1_float16_PaddingTest) {
|
TEST(Conv2d, shape_4_2_1_1_float16_PaddingTest) {
|
||||||
auto ctx = tim::vx::Context::Create();
|
auto ctx = tim::vx::Context::Create();
|
||||||
auto graph = ctx->CreateGraph();
|
auto graph = ctx->CreateGraph();
|
||||||
|
using namespace half_float::literal;
|
||||||
|
|
||||||
tim::vx::ShapeType input_shape({4, 2, 1, 1}); //whcn
|
tim::vx::ShapeType input_shape({4, 2, 1, 1}); //whcn
|
||||||
tim::vx::ShapeType weight_shape({2, 2, 1, 3}); //whio
|
tim::vx::ShapeType weight_shape({2, 2, 1, 3}); //whio
|
||||||
|
|
@ -50,26 +51,29 @@ TEST(Conv2d, shape_4_2_1_1_float16_PaddingTest) {
|
||||||
tim::vx::TensorAttribute::OUTPUT);
|
tim::vx::TensorAttribute::OUTPUT);
|
||||||
|
|
||||||
// Input data nchw
|
// Input data nchw
|
||||||
std::vector<_Float16> input_data = {
|
|
||||||
1, 1, 1, 1, // row = 1
|
std::vector<half_float::half> input_data = {
|
||||||
2, 2, 3, 2 // row = 2
|
1.0_h, 1.0_h, 1.0_h, 1.0_h, // row = 1
|
||||||
|
2.0_h, 2.0_h, 3.0_h, 2.0_h // row = 2
|
||||||
};
|
};
|
||||||
|
|
||||||
// weight data oihw
|
// weight data oihw
|
||||||
std::vector<_Float16> weight_data = {
|
std::vector<half_float::half> weight_data = {
|
||||||
1, 2, 3, 4, //first 2x2 filter
|
1.0_h, 2.0_h, 3.0_h, 4.0_h, //first 2x2 filter
|
||||||
-1, 1, -1, 1, // second 2x2 filter
|
-1.0_h, 1.0_h, -1.0_h, 1.0_h, // second 2x2 filter
|
||||||
-1, -1, 1, 1, // third 2x2 filter
|
-1.0_h, -1.0_h, 1.0_h, 1.0_h, // third 2x2 filter
|
||||||
};
|
};
|
||||||
|
|
||||||
// bias data
|
// bias data
|
||||||
std::vector<_Float16> bias_data = {1, 2, 3};
|
std::vector<half_float::half> bias_data = {1.0_h, 2.0_h, 3.0_h};
|
||||||
|
|
||||||
// nchw
|
std::vector<half_float::half> golden = {
|
||||||
std::vector<_Float16> golden = {// first channel
|
// first channel
|
||||||
18, 22, 21, 8, 7, 9, 8, 3, 2, 3, 1, -1,
|
18.0_h, 22.0_h, 21.0_h, 8.0_h, 7.0_h, 9.0_h, 8.0_h, 3.0_h, 2.0_h, 3.0_h,
|
||||||
// second channel
|
1.0_h, -1.0_h,
|
||||||
2, 3, 1, 0, 5, 6, 6, 4, -1, -2, -2, 1};
|
// second channel
|
||||||
|
2.0_h, 3.0_h, 1.0_h, 0.0_h, 5.0_h, 6.0_h, 6.0_h, 4.0_h, -1.0_h, -2.0_h,
|
||||||
|
-2.0_h, 1.0_h};
|
||||||
|
|
||||||
auto input_tensor = graph->CreateTensor(input_spec);
|
auto input_tensor = graph->CreateTensor(input_spec);
|
||||||
auto weight_tensor = graph->CreateTensor(weight_spec, weight_data.data());
|
auto weight_tensor = graph->CreateTensor(weight_spec, weight_data.data());
|
||||||
|
|
@ -80,8 +84,8 @@ TEST(Conv2d, shape_4_2_1_1_float16_PaddingTest) {
|
||||||
std::array<uint32_t, 2> stride({1, 1});
|
std::array<uint32_t, 2> stride({1, 1});
|
||||||
std::array<uint32_t, 2> dilation({0, 0});
|
std::array<uint32_t, 2> dilation({0, 0});
|
||||||
|
|
||||||
auto conv2d = graph->CreateOperation<tim::vx::ops::Conv2d>(
|
auto conv2d =
|
||||||
padding, stride, dilation);
|
graph->CreateOperation<tim::vx::ops::Conv2d>(padding, stride, dilation);
|
||||||
(*conv2d)
|
(*conv2d)
|
||||||
.BindInput(input_tensor)
|
.BindInput(input_tensor)
|
||||||
.BindInput(weight_tensor)
|
.BindInput(weight_tensor)
|
||||||
|
|
@ -98,11 +102,10 @@ TEST(Conv2d, shape_4_2_1_1_float16_PaddingTest) {
|
||||||
for (auto i : output_tensor->GetShape()) {
|
for (auto i : output_tensor->GetShape()) {
|
||||||
output_size *= i;
|
output_size *= i;
|
||||||
}
|
}
|
||||||
std::vector<_Float16> output(output_size);
|
std::vector<half_float::half> output(output_size);
|
||||||
EXPECT_TRUE(output_tensor->CopyDataFromTensor(output.data()));
|
EXPECT_TRUE(output_tensor->CopyDataFromTensor(output.data()));
|
||||||
EXPECT_TRUE(ArraysMatch(golden, output, (_Float16)0.1));
|
EXPECT_TRUE(ArraysMatch(golden, output, (half_float::half)0.1));
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
|
||||||
TEST(Conv2d, shape_4_2_1_1_float32_PaddingTest) {
|
TEST(Conv2d, shape_4_2_1_1_float32_PaddingTest) {
|
||||||
auto ctx = tim::vx::Context::Create();
|
auto ctx = tim::vx::Context::Create();
|
||||||
|
|
@ -155,8 +158,8 @@ TEST(Conv2d, shape_4_2_1_1_float32_PaddingTest) {
|
||||||
std::array<uint32_t, 2> stride({1, 1});
|
std::array<uint32_t, 2> stride({1, 1});
|
||||||
std::array<uint32_t, 2> dilation({0, 0});
|
std::array<uint32_t, 2> dilation({0, 0});
|
||||||
|
|
||||||
auto conv2d = graph->CreateOperation<tim::vx::ops::Conv2d>(
|
auto conv2d =
|
||||||
padding, stride, dilation);
|
graph->CreateOperation<tim::vx::ops::Conv2d>(padding, stride, dilation);
|
||||||
(*conv2d)
|
(*conv2d)
|
||||||
.BindInput(input_tensor)
|
.BindInput(input_tensor)
|
||||||
.BindInput(weight_tensor)
|
.BindInput(weight_tensor)
|
||||||
|
|
@ -224,8 +227,8 @@ TEST(Conv2d, shape_4_2_2_2_float32_PointwiseTest) {
|
||||||
std::array<uint32_t, 2> stride({1, 1});
|
std::array<uint32_t, 2> stride({1, 1});
|
||||||
std::array<uint32_t, 2> dilation({0, 0});
|
std::array<uint32_t, 2> dilation({0, 0});
|
||||||
|
|
||||||
auto conv2d = graph->CreateOperation<tim::vx::ops::Conv2d>(
|
auto conv2d =
|
||||||
padding, stride, dilation);
|
graph->CreateOperation<tim::vx::ops::Conv2d>(padding, stride, dilation);
|
||||||
(*conv2d)
|
(*conv2d)
|
||||||
.BindInput(input_tensor)
|
.BindInput(input_tensor)
|
||||||
.BindInput(weight_tensor)
|
.BindInput(weight_tensor)
|
||||||
|
|
@ -295,8 +298,8 @@ TEST(Conv2d, shape_4_2_1_2_float32_SimpleTest) {
|
||||||
std::array<uint32_t, 2> stride({2, 2});
|
std::array<uint32_t, 2> stride({2, 2});
|
||||||
std::array<uint32_t, 2> dilation({0, 0});
|
std::array<uint32_t, 2> dilation({0, 0});
|
||||||
|
|
||||||
auto conv2d = graph->CreateOperation<tim::vx::ops::Conv2d>(
|
auto conv2d =
|
||||||
padding, stride, dilation);
|
graph->CreateOperation<tim::vx::ops::Conv2d>(padding, stride, dilation);
|
||||||
(*conv2d)
|
(*conv2d)
|
||||||
.BindInput(input_tensor)
|
.BindInput(input_tensor)
|
||||||
.BindInput(weight_tensor)
|
.BindInput(weight_tensor)
|
||||||
|
|
@ -361,8 +364,8 @@ TEST(Conv2d, shape_4_2_2_2_float32_SimpleChannelsTest) {
|
||||||
std::array<uint32_t, 2> stride({2, 2});
|
std::array<uint32_t, 2> stride({2, 2});
|
||||||
std::array<uint32_t, 2> dilation({0, 0});
|
std::array<uint32_t, 2> dilation({0, 0});
|
||||||
|
|
||||||
auto conv2d = graph->CreateOperation<tim::vx::ops::Conv2d>(
|
auto conv2d =
|
||||||
padding, stride, dilation);
|
graph->CreateOperation<tim::vx::ops::Conv2d>(padding, stride, dilation);
|
||||||
(*conv2d)
|
(*conv2d)
|
||||||
.BindInput(input_tensor)
|
.BindInput(input_tensor)
|
||||||
.BindInput(weight_tensor)
|
.BindInput(weight_tensor)
|
||||||
|
|
@ -432,8 +435,8 @@ TEST(Conv2d, shape_6_3_1_1_float32_SimpleAnisotropicStridesTest) {
|
||||||
std::array<uint32_t, 2> stride({3, 1});
|
std::array<uint32_t, 2> stride({3, 1});
|
||||||
std::array<uint32_t, 2> dilation({0, 0});
|
std::array<uint32_t, 2> dilation({0, 0});
|
||||||
|
|
||||||
auto conv2d = graph->CreateOperation<tim::vx::ops::Conv2d>(
|
auto conv2d =
|
||||||
padding, stride, dilation);
|
graph->CreateOperation<tim::vx::ops::Conv2d>(padding, stride, dilation);
|
||||||
(*conv2d)
|
(*conv2d)
|
||||||
.BindInput(input_tensor)
|
.BindInput(input_tensor)
|
||||||
.BindInput(weight_tensor)
|
.BindInput(weight_tensor)
|
||||||
|
|
@ -497,8 +500,8 @@ TEST(Conv2d, shape_4_3_1_1_float32_HandCalculatedTest) {
|
||||||
std::array<uint32_t, 2> stride({1, 1});
|
std::array<uint32_t, 2> stride({1, 1});
|
||||||
std::array<uint32_t, 2> dilation({0, 0});
|
std::array<uint32_t, 2> dilation({0, 0});
|
||||||
|
|
||||||
auto conv2d = graph->CreateOperation<tim::vx::ops::Conv2d>(
|
auto conv2d =
|
||||||
padding, stride, dilation);
|
graph->CreateOperation<tim::vx::ops::Conv2d>(padding, stride, dilation);
|
||||||
(*conv2d)
|
(*conv2d)
|
||||||
.BindInput(input_tensor)
|
.BindInput(input_tensor)
|
||||||
.BindInput(weight_tensor)
|
.BindInput(weight_tensor)
|
||||||
|
|
@ -562,8 +565,8 @@ TEST(Conv2d, shape_4_3_1_1_float32_HandCalculatedConstFilterTest) {
|
||||||
std::array<uint32_t, 2> stride({1, 1});
|
std::array<uint32_t, 2> stride({1, 1});
|
||||||
std::array<uint32_t, 2> dilation({0, 0});
|
std::array<uint32_t, 2> dilation({0, 0});
|
||||||
|
|
||||||
auto conv2d = graph->CreateOperation<tim::vx::ops::Conv2d>(
|
auto conv2d =
|
||||||
padding, stride, dilation);
|
graph->CreateOperation<tim::vx::ops::Conv2d>(padding, stride, dilation);
|
||||||
(*conv2d)
|
(*conv2d)
|
||||||
.BindInput(input_tensor)
|
.BindInput(input_tensor)
|
||||||
.BindInput(weight_tensor)
|
.BindInput(weight_tensor)
|
||||||
|
|
@ -627,8 +630,8 @@ TEST(Conv2d, shape_4_3_1_1_float32_HandCalculatedBiasTest) {
|
||||||
std::array<uint32_t, 2> stride({1, 1});
|
std::array<uint32_t, 2> stride({1, 1});
|
||||||
std::array<uint32_t, 2> dilation({0, 0});
|
std::array<uint32_t, 2> dilation({0, 0});
|
||||||
|
|
||||||
auto conv2d = graph->CreateOperation<tim::vx::ops::Conv2d>(
|
auto conv2d =
|
||||||
padding, stride, dilation);
|
graph->CreateOperation<tim::vx::ops::Conv2d>(padding, stride, dilation);
|
||||||
(*conv2d)
|
(*conv2d)
|
||||||
.BindInput(input_tensor)
|
.BindInput(input_tensor)
|
||||||
.BindInput(weight_tensor)
|
.BindInput(weight_tensor)
|
||||||
|
|
@ -691,8 +694,8 @@ TEST(Conv2d, shape_4_3_1_1_float32_HandCalculatedValidTest) {
|
||||||
std::array<uint32_t, 2> stride({1, 1});
|
std::array<uint32_t, 2> stride({1, 1});
|
||||||
std::array<uint32_t, 2> dilation({0, 0});
|
std::array<uint32_t, 2> dilation({0, 0});
|
||||||
|
|
||||||
auto conv2d = graph->CreateOperation<tim::vx::ops::Conv2d>(
|
auto conv2d =
|
||||||
padding, stride, dilation);
|
graph->CreateOperation<tim::vx::ops::Conv2d>(padding, stride, dilation);
|
||||||
(*conv2d)
|
(*conv2d)
|
||||||
.BindInput(input_tensor)
|
.BindInput(input_tensor)
|
||||||
.BindInput(weight_tensor)
|
.BindInput(weight_tensor)
|
||||||
|
|
@ -759,8 +762,8 @@ TEST(Conv2d, DISABLED_shape_4_2_2_2_float32_DisabledPointwiseMultifilterTest) {
|
||||||
std::array<uint32_t, 2> stride({1, 1});
|
std::array<uint32_t, 2> stride({1, 1});
|
||||||
std::array<uint32_t, 2> dilation({0, 0});
|
std::array<uint32_t, 2> dilation({0, 0});
|
||||||
|
|
||||||
auto conv2d = graph->CreateOperation<tim::vx::ops::Conv2d>(
|
auto conv2d =
|
||||||
padding, stride, dilation);
|
graph->CreateOperation<tim::vx::ops::Conv2d>(padding, stride, dilation);
|
||||||
(*conv2d)
|
(*conv2d)
|
||||||
.BindInput(input_tensor)
|
.BindInput(input_tensor)
|
||||||
.BindInput(weight_tensor)
|
.BindInput(weight_tensor)
|
||||||
|
|
@ -827,8 +830,8 @@ TEST(Conv2d, shape_9_9_1_1_float32_SimpleDilationTest) {
|
||||||
std::array<uint32_t, 2> stride({1, 1});
|
std::array<uint32_t, 2> stride({1, 1});
|
||||||
std::array<uint32_t, 2> dilation({3, 3});
|
std::array<uint32_t, 2> dilation({3, 3});
|
||||||
|
|
||||||
auto conv2d = graph->CreateOperation<tim::vx::ops::Conv2d>(
|
auto conv2d =
|
||||||
padding, stride, dilation);
|
graph->CreateOperation<tim::vx::ops::Conv2d>(padding, stride, dilation);
|
||||||
(*conv2d)
|
(*conv2d)
|
||||||
.BindInput(input_tensor)
|
.BindInput(input_tensor)
|
||||||
.BindInput(weight_tensor)
|
.BindInput(weight_tensor)
|
||||||
|
|
@ -893,8 +896,8 @@ TEST(Conv2d, shape_4_2_1_2_float32_StrideTest) {
|
||||||
std::array<uint32_t, 2> stride({1, 1});
|
std::array<uint32_t, 2> stride({1, 1});
|
||||||
std::array<uint32_t, 2> dilation({0, 0});
|
std::array<uint32_t, 2> dilation({0, 0});
|
||||||
|
|
||||||
auto conv2d = graph->CreateOperation<tim::vx::ops::Conv2d>(
|
auto conv2d =
|
||||||
padding, stride, dilation);
|
graph->CreateOperation<tim::vx::ops::Conv2d>(padding, stride, dilation);
|
||||||
(*conv2d)
|
(*conv2d)
|
||||||
.BindInput(input_tensor)
|
.BindInput(input_tensor)
|
||||||
.BindInput(weight_tensor)
|
.BindInput(weight_tensor)
|
||||||
|
|
@ -958,8 +961,8 @@ TEST(Conv2d, shape_4_2_1_2_float32_InputAndFilterSameWidthHeightTest) {
|
||||||
std::array<uint32_t, 2> stride({1, 1});
|
std::array<uint32_t, 2> stride({1, 1});
|
||||||
std::array<uint32_t, 2> dilation({0, 0});
|
std::array<uint32_t, 2> dilation({0, 0});
|
||||||
|
|
||||||
auto conv2d = graph->CreateOperation<tim::vx::ops::Conv2d>(
|
auto conv2d =
|
||||||
padding, stride, dilation);
|
graph->CreateOperation<tim::vx::ops::Conv2d>(padding, stride, dilation);
|
||||||
(*conv2d)
|
(*conv2d)
|
||||||
.BindInput(input_tensor)
|
.BindInput(input_tensor)
|
||||||
.BindInput(weight_tensor)
|
.BindInput(weight_tensor)
|
||||||
|
|
@ -1011,13 +1014,13 @@ TEST(Conv2d, shape_4_2_1_2_uint8_QuantizedTest1) {
|
||||||
std::vector<int32_t> zero_point_output = {scales_zp.second};
|
std::vector<int32_t> zero_point_output = {scales_zp.second};
|
||||||
|
|
||||||
tim::vx::Quantization quant_input(tim::vx::QuantType::ASYMMETRIC, 2,
|
tim::vx::Quantization quant_input(tim::vx::QuantType::ASYMMETRIC, 2,
|
||||||
scales_input, zero_point_input);
|
scales_input, zero_point_input);
|
||||||
tim::vx::Quantization quant_weight(tim::vx::QuantType::ASYMMETRIC, 2,
|
tim::vx::Quantization quant_weight(tim::vx::QuantType::ASYMMETRIC, 2,
|
||||||
scales_weight, zero_point_weight);
|
scales_weight, zero_point_weight);
|
||||||
tim::vx::Quantization quant_bias(tim::vx::QuantType::ASYMMETRIC, 2, scales_bias,
|
tim::vx::Quantization quant_bias(tim::vx::QuantType::ASYMMETRIC, 2,
|
||||||
zero_point_bias);
|
scales_bias, zero_point_bias);
|
||||||
tim::vx::Quantization quant_output(tim::vx::QuantType::ASYMMETRIC, 2,
|
tim::vx::Quantization quant_output(tim::vx::QuantType::ASYMMETRIC, 2,
|
||||||
scales_output, zero_point_output);
|
scales_output, zero_point_output);
|
||||||
|
|
||||||
tim::vx::TensorSpec input_spec(tim::vx::DataType::UINT8, input_shape,
|
tim::vx::TensorSpec input_spec(tim::vx::DataType::UINT8, input_shape,
|
||||||
tim::vx::TensorAttribute::INPUT, quant_input);
|
tim::vx::TensorAttribute::INPUT, quant_input);
|
||||||
|
|
@ -1047,8 +1050,8 @@ TEST(Conv2d, shape_4_2_1_2_uint8_QuantizedTest1) {
|
||||||
|
|
||||||
std::vector<u_int8_t> input_data =
|
std::vector<u_int8_t> input_data =
|
||||||
Quantize<uint8_t>(input_data_float, scales_input[0], zero_point_input[0]);
|
Quantize<uint8_t>(input_data_float, scales_input[0], zero_point_input[0]);
|
||||||
std::vector<u_int8_t> weight_data =
|
std::vector<u_int8_t> weight_data = Quantize<uint8_t>(
|
||||||
Quantize<uint8_t>(weight_data_float, scales_weight[0], zero_point_input[0]);
|
weight_data_float, scales_weight[0], zero_point_input[0]);
|
||||||
std::vector<int32_t> bias_data =
|
std::vector<int32_t> bias_data =
|
||||||
Quantize<int32_t>(bias_data_float, scales_bias[0], zero_point_bias[0]);
|
Quantize<int32_t>(bias_data_float, scales_bias[0], zero_point_bias[0]);
|
||||||
std::vector<u_int8_t> golden =
|
std::vector<u_int8_t> golden =
|
||||||
|
|
@ -1062,8 +1065,8 @@ TEST(Conv2d, shape_4_2_1_2_uint8_QuantizedTest1) {
|
||||||
std::array<uint32_t, 2> stride({2, 2});
|
std::array<uint32_t, 2> stride({2, 2});
|
||||||
std::array<uint32_t, 2> dilation({1, 1});
|
std::array<uint32_t, 2> dilation({1, 1});
|
||||||
|
|
||||||
auto conv2d = graph->CreateOperation<tim::vx::ops::Conv2d>(
|
auto conv2d =
|
||||||
padding, stride, dilation);
|
graph->CreateOperation<tim::vx::ops::Conv2d>(padding, stride, dilation);
|
||||||
(*conv2d)
|
(*conv2d)
|
||||||
.BindInput(input_tensor)
|
.BindInput(input_tensor)
|
||||||
.BindInput(weight_tensor)
|
.BindInput(weight_tensor)
|
||||||
|
|
@ -1094,8 +1097,8 @@ TEST(Conv2d, shape_4_2_1_2_uint8_QuantizedTest2) {
|
||||||
tim::vx::ShapeType output_shape(
|
tim::vx::ShapeType output_shape(
|
||||||
{2, 1, weight_shape[3], input_shape[3]}); //whcn
|
{2, 1, weight_shape[3], input_shape[3]}); //whcn
|
||||||
|
|
||||||
float input_min = -128.5, input_max = 128, weight_min = -128.5, weight_max = 128,
|
float input_min = -128.5, input_max = 128, weight_min = -128.5,
|
||||||
output_min = -127, output_max = 128;
|
weight_max = 128, output_min = -127, output_max = 128;
|
||||||
|
|
||||||
std::pair<float, int32_t> scales_zp;
|
std::pair<float, int32_t> scales_zp;
|
||||||
|
|
||||||
|
|
@ -1115,13 +1118,13 @@ TEST(Conv2d, shape_4_2_1_2_uint8_QuantizedTest2) {
|
||||||
std::vector<int32_t> zero_point_output = {scales_zp.second};
|
std::vector<int32_t> zero_point_output = {scales_zp.second};
|
||||||
|
|
||||||
tim::vx::Quantization quant_input(tim::vx::QuantType::ASYMMETRIC, 2,
|
tim::vx::Quantization quant_input(tim::vx::QuantType::ASYMMETRIC, 2,
|
||||||
scales_input, zero_point_input);
|
scales_input, zero_point_input);
|
||||||
tim::vx::Quantization quant_weight(tim::vx::QuantType::ASYMMETRIC, 2,
|
tim::vx::Quantization quant_weight(tim::vx::QuantType::ASYMMETRIC, 2,
|
||||||
scales_weight, zero_point_weight);
|
scales_weight, zero_point_weight);
|
||||||
tim::vx::Quantization quant_bias(tim::vx::QuantType::ASYMMETRIC, 2, scales_bias,
|
tim::vx::Quantization quant_bias(tim::vx::QuantType::ASYMMETRIC, 2,
|
||||||
zero_point_bias);
|
scales_bias, zero_point_bias);
|
||||||
tim::vx::Quantization quant_output(tim::vx::QuantType::ASYMMETRIC, 2,
|
tim::vx::Quantization quant_output(tim::vx::QuantType::ASYMMETRIC, 2,
|
||||||
scales_output, zero_point_output);
|
scales_output, zero_point_output);
|
||||||
|
|
||||||
tim::vx::TensorSpec input_spec(tim::vx::DataType::UINT8, input_shape,
|
tim::vx::TensorSpec input_spec(tim::vx::DataType::UINT8, input_shape,
|
||||||
tim::vx::TensorAttribute::INPUT, quant_input);
|
tim::vx::TensorAttribute::INPUT, quant_input);
|
||||||
|
|
@ -1151,8 +1154,8 @@ TEST(Conv2d, shape_4_2_1_2_uint8_QuantizedTest2) {
|
||||||
|
|
||||||
std::vector<u_int8_t> input_data =
|
std::vector<u_int8_t> input_data =
|
||||||
Quantize<uint8_t>(input_data_float, scales_input[0], zero_point_input[0]);
|
Quantize<uint8_t>(input_data_float, scales_input[0], zero_point_input[0]);
|
||||||
std::vector<u_int8_t> weight_data =
|
std::vector<u_int8_t> weight_data = Quantize<uint8_t>(
|
||||||
Quantize<uint8_t>(weight_data_float, scales_weight[0], zero_point_input[0]);
|
weight_data_float, scales_weight[0], zero_point_input[0]);
|
||||||
std::vector<int32_t> bias_data =
|
std::vector<int32_t> bias_data =
|
||||||
Quantize<int32_t>(bias_data_float, scales_bias[0], zero_point_bias[0]);
|
Quantize<int32_t>(bias_data_float, scales_bias[0], zero_point_bias[0]);
|
||||||
std::vector<u_int8_t> golden =
|
std::vector<u_int8_t> golden =
|
||||||
|
|
@ -1167,8 +1170,8 @@ TEST(Conv2d, shape_4_2_1_2_uint8_QuantizedTest2) {
|
||||||
std::array<uint32_t, 2> stride({2, 2});
|
std::array<uint32_t, 2> stride({2, 2});
|
||||||
std::array<uint32_t, 2> dilation({1, 1});
|
std::array<uint32_t, 2> dilation({1, 1});
|
||||||
|
|
||||||
auto conv2d = graph->CreateOperation<tim::vx::ops::Conv2d>(
|
auto conv2d =
|
||||||
padding, stride, dilation);
|
graph->CreateOperation<tim::vx::ops::Conv2d>(padding, stride, dilation);
|
||||||
(*conv2d)
|
(*conv2d)
|
||||||
.BindInput(input_tensor)
|
.BindInput(input_tensor)
|
||||||
.BindInput(weight_tensor)
|
.BindInput(weight_tensor)
|
||||||
|
|
@ -1220,13 +1223,13 @@ TEST(Conv2d, shape_6_3_1_1_uint8_AnisotropicStridesQuantizedTest) {
|
||||||
std::vector<int32_t> zero_point_output = {scales_zp.second};
|
std::vector<int32_t> zero_point_output = {scales_zp.second};
|
||||||
|
|
||||||
tim::vx::Quantization quant_input(tim::vx::QuantType::ASYMMETRIC, 2,
|
tim::vx::Quantization quant_input(tim::vx::QuantType::ASYMMETRIC, 2,
|
||||||
scales_input, zero_point_input);
|
scales_input, zero_point_input);
|
||||||
tim::vx::Quantization quant_weight(tim::vx::QuantType::ASYMMETRIC, 2,
|
tim::vx::Quantization quant_weight(tim::vx::QuantType::ASYMMETRIC, 2,
|
||||||
scales_weight, zero_point_weight);
|
scales_weight, zero_point_weight);
|
||||||
tim::vx::Quantization quant_bias(tim::vx::QuantType::ASYMMETRIC, 2, scales_bias,
|
tim::vx::Quantization quant_bias(tim::vx::QuantType::ASYMMETRIC, 2,
|
||||||
zero_point_bias);
|
scales_bias, zero_point_bias);
|
||||||
tim::vx::Quantization quant_output(tim::vx::QuantType::ASYMMETRIC, 2,
|
tim::vx::Quantization quant_output(tim::vx::QuantType::ASYMMETRIC, 2,
|
||||||
scales_output, zero_point_output);
|
scales_output, zero_point_output);
|
||||||
|
|
||||||
tim::vx::TensorSpec input_spec(tim::vx::DataType::UINT8, input_shape,
|
tim::vx::TensorSpec input_spec(tim::vx::DataType::UINT8, input_shape,
|
||||||
tim::vx::TensorAttribute::INPUT, quant_input);
|
tim::vx::TensorAttribute::INPUT, quant_input);
|
||||||
|
|
@ -1255,8 +1258,8 @@ TEST(Conv2d, shape_6_3_1_1_uint8_AnisotropicStridesQuantizedTest) {
|
||||||
|
|
||||||
std::vector<u_int8_t> input_data =
|
std::vector<u_int8_t> input_data =
|
||||||
Quantize<uint8_t>(input_data_float, scales_input[0], zero_point_input[0]);
|
Quantize<uint8_t>(input_data_float, scales_input[0], zero_point_input[0]);
|
||||||
std::vector<u_int8_t> weight_data =
|
std::vector<u_int8_t> weight_data = Quantize<uint8_t>(
|
||||||
Quantize<uint8_t>(weight_data_float, scales_weight[0], zero_point_input[0]);
|
weight_data_float, scales_weight[0], zero_point_input[0]);
|
||||||
std::vector<int32_t> bias_data =
|
std::vector<int32_t> bias_data =
|
||||||
Quantize<int32_t>(bias_data_float, scales_bias[0], zero_point_bias[0]);
|
Quantize<int32_t>(bias_data_float, scales_bias[0], zero_point_bias[0]);
|
||||||
std::vector<u_int8_t> golden =
|
std::vector<u_int8_t> golden =
|
||||||
|
|
@ -1271,8 +1274,8 @@ TEST(Conv2d, shape_6_3_1_1_uint8_AnisotropicStridesQuantizedTest) {
|
||||||
std::array<uint32_t, 2> stride({3, 1});
|
std::array<uint32_t, 2> stride({3, 1});
|
||||||
std::array<uint32_t, 2> dilation({1, 1});
|
std::array<uint32_t, 2> dilation({1, 1});
|
||||||
|
|
||||||
auto conv2d = graph->CreateOperation<tim::vx::ops::Conv2d>(
|
auto conv2d =
|
||||||
padding, stride, dilation);
|
graph->CreateOperation<tim::vx::ops::Conv2d>(padding, stride, dilation);
|
||||||
(*conv2d)
|
(*conv2d)
|
||||||
.BindInput(input_tensor)
|
.BindInput(input_tensor)
|
||||||
.BindInput(weight_tensor)
|
.BindInput(weight_tensor)
|
||||||
|
|
@ -1324,13 +1327,13 @@ TEST(Conv2d, shape_9_9_1_1_uint8_DilationQuantizedTest) {
|
||||||
std::vector<int32_t> zero_point_output = {scales_zp.second};
|
std::vector<int32_t> zero_point_output = {scales_zp.second};
|
||||||
|
|
||||||
tim::vx::Quantization quant_input(tim::vx::QuantType::ASYMMETRIC, 2,
|
tim::vx::Quantization quant_input(tim::vx::QuantType::ASYMMETRIC, 2,
|
||||||
scales_input, zero_point_input);
|
scales_input, zero_point_input);
|
||||||
tim::vx::Quantization quant_weight(tim::vx::QuantType::ASYMMETRIC, 2,
|
tim::vx::Quantization quant_weight(tim::vx::QuantType::ASYMMETRIC, 2,
|
||||||
scales_weight, zero_point_weight);
|
scales_weight, zero_point_weight);
|
||||||
tim::vx::Quantization quant_bias(tim::vx::QuantType::ASYMMETRIC, 2, scales_bias,
|
tim::vx::Quantization quant_bias(tim::vx::QuantType::ASYMMETRIC, 2,
|
||||||
zero_point_bias);
|
scales_bias, zero_point_bias);
|
||||||
tim::vx::Quantization quant_output(tim::vx::QuantType::ASYMMETRIC, 2,
|
tim::vx::Quantization quant_output(tim::vx::QuantType::ASYMMETRIC, 2,
|
||||||
scales_output, zero_point_output);
|
scales_output, zero_point_output);
|
||||||
|
|
||||||
tim::vx::TensorSpec input_spec(tim::vx::DataType::UINT8, input_shape,
|
tim::vx::TensorSpec input_spec(tim::vx::DataType::UINT8, input_shape,
|
||||||
tim::vx::TensorAttribute::INPUT, quant_input);
|
tim::vx::TensorAttribute::INPUT, quant_input);
|
||||||
|
|
@ -1362,8 +1365,8 @@ TEST(Conv2d, shape_9_9_1_1_uint8_DilationQuantizedTest) {
|
||||||
|
|
||||||
std::vector<u_int8_t> input_data =
|
std::vector<u_int8_t> input_data =
|
||||||
Quantize<uint8_t>(input_data_float, scales_input[0], zero_point_input[0]);
|
Quantize<uint8_t>(input_data_float, scales_input[0], zero_point_input[0]);
|
||||||
std::vector<u_int8_t> weight_data =
|
std::vector<u_int8_t> weight_data = Quantize<uint8_t>(
|
||||||
Quantize<uint8_t>(weight_data_float, scales_weight[0], zero_point_input[0]);
|
weight_data_float, scales_weight[0], zero_point_input[0]);
|
||||||
std::vector<int32_t> bias_data =
|
std::vector<int32_t> bias_data =
|
||||||
Quantize<int32_t>(bias_data_float, scales_bias[0], zero_point_bias[0]);
|
Quantize<int32_t>(bias_data_float, scales_bias[0], zero_point_bias[0]);
|
||||||
std::vector<u_int8_t> golden =
|
std::vector<u_int8_t> golden =
|
||||||
|
|
@ -1378,8 +1381,8 @@ TEST(Conv2d, shape_9_9_1_1_uint8_DilationQuantizedTest) {
|
||||||
std::array<uint32_t, 2> stride({1, 1});
|
std::array<uint32_t, 2> stride({1, 1});
|
||||||
std::array<uint32_t, 2> dilation({3, 3});
|
std::array<uint32_t, 2> dilation({3, 3});
|
||||||
|
|
||||||
auto conv2d = graph->CreateOperation<tim::vx::ops::Conv2d>(
|
auto conv2d =
|
||||||
padding, stride, dilation);
|
graph->CreateOperation<tim::vx::ops::Conv2d>(padding, stride, dilation);
|
||||||
(*conv2d)
|
(*conv2d)
|
||||||
.BindInput(input_tensor)
|
.BindInput(input_tensor)
|
||||||
.BindInput(weight_tensor)
|
.BindInput(weight_tensor)
|
||||||
|
|
@ -1431,13 +1434,13 @@ TEST(Conv2d, shape_3_2_2_1_int8_QuantizedPerTensorTest) {
|
||||||
std::vector<int32_t> zero_point_output = {scales_zp.second};
|
std::vector<int32_t> zero_point_output = {scales_zp.second};
|
||||||
|
|
||||||
tim::vx::Quantization quant_input(tim::vx::QuantType::ASYMMETRIC, 2,
|
tim::vx::Quantization quant_input(tim::vx::QuantType::ASYMMETRIC, 2,
|
||||||
scales_input, zero_point_input);
|
scales_input, zero_point_input);
|
||||||
tim::vx::Quantization quant_weight(tim::vx::QuantType::ASYMMETRIC, 2,
|
tim::vx::Quantization quant_weight(tim::vx::QuantType::ASYMMETRIC, 2,
|
||||||
scales_weight, zero_point_weight);
|
scales_weight, zero_point_weight);
|
||||||
tim::vx::Quantization quant_bias(tim::vx::QuantType::ASYMMETRIC, 2, scales_bias,
|
tim::vx::Quantization quant_bias(tim::vx::QuantType::ASYMMETRIC, 2,
|
||||||
zero_point_bias);
|
scales_bias, zero_point_bias);
|
||||||
tim::vx::Quantization quant_output(tim::vx::QuantType::ASYMMETRIC, 2,
|
tim::vx::Quantization quant_output(tim::vx::QuantType::ASYMMETRIC, 2,
|
||||||
scales_output, zero_point_output);
|
scales_output, zero_point_output);
|
||||||
|
|
||||||
tim::vx::TensorSpec input_spec(tim::vx::DataType::INT8, input_shape,
|
tim::vx::TensorSpec input_spec(tim::vx::DataType::INT8, input_shape,
|
||||||
tim::vx::TensorAttribute::INPUT, quant_input);
|
tim::vx::TensorAttribute::INPUT, quant_input);
|
||||||
|
|
@ -1481,8 +1484,8 @@ TEST(Conv2d, shape_3_2_2_1_int8_QuantizedPerTensorTest) {
|
||||||
std::array<uint32_t, 2> stride({1, 1});
|
std::array<uint32_t, 2> stride({1, 1});
|
||||||
std::array<uint32_t, 2> dilation({1, 1});
|
std::array<uint32_t, 2> dilation({1, 1});
|
||||||
|
|
||||||
auto conv2d = graph->CreateOperation<tim::vx::ops::Conv2d>(
|
auto conv2d =
|
||||||
padding, stride, dilation);
|
graph->CreateOperation<tim::vx::ops::Conv2d>(padding, stride, dilation);
|
||||||
(*conv2d)
|
(*conv2d)
|
||||||
.BindInput(input_tensor)
|
.BindInput(input_tensor)
|
||||||
.BindInput(weight_tensor)
|
.BindInput(weight_tensor)
|
||||||
|
|
@ -1527,7 +1530,7 @@ TEST(Conv2d, shape_3_2_2_1_int8_QuantizedPerChannelTest) {
|
||||||
std::vector<int32_t> zero_point_weight = {0, 0};
|
std::vector<int32_t> zero_point_weight = {0, 0};
|
||||||
|
|
||||||
std::vector<float> scales_bias = {scales_input[0] * scales_weight[0],
|
std::vector<float> scales_bias = {scales_input[0] * scales_weight[0],
|
||||||
scales_input[0] * scales_weight[1]};
|
scales_input[0] * scales_weight[1]};
|
||||||
std::vector<int32_t> zero_point_bias = {0, 0};
|
std::vector<int32_t> zero_point_bias = {0, 0};
|
||||||
|
|
||||||
scales_zp = QuantizationParams<int8_t>(output_min, output_max);
|
scales_zp = QuantizationParams<int8_t>(output_min, output_max);
|
||||||
|
|
@ -1535,13 +1538,13 @@ TEST(Conv2d, shape_3_2_2_1_int8_QuantizedPerChannelTest) {
|
||||||
std::vector<int32_t> zero_point_output = {scales_zp.second};
|
std::vector<int32_t> zero_point_output = {scales_zp.second};
|
||||||
|
|
||||||
tim::vx::Quantization quant_input(tim::vx::QuantType::ASYMMETRIC, 2,
|
tim::vx::Quantization quant_input(tim::vx::QuantType::ASYMMETRIC, 2,
|
||||||
scales_input, zero_point_input);
|
scales_input, zero_point_input);
|
||||||
tim::vx::Quantization quant_weight(tim::vx::QuantType::SYMMETRIC_PER_CHANNEL,
|
tim::vx::Quantization quant_weight(tim::vx::QuantType::SYMMETRIC_PER_CHANNEL,
|
||||||
3, scales_weight, zero_point_weight);
|
3, scales_weight, zero_point_weight);
|
||||||
tim::vx::Quantization quant_bias(tim::vx::QuantType::SYMMETRIC_PER_CHANNEL, 0,
|
tim::vx::Quantization quant_bias(tim::vx::QuantType::SYMMETRIC_PER_CHANNEL, 0,
|
||||||
scales_bias, zero_point_bias);
|
scales_bias, zero_point_bias);
|
||||||
tim::vx::Quantization quant_output(tim::vx::QuantType::ASYMMETRIC, 2,
|
tim::vx::Quantization quant_output(tim::vx::QuantType::ASYMMETRIC, 2,
|
||||||
scales_output, zero_point_output);
|
scales_output, zero_point_output);
|
||||||
|
|
||||||
tim::vx::TensorSpec input_spec(tim::vx::DataType::INT8, input_shape,
|
tim::vx::TensorSpec input_spec(tim::vx::DataType::INT8, input_shape,
|
||||||
tim::vx::TensorAttribute::INPUT, quant_input);
|
tim::vx::TensorAttribute::INPUT, quant_input);
|
||||||
|
|
@ -1583,8 +1586,8 @@ TEST(Conv2d, shape_3_2_2_1_int8_QuantizedPerChannelTest) {
|
||||||
std::array<uint32_t, 2> stride({1, 1});
|
std::array<uint32_t, 2> stride({1, 1});
|
||||||
std::array<uint32_t, 2> dilation({1, 1});
|
std::array<uint32_t, 2> dilation({1, 1});
|
||||||
|
|
||||||
auto conv2d = graph->CreateOperation<tim::vx::ops::Conv2d>(
|
auto conv2d =
|
||||||
padding, stride, dilation);
|
graph->CreateOperation<tim::vx::ops::Conv2d>(padding, stride, dilation);
|
||||||
(*conv2d)
|
(*conv2d)
|
||||||
.BindInput(input_tensor)
|
.BindInput(input_tensor)
|
||||||
.BindInput(weight_tensor)
|
.BindInput(weight_tensor)
|
||||||
|
|
@ -1609,12 +1612,12 @@ TEST(Conv2d, shape_3_2_2_1_int8_QuantizedPerChannelTest) {
|
||||||
TEST(Conv2d, shape_w_h_128_1_ksize_1_1_stride_2_int8_QuantizedPerChannelTest) {
|
TEST(Conv2d, shape_w_h_128_1_ksize_1_1_stride_2_int8_QuantizedPerChannelTest) {
|
||||||
std::map<uint32_t, std::vector<uint32_t>> input_shape_list;
|
std::map<uint32_t, std::vector<uint32_t>> input_shape_list;
|
||||||
input_shape_list[32] = {18, 20, 22, 26, 28, 30, 34, 36, 38,
|
input_shape_list[32] = {18, 20, 22, 26, 28, 30, 34, 36, 38,
|
||||||
42, 44, 46, 50, 52, 54, 58, 60, 62};
|
42, 44, 46, 50, 52, 54, 58, 60, 62};
|
||||||
input_shape_list[63] = {18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62};
|
input_shape_list[63] = {18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62};
|
||||||
input_shape_list[95] = {18, 20, 22, 26, 28, 30, 34, 36, 38,
|
input_shape_list[95] = {18, 20, 22, 26, 28, 30, 34, 36, 38,
|
||||||
42, 44, 46, 50, 52, 54, 58, 60, 62};
|
42, 44, 46, 50, 52, 54, 58, 60, 62};
|
||||||
input_shape_list[96] = {18, 20, 22, 26, 28, 30, 34, 36, 38,
|
input_shape_list[96] = {18, 20, 22, 26, 28, 30, 34, 36, 38,
|
||||||
42, 44, 46, 50, 52, 54, 58, 60, 62};
|
42, 44, 46, 50, 52, 54, 58, 60, 62};
|
||||||
tim::vx::ShapeType input_shape({2, 2, 128, 1}); //whcn
|
tim::vx::ShapeType input_shape({2, 2, 128, 1}); //whcn
|
||||||
tim::vx::ShapeType weight_shape({1, 1, 128, 256}); //whio
|
tim::vx::ShapeType weight_shape({1, 1, 128, 256}); //whio
|
||||||
tim::vx::ShapeType bias_shape({weight_shape[3]});
|
tim::vx::ShapeType bias_shape({weight_shape[3]});
|
||||||
|
|
@ -1642,13 +1645,13 @@ TEST(Conv2d, shape_w_h_128_1_ksize_1_1_stride_2_int8_QuantizedPerChannelTest) {
|
||||||
std::vector<int32_t> zero_point_output = {-1};
|
std::vector<int32_t> zero_point_output = {-1};
|
||||||
|
|
||||||
tim::vx::Quantization quant_input(tim::vx::QuantType::ASYMMETRIC, 2,
|
tim::vx::Quantization quant_input(tim::vx::QuantType::ASYMMETRIC, 2,
|
||||||
scales_input, zero_point_input);
|
scales_input, zero_point_input);
|
||||||
tim::vx::Quantization quant_weight(tim::vx::QuantType::SYMMETRIC_PER_CHANNEL,
|
tim::vx::Quantization quant_weight(tim::vx::QuantType::SYMMETRIC_PER_CHANNEL,
|
||||||
3, scales_weight, zero_point_weight);
|
3, scales_weight, zero_point_weight);
|
||||||
tim::vx::Quantization quant_bias(tim::vx::QuantType::SYMMETRIC_PER_CHANNEL, 0,
|
tim::vx::Quantization quant_bias(tim::vx::QuantType::SYMMETRIC_PER_CHANNEL, 0,
|
||||||
scales_bias, zero_point_bias);
|
scales_bias, zero_point_bias);
|
||||||
tim::vx::Quantization quant_output(tim::vx::QuantType::ASYMMETRIC, 2,
|
tim::vx::Quantization quant_output(tim::vx::QuantType::ASYMMETRIC, 2,
|
||||||
scales_output, zero_point_output);
|
scales_output, zero_point_output);
|
||||||
|
|
||||||
uint32_t weight_size =
|
uint32_t weight_size =
|
||||||
weight_shape[0] * weight_shape[1] * weight_shape[2] * weight_shape[3];
|
weight_shape[0] * weight_shape[1] * weight_shape[2] * weight_shape[3];
|
||||||
|
|
@ -1699,8 +1702,8 @@ TEST(Conv2d, shape_w_h_128_1_ksize_1_1_stride_2_int8_QuantizedPerChannelTest) {
|
||||||
for (uint32_t i = 0; i < golden_size; i++) {
|
for (uint32_t i = 0; i < golden_size; i++) {
|
||||||
golden_float[i] = 129;
|
golden_float[i] = 129;
|
||||||
}
|
}
|
||||||
std::vector<int8_t> golden =
|
std::vector<int8_t> golden = Quantize<int8_t>(
|
||||||
Quantize<int8_t>(golden_float, scales_output[0], zero_point_output[0]);
|
golden_float, scales_output[0], zero_point_output[0]);
|
||||||
|
|
||||||
auto ctx = tim::vx::Context::Create();
|
auto ctx = tim::vx::Context::Create();
|
||||||
auto graph = ctx->CreateGraph();
|
auto graph = ctx->CreateGraph();
|
||||||
|
|
@ -1738,28 +1741,30 @@ TEST(Conv2d, shape_w_h_128_1_ksize_1_1_stride_2_int8_QuantizedPerChannelTest) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(Conv2d, shape_4_2_2_2_int16_DFPQuantizedTest){
|
TEST(Conv2d, shape_4_2_2_2_int16_DFPQuantizedTest) {
|
||||||
auto ctx = tim::vx::Context::Create();
|
auto ctx = tim::vx::Context::Create();
|
||||||
if(ctx->isClOnly()) GTEST_SKIP();
|
if (ctx->isClOnly()) GTEST_SKIP();
|
||||||
auto graph = ctx->CreateGraph();
|
auto graph = ctx->CreateGraph();
|
||||||
tim::vx::ShapeType input_shape({4, 2, 2, 2}); //whcn
|
tim::vx::ShapeType input_shape({4, 2, 2, 2}); //whcn
|
||||||
tim::vx::ShapeType weight_shape({1, 1, 2, 1}); //whio
|
tim::vx::ShapeType weight_shape({1, 1, 2, 1}); //whio
|
||||||
tim::vx::ShapeType bias_shape({weight_shape[3]});
|
tim::vx::ShapeType bias_shape({weight_shape[3]});
|
||||||
tim::vx::ShapeType output_shape(
|
tim::vx::ShapeType output_shape(
|
||||||
{4, 2, weight_shape[3], input_shape[3]}); //whcn
|
{4, 2, weight_shape[3], input_shape[3]}); //whcn
|
||||||
int8_t fl_input = 9, fl_weight= 8, fl_output = 8;
|
int8_t fl_input = 9, fl_weight = 8, fl_output = 8;
|
||||||
tim::vx::Quantization quant_input(tim::vx::QuantType::DYNAMIC_FIXED_POINT, fl_input);
|
tim::vx::Quantization quant_input(tim::vx::QuantType::DYNAMIC_FIXED_POINT,
|
||||||
tim::vx::Quantization quant_weight(tim::vx::QuantType::DYNAMIC_FIXED_POINT, fl_weight);
|
fl_input);
|
||||||
tim::vx::Quantization quant_output(tim::vx::QuantType::DYNAMIC_FIXED_POINT, fl_output);
|
tim::vx::Quantization quant_weight(tim::vx::QuantType::DYNAMIC_FIXED_POINT,
|
||||||
|
fl_weight);
|
||||||
|
tim::vx::Quantization quant_output(tim::vx::QuantType::DYNAMIC_FIXED_POINT,
|
||||||
|
fl_output);
|
||||||
tim::vx::TensorSpec input_spec(tim::vx::DataType::INT16, input_shape,
|
tim::vx::TensorSpec input_spec(tim::vx::DataType::INT16, input_shape,
|
||||||
tim::vx::TensorAttribute::INPUT,
|
tim::vx::TensorAttribute::INPUT, quant_input);
|
||||||
quant_input);
|
|
||||||
tim::vx::TensorSpec weight_spec(tim::vx::DataType::INT16, weight_shape,
|
tim::vx::TensorSpec weight_spec(tim::vx::DataType::INT16, weight_shape,
|
||||||
tim::vx::TensorAttribute::CONSTANT,
|
tim::vx::TensorAttribute::CONSTANT,
|
||||||
quant_weight);
|
quant_weight);
|
||||||
tim::vx::TensorSpec output_spec(tim::vx::DataType::INT16, output_shape,
|
tim::vx::TensorSpec output_spec(tim::vx::DataType::INT16, output_shape,
|
||||||
tim::vx::TensorAttribute::OUTPUT,
|
tim::vx::TensorAttribute::OUTPUT,
|
||||||
quant_output);
|
quant_output);
|
||||||
|
|
||||||
// Input data float
|
// Input data float
|
||||||
std::vector<float> input_data_float = {
|
std::vector<float> input_data_float = {
|
||||||
|
|
@ -1767,25 +1772,23 @@ TEST(Conv2d, shape_4_2_2_2_int16_DFPQuantizedTest){
|
||||||
0.5, 1, 1.5, 2, 0.5, 1, 1.5, 2, 0.5, 1, 1.5, 2, 0.5, 1, 1.5, 2};
|
0.5, 1, 1.5, 2, 0.5, 1, 1.5, 2, 0.5, 1, 1.5, 2, 0.5, 1, 1.5, 2};
|
||||||
|
|
||||||
// weight data float
|
// weight data float
|
||||||
std::vector<float> weight_data_float= {
|
std::vector<float> weight_data_float = {
|
||||||
1, 2 // first filter
|
1, 2 // first filter
|
||||||
};
|
};
|
||||||
//input data(dfp16)
|
//input data(dfp16)
|
||||||
std::vector<int16_t> input_data = {
|
std::vector<int16_t> input_data = {256, 256, 256, 256, 512, 512, 512, 512,
|
||||||
256,256,256,256, 512,512,512,512, 256,256,256,256,512,512,512,512,
|
256, 256, 256, 256, 512, 512, 512, 512,
|
||||||
256,512,768,1024,256,512,768,1024,256,512,768,1024,256,512,768,1024
|
256, 512, 768, 1024, 256, 512, 768, 1024,
|
||||||
};
|
256, 512, 768, 1024, 256, 512, 768, 1024};
|
||||||
//weight data(dfp16)
|
//weight data(dfp16)
|
||||||
std::vector<int16_t> weight_data = {
|
std::vector<int16_t> weight_data = {256, 512};
|
||||||
256,512
|
|
||||||
};
|
|
||||||
// bias data
|
// bias data
|
||||||
std::vector<int64_t> bias_data = {0};
|
std::vector<int64_t> bias_data = {0};
|
||||||
//golden
|
//golden
|
||||||
std::vector<float> golden = {1.5, 1.5, 1.5, 1.5, 3, 3, 3, 3,
|
std::vector<float> golden = {1.5, 1.5, 1.5, 1.5, 3, 3, 3, 3,
|
||||||
1.5, 3, 4.5, 6, 1.5, 3, 4.5, 6};
|
1.5, 3, 4.5, 6, 1.5, 3, 4.5, 6};
|
||||||
|
|
||||||
auto input_tensor = graph->CreateTensor(input_spec,input_data.data());
|
auto input_tensor = graph->CreateTensor(input_spec, input_data.data());
|
||||||
auto weight_tensor = graph->CreateTensor(weight_spec, weight_data.data());
|
auto weight_tensor = graph->CreateTensor(weight_spec, weight_data.data());
|
||||||
auto output_tensor = graph->CreateTensor(output_spec);
|
auto output_tensor = graph->CreateTensor(output_spec);
|
||||||
|
|
||||||
|
|
@ -1793,8 +1796,8 @@ TEST(Conv2d, shape_4_2_2_2_int16_DFPQuantizedTest){
|
||||||
std::array<uint32_t, 2> stride({1, 1});
|
std::array<uint32_t, 2> stride({1, 1});
|
||||||
std::array<uint32_t, 2> dilation({0, 0});
|
std::array<uint32_t, 2> dilation({0, 0});
|
||||||
|
|
||||||
auto conv2d = graph->CreateOperation<tim::vx::ops::Conv2d>(
|
auto conv2d =
|
||||||
padding, stride, dilation);
|
graph->CreateOperation<tim::vx::ops::Conv2d>(padding, stride, dilation);
|
||||||
(*conv2d)
|
(*conv2d)
|
||||||
.BindInput(input_tensor)
|
.BindInput(input_tensor)
|
||||||
.BindInput(weight_tensor)
|
.BindInput(weight_tensor)
|
||||||
|
|
@ -1812,16 +1815,16 @@ TEST(Conv2d, shape_4_2_2_2_int16_DFPQuantizedTest){
|
||||||
}
|
}
|
||||||
std::vector<int16_t> output(output_size);
|
std::vector<int16_t> output(output_size);
|
||||||
EXPECT_TRUE(output_tensor->CopyDataFromTensor(output.data()));
|
EXPECT_TRUE(output_tensor->CopyDataFromTensor(output.data()));
|
||||||
//transform output(int16) to fp
|
//transform output(int16) to fp
|
||||||
std::vector<float> f;
|
std::vector<float> f;
|
||||||
for(const auto& q : output){
|
for (const auto& q : output) {
|
||||||
f.push_back( q / (float)((int64_t)1 << fl_output));
|
f.push_back(q / (float)((int64_t)1 << fl_output));
|
||||||
}
|
}
|
||||||
EXPECT_EQ(golden, f);
|
EXPECT_EQ(golden, f);
|
||||||
}
|
}
|
||||||
TEST(Conv2d, shape_4_2_1_1_int16_DFPQuantizedTest) {
|
TEST(Conv2d, shape_4_2_1_1_int16_DFPQuantizedTest) {
|
||||||
auto ctx = tim::vx::Context::Create();
|
auto ctx = tim::vx::Context::Create();
|
||||||
if(ctx->isClOnly()) GTEST_SKIP();
|
if (ctx->isClOnly()) GTEST_SKIP();
|
||||||
auto graph = ctx->CreateGraph();
|
auto graph = ctx->CreateGraph();
|
||||||
|
|
||||||
tim::vx::ShapeType input_shape({4, 2, 1, 1}); //whcn
|
tim::vx::ShapeType input_shape({4, 2, 1, 1}); //whcn
|
||||||
|
|
@ -1829,32 +1832,34 @@ TEST(Conv2d, shape_4_2_1_1_int16_DFPQuantizedTest) {
|
||||||
tim::vx::ShapeType bias_shape({weight_shape[3]});
|
tim::vx::ShapeType bias_shape({weight_shape[3]});
|
||||||
tim::vx::ShapeType output_shape(
|
tim::vx::ShapeType output_shape(
|
||||||
{4, 2, weight_shape[3], input_shape[3]}); //whcn
|
{4, 2, weight_shape[3], input_shape[3]}); //whcn
|
||||||
int8_t fl_input = 9, fl_weight = 8, fl_bias = 17,fl_output = 8;
|
int8_t fl_input = 9, fl_weight = 8, fl_bias = 17, fl_output = 8;
|
||||||
|
|
||||||
tim::vx::Quantization quant_input(tim::vx::QuantType::DYNAMIC_FIXED_POINT, fl_input);
|
tim::vx::Quantization quant_input(tim::vx::QuantType::DYNAMIC_FIXED_POINT,
|
||||||
tim::vx::Quantization quant_weight(tim::vx::QuantType::DYNAMIC_FIXED_POINT, fl_weight);
|
fl_input);
|
||||||
tim::vx::Quantization quant_bias(tim::vx::QuantType::DYNAMIC_FIXED_POINT, fl_bias);
|
tim::vx::Quantization quant_weight(tim::vx::QuantType::DYNAMIC_FIXED_POINT,
|
||||||
tim::vx::Quantization quant_output(tim::vx::QuantType::DYNAMIC_FIXED_POINT, fl_output);
|
fl_weight);
|
||||||
|
tim::vx::Quantization quant_bias(tim::vx::QuantType::DYNAMIC_FIXED_POINT,
|
||||||
|
fl_bias);
|
||||||
|
tim::vx::Quantization quant_output(tim::vx::QuantType::DYNAMIC_FIXED_POINT,
|
||||||
|
fl_output);
|
||||||
tim::vx::TensorSpec input_spec(tim::vx::DataType::INT16, input_shape,
|
tim::vx::TensorSpec input_spec(tim::vx::DataType::INT16, input_shape,
|
||||||
tim::vx::TensorAttribute::INPUT,
|
tim::vx::TensorAttribute::INPUT, quant_input);
|
||||||
quant_input);
|
|
||||||
tim::vx::TensorSpec weight_spec(tim::vx::DataType::INT16, weight_shape,
|
tim::vx::TensorSpec weight_spec(tim::vx::DataType::INT16, weight_shape,
|
||||||
tim::vx::TensorAttribute::CONSTANT,
|
tim::vx::TensorAttribute::CONSTANT,
|
||||||
quant_weight);
|
quant_weight);
|
||||||
tim::vx::TensorSpec bias_spec(tim::vx::DataType::INT64, bias_shape,
|
tim::vx::TensorSpec bias_spec(tim::vx::DataType::INT64, bias_shape,
|
||||||
tim::vx::TensorAttribute::CONSTANT,
|
tim::vx::TensorAttribute::CONSTANT, quant_bias);
|
||||||
quant_bias);
|
|
||||||
tim::vx::TensorSpec output_spec(tim::vx::DataType::INT16, output_shape,
|
tim::vx::TensorSpec output_spec(tim::vx::DataType::INT16, output_shape,
|
||||||
tim::vx::TensorAttribute::OUTPUT,
|
tim::vx::TensorAttribute::OUTPUT,
|
||||||
quant_output);
|
quant_output);
|
||||||
// Input data nchw
|
// Input data nchw
|
||||||
std::vector<float> input_data_float= {
|
std::vector<float> input_data_float = {
|
||||||
1, 1, 1, 1, // row = 1
|
1, 1, 1, 1, // row = 1
|
||||||
2, 2, 3, 2 // row = 2
|
2, 2, 3, 2 // row = 2
|
||||||
};
|
};
|
||||||
|
|
||||||
// weight data oihw
|
// weight data oihw
|
||||||
std::vector<float> weight_data_float= {
|
std::vector<float> weight_data_float = {
|
||||||
1, 2, 3, 4, //first 2x2 filter
|
1, 2, 3, 4, //first 2x2 filter
|
||||||
-1, 1, -1, 1, // second 2x2 filter
|
-1, 1, -1, 1, // second 2x2 filter
|
||||||
-1, -1, 1, 1, // third 2x2 filter
|
-1, -1, 1, 1, // third 2x2 filter
|
||||||
|
|
@ -1865,24 +1870,18 @@ TEST(Conv2d, shape_4_2_1_1_int16_DFPQuantizedTest) {
|
||||||
|
|
||||||
// nchw
|
// nchw
|
||||||
std::vector<float> golden = {// first channel
|
std::vector<float> golden = {// first channel
|
||||||
18, 22, 21, 8, 7, 9, 8, 3,
|
18, 22, 21, 8, 7, 9, 8, 3,
|
||||||
// second channel
|
// second channel
|
||||||
2, 3, 1, -1, 2, 3, 1, 0,
|
2, 3, 1, -1, 2, 3, 1, 0,
|
||||||
// third channel
|
// third channel
|
||||||
5, 6, 6, 4, -1, -2, -2, 1};
|
5, 6, 6, 4, -1, -2, -2, 1};
|
||||||
|
|
||||||
std::vector<int16_t> input_data = {
|
std::vector<int16_t> input_data = {512, 512, 512, 512,
|
||||||
512, 512, 512, 512,
|
1024, 1024, 1536, 1024};
|
||||||
1024,1024,1536,1024
|
std::vector<int16_t> weight_data = {256, 512, 768, 1024, -256, 256,
|
||||||
};
|
-256, 256, -256, -256, 256, 256};
|
||||||
std::vector<int16_t> weight_data = {
|
std::vector<int64_t> bias_data = {1 << fl_bias, 2 * (1 << fl_bias),
|
||||||
256,512,768,1024,
|
3 * (1 << fl_bias)};
|
||||||
-256,256,-256,256,
|
|
||||||
-256,-256,256,256
|
|
||||||
};
|
|
||||||
std::vector<int64_t> bias_data = {
|
|
||||||
1<<fl_bias, 2*(1<<fl_bias),3*(1<<fl_bias)
|
|
||||||
};
|
|
||||||
|
|
||||||
auto input_tensor = graph->CreateTensor(input_spec, input_data.data());
|
auto input_tensor = graph->CreateTensor(input_spec, input_data.data());
|
||||||
auto weight_tensor = graph->CreateTensor(weight_spec, weight_data.data());
|
auto weight_tensor = graph->CreateTensor(weight_spec, weight_data.data());
|
||||||
|
|
@ -1894,8 +1893,8 @@ TEST(Conv2d, shape_4_2_1_1_int16_DFPQuantizedTest) {
|
||||||
std::array<uint32_t, 2> stride({1, 1});
|
std::array<uint32_t, 2> stride({1, 1});
|
||||||
std::array<uint32_t, 2> dilation({0, 0});
|
std::array<uint32_t, 2> dilation({0, 0});
|
||||||
|
|
||||||
auto conv2d = graph->CreateOperation<tim::vx::ops::Conv2d>(
|
auto conv2d =
|
||||||
padding, stride, dilation);
|
graph->CreateOperation<tim::vx::ops::Conv2d>(padding, stride, dilation);
|
||||||
(*conv2d)
|
(*conv2d)
|
||||||
.BindInput(input_tensor)
|
.BindInput(input_tensor)
|
||||||
.BindInput(weight_tensor)
|
.BindInput(weight_tensor)
|
||||||
|
|
@ -1916,8 +1915,8 @@ TEST(Conv2d, shape_4_2_1_1_int16_DFPQuantizedTest) {
|
||||||
EXPECT_TRUE(output_tensor->CopyDataFromTensor(output.data()));
|
EXPECT_TRUE(output_tensor->CopyDataFromTensor(output.data()));
|
||||||
//transform output(int16) to fp
|
//transform output(int16) to fp
|
||||||
std::vector<float> f;
|
std::vector<float> f;
|
||||||
for(const auto& q : output){
|
for (const auto& q : output) {
|
||||||
f.push_back( q / (float)((int64_t)1 << fl_output));
|
f.push_back(q / (float)((int64_t)1 << fl_output));
|
||||||
}
|
}
|
||||||
EXPECT_EQ(golden, f);
|
EXPECT_EQ(golden, f);
|
||||||
}
|
}
|
||||||
|
|
@ -1926,7 +1925,7 @@ TEST(Conv2d, kernel_bigger_than_input_SAME) {
|
||||||
auto ctx = tim::vx::Context::Create();
|
auto ctx = tim::vx::Context::Create();
|
||||||
auto graph = ctx->CreateGraph();
|
auto graph = ctx->CreateGraph();
|
||||||
|
|
||||||
tim::vx::ShapeType input_shape({2, 3, 1, 1}); //whcn
|
tim::vx::ShapeType input_shape({2, 3, 1, 1}); //whcn
|
||||||
tim::vx::ShapeType kernel_shape({3, 2, 1, 1}); //whio
|
tim::vx::ShapeType kernel_shape({3, 2, 1, 1}); //whio
|
||||||
tim::vx::ShapeType bias_shape({1});
|
tim::vx::ShapeType bias_shape({1});
|
||||||
tim::vx::ShapeType output_shape({2, 3, 1, 1});
|
tim::vx::ShapeType output_shape({2, 3, 1, 1});
|
||||||
|
|
@ -1939,13 +1938,16 @@ TEST(Conv2d, kernel_bigger_than_input_SAME) {
|
||||||
tim::vx::TensorSpec output_spec(tim::vx::DataType::FLOAT32, output_shape,
|
tim::vx::TensorSpec output_spec(tim::vx::DataType::FLOAT32, output_shape,
|
||||||
tim::vx::TensorAttribute::OUTPUT);
|
tim::vx::TensorAttribute::OUTPUT);
|
||||||
|
|
||||||
std::vector<float> input_data = {1.0f, 3.0f, 4.0f, 2.0f, 2.0f, 3.0f,
|
std::vector<float> input_data = {
|
||||||
};
|
1.0f, 3.0f, 4.0f, 2.0f, 2.0f, 3.0f,
|
||||||
std::vector<float> weight = {100.0f, 20.0f, 1.0f, 200.0f, 10.0f, 2.0f,
|
};
|
||||||
};
|
std::vector<float> weight = {
|
||||||
|
100.0f, 20.0f, 1.0f, 200.0f, 10.0f, 2.0f,
|
||||||
|
};
|
||||||
std::vector<float> bias = {500.0f};
|
std::vector<float> bias = {500.0f};
|
||||||
std::vector<float> golden = {567.0f, 1480.0f, 608.0f, 1370.0f,
|
std::vector<float> golden = {
|
||||||
543.0f, 760.0f, };
|
567.0f, 1480.0f, 608.0f, 1370.0f, 543.0f, 760.0f,
|
||||||
|
};
|
||||||
auto input_tensor = graph->CreateTensor(input_spec);
|
auto input_tensor = graph->CreateTensor(input_spec);
|
||||||
auto weight_tensor = graph->CreateTensor(kernel_spec, weight.data());
|
auto weight_tensor = graph->CreateTensor(kernel_spec, weight.data());
|
||||||
auto bias_tensor = graph->CreateTensor(bias_spec, bias.data());
|
auto bias_tensor = graph->CreateTensor(bias_spec, bias.data());
|
||||||
|
|
@ -1956,7 +1958,9 @@ TEST(Conv2d, kernel_bigger_than_input_SAME) {
|
||||||
auto op = graph->CreateOperation<tim::vx::ops::Conv2d>(
|
auto op = graph->CreateOperation<tim::vx::ops::Conv2d>(
|
||||||
tim::vx::PadType::SAME, strides, dilations, 0, tim::vx::DataLayout::WHCN,
|
tim::vx::PadType::SAME, strides, dilations, 0, tim::vx::DataLayout::WHCN,
|
||||||
tim::vx::DataLayout::IcWHOc);
|
tim::vx::DataLayout::IcWHOc);
|
||||||
(*op).BindInputs({input_tensor, weight_tensor, bias_tensor}).BindOutputs({output_tensor});
|
(*op)
|
||||||
|
.BindInputs({input_tensor, weight_tensor, bias_tensor})
|
||||||
|
.BindOutputs({output_tensor});
|
||||||
|
|
||||||
EXPECT_TRUE(graph->Compile());
|
EXPECT_TRUE(graph->Compile());
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,184 @@
|
||||||
|
Release Notes {#changelog}
|
||||||
|
=============
|
||||||
|
|
||||||
|
1.12.0 release (2017-03-06):
|
||||||
|
----------------------------
|
||||||
|
|
||||||
|
- Changed behaviour of `half_cast` to perform conversions to/from `double`
|
||||||
|
and `long double` directly according to specified rounding mode, without an
|
||||||
|
intermediate `float` conversion.
|
||||||
|
- Added `noexcept` specifiers to constructors.
|
||||||
|
- Fixed minor portability problem with `logb` and `ilogb`.
|
||||||
|
- Tested for *VC++ 2015*.
|
||||||
|
|
||||||
|
|
||||||
|
1.11.0 release (2013-11-16):
|
||||||
|
----------------------------
|
||||||
|
|
||||||
|
- Made tie-breaking behaviour in round to nearest configurable by
|
||||||
|
`HALF_ROUND_TIES_TO_EVEN` macro.
|
||||||
|
- Completed support for all C++11 mathematical functions even if single-
|
||||||
|
precision versions from `<cmath>` are unsupported.
|
||||||
|
- Fixed inability to disable support for C++11 mathematical functions on
|
||||||
|
*VC++ 2013*.
|
||||||
|
|
||||||
|
|
||||||
|
1.10.0 release (2013-11-09):
|
||||||
|
----------------------------
|
||||||
|
|
||||||
|
- Made default rounding mode configurable by `HALF_ROUND_STYLE` macro.
|
||||||
|
- Added support for non-IEEE single-precision implementations.
|
||||||
|
- Added `HALF_ENABLE_CPP11_TYPE_TRAITS` preprocessor flag for checking
|
||||||
|
support for C++11 type traits and TMP features.
|
||||||
|
- Restricted `half_cast` to support built-in arithmetic types only.
|
||||||
|
- Changed behaviour of `half_cast` to respect rounding mode when casting
|
||||||
|
to/from integer types.
|
||||||
|
|
||||||
|
|
||||||
|
1.9.2 release (2013-11-01):
|
||||||
|
---------------------------
|
||||||
|
|
||||||
|
- Tested for *gcc 4.8*.
|
||||||
|
- Tested and fixed for *VC++ 2013*.
|
||||||
|
- Removed unnecessary warnings in *MSVC*.
|
||||||
|
|
||||||
|
|
||||||
|
1.9.1 release (2013-08-08):
|
||||||
|
---------------------------
|
||||||
|
|
||||||
|
- Fixed problems with older gcc and MSVC versions.
|
||||||
|
- Small fix to non-C++11 implementations of `remainder` and `remquo`.
|
||||||
|
|
||||||
|
|
||||||
|
1.9.0 release (2013-08-07):
|
||||||
|
---------------------------
|
||||||
|
|
||||||
|
- Changed behaviour of `nearbyint`, `rint`, `lrint` and `llrint` to use
|
||||||
|
rounding mode of half-precision implementation (which is
|
||||||
|
truncating/indeterminate) instead of single-precision rounding mode.
|
||||||
|
- Added support for more C++11 mathematical functions even if single-
|
||||||
|
precision versions from `<cmath>` are unsupported, in particular
|
||||||
|
`remainder`, `remquo` and `cbrt`.
|
||||||
|
- Minor implementation changes.
|
||||||
|
|
||||||
|
|
||||||
|
1.8.1 release (2013-01-22):
|
||||||
|
---------------------------
|
||||||
|
|
||||||
|
- Fixed bug resulting in multiple definitions of the `nanh` function due to
|
||||||
|
a missing `inline` specification.
|
||||||
|
|
||||||
|
|
||||||
|
1.8.0 release (2013-01-19):
|
||||||
|
---------------------------
|
||||||
|
|
||||||
|
- Added support for more C++11 mathematical functions even if single-
|
||||||
|
precision versions from `<cmath>` are unsupported, in particular
|
||||||
|
exponential and logarithm functions, hyperbolic area functions and the
|
||||||
|
hypotenuse function.
|
||||||
|
- Made `fma` function use default implementation if single-precision version
|
||||||
|
from `<cmath>` is not faster and thus `FP_FAST_FMAH` to be defined always.
|
||||||
|
- Fixed overload resolution issues when invoking certain mathematical
|
||||||
|
functions by unqualified calls.
|
||||||
|
|
||||||
|
|
||||||
|
1.7.0 release (2012-10-26):
|
||||||
|
---------------------------
|
||||||
|
|
||||||
|
- Added support for C++11 `noexcept` specifiers.
|
||||||
|
- Changed C++11 `long long` to be supported on *VC++ 2003* and up.
|
||||||
|
|
||||||
|
|
||||||
|
1.6.1 release (2012-09-13):
|
||||||
|
---------------------------
|
||||||
|
|
||||||
|
- Made `fma` and `fdim` functions available even if corresponding
|
||||||
|
single-precision functions are not.
|
||||||
|
|
||||||
|
|
||||||
|
1.6.0 release (2012-09-12):
|
||||||
|
---------------------------
|
||||||
|
|
||||||
|
- Added `HALF_ENABLE_CPP11_LONG_LONG` to control support for `long long`
|
||||||
|
integers and corresponding mathematical functions.
|
||||||
|
- Fixed C++98 compatibility on non-VC compilers.
|
||||||
|
|
||||||
|
|
||||||
|
1.5.1 release (2012-08-17):
|
||||||
|
---------------------------
|
||||||
|
|
||||||
|
- Recorrected `std::numeric_limits::round_style` to always return
|
||||||
|
`std::round_indeterminate`, due to overflow-handling deviating from
|
||||||
|
correct round-toward-zero behaviour.
|
||||||
|
|
||||||
|
|
||||||
|
1.5.0 release (2012-08-16):
|
||||||
|
---------------------------
|
||||||
|
|
||||||
|
- Added `half_cast` for explicitly casting between half and any type
|
||||||
|
convertible to/from `float` and allowing the explicit specification of
|
||||||
|
the rounding mode to use.
|
||||||
|
|
||||||
|
|
||||||
|
1.4.0 release (2012-08-12):
|
||||||
|
---------------------------
|
||||||
|
|
||||||
|
- Added support for C++11 generalized constant expressions (`constexpr`).
|
||||||
|
|
||||||
|
|
||||||
|
1.3.1 release (2012-08-11):
|
||||||
|
---------------------------
|
||||||
|
|
||||||
|
- Fixed requirement for `std::signbit` and `std::isnan` (even if C++11
|
||||||
|
`<cmath>` functions disabled) on non-VC compilers.
|
||||||
|
|
||||||
|
|
||||||
|
1.3.0 release (2012-08-10):
|
||||||
|
---------------------------
|
||||||
|
|
||||||
|
- Made requirement for `<cstdint>` and `static_assert` optional and thus
|
||||||
|
made the library C++98-compatible.
|
||||||
|
- Made support for C++11 features user-overridable through explicit
|
||||||
|
definition of corresponding preprocessor symbols to either 0 or 1.
|
||||||
|
- Renamed `HALF_ENABLE_HASH` to `HALF_ENABLE_CPP11_HASH` in correspondence
|
||||||
|
with other C++11 preprocessor symbols.
|
||||||
|
|
||||||
|
|
||||||
|
1.2.0 release (2012-08-07):
|
||||||
|
---------------------------
|
||||||
|
|
||||||
|
- Added proper preprocessor definitions for `HUGE_VALH` and `FP_FAST_FMAH`
|
||||||
|
in correspondence with their single-precision counterparts from `<cmath>`.
|
||||||
|
- Fixed internal preprocessor macros to be properly undefined after use.
|
||||||
|
|
||||||
|
|
||||||
|
1.1.2 release (2012-08-07):
|
||||||
|
---------------------------
|
||||||
|
|
||||||
|
- Revised `std::numeric_limits::round_style` to return
|
||||||
|
`std::round_toward_zero` if the `float` version also does and
|
||||||
|
`std::round_indeterminate` otherwise.
|
||||||
|
- Fixed `std::numeric_limits::round_error` to reflect worst-case round
|
||||||
|
toward zero behaviour.
|
||||||
|
|
||||||
|
|
||||||
|
1.1.1 release (2012-08-06):
|
||||||
|
---------------------------
|
||||||
|
|
||||||
|
- Fixed `std::numeric_limits::min` to return smallest positive normal
|
||||||
|
number, instead of subnormal number.
|
||||||
|
- Fixed `std::numeric_limits::round_style` to return
|
||||||
|
`std::round_indeterminate` due to mixture of separately rounded
|
||||||
|
single-precision arithmetics with truncating single-to-half conversions.
|
||||||
|
|
||||||
|
|
||||||
|
1.1.0 release (2012-08-06):
|
||||||
|
---------------------------
|
||||||
|
|
||||||
|
- Added half-precision literals.
|
||||||
|
|
||||||
|
|
||||||
|
1.0.0 release (2012-08-05):
|
||||||
|
---------------------------
|
||||||
|
|
||||||
|
- First release.
|
||||||
|
|
@ -0,0 +1,4 @@
|
||||||
|
#
|
||||||
|
# Copyright (c) 2012-2017 Christian Rau
|
||||||
|
# SPDX-License-Identifier: MIT
|
||||||
|
#
|
||||||
|
|
@ -0,0 +1,7 @@
|
||||||
|
PackageName: half
|
||||||
|
SPDXID: SPDXRef-half
|
||||||
|
FilesAnalyzed: true
|
||||||
|
PackageLicenseConcluded: MIT
|
||||||
|
PackageLicenseInfoFromFiles: MIT
|
||||||
|
PackageLicenseDeclared: MIT
|
||||||
|
PackageCopyrightText:<text>Copyright (c) 2012-2017 Christian Rau <rauy@users.sourceforge.net></text>
|
||||||
|
|
@ -0,0 +1,21 @@
|
||||||
|
The MIT License
|
||||||
|
|
||||||
|
Copyright (c) 2012-2017 Christian Rau
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in
|
||||||
|
all copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||||
|
THE SOFTWARE.
|
||||||
|
|
@ -0,0 +1,288 @@
|
||||||
|
HALF-PRECISION FLOATING POINT LIBRARY (Version 1.12.0)
|
||||||
|
------------------------------------------------------
|
||||||
|
|
||||||
|
This is a C++ header-only library to provide an IEEE 754 conformant 16-bit
|
||||||
|
half-precision floating point type along with corresponding arithmetic
|
||||||
|
operators, type conversions and common mathematical functions. It aims for both
|
||||||
|
efficiency and ease of use, trying to accurately mimic the behaviour of the
|
||||||
|
builtin floating point types at the best performance possible.
|
||||||
|
|
||||||
|
|
||||||
|
INSTALLATION AND REQUIREMENTS
|
||||||
|
-----------------------------
|
||||||
|
|
||||||
|
Comfortably enough, the library consists of just a single header file
|
||||||
|
containing all the functionality, which can be directly included by your
|
||||||
|
projects, without the necessity to build anything or link to anything.
|
||||||
|
|
||||||
|
Whereas this library is fully C++98-compatible, it can profit from certain
|
||||||
|
C++11 features. Support for those features is checked automatically at compile
|
||||||
|
(or rather preprocessing) time, but can be explicitly enabled or disabled by
|
||||||
|
defining the corresponding preprocessor symbols to either 1 or 0 yourself. This
|
||||||
|
is useful when the automatic detection fails (for more exotic implementations)
|
||||||
|
or when a feature should be explicitly disabled:
|
||||||
|
|
||||||
|
- 'long long' integer type for mathematical functions returning 'long long'
|
||||||
|
results (enabled for VC++ 2003 and newer, gcc and clang, overridable with
|
||||||
|
'HALF_ENABLE_CPP11_LONG_LONG').
|
||||||
|
|
||||||
|
- Static assertions for extended compile-time checks (enabled for VC++ 2010,
|
||||||
|
gcc 4.3, clang 2.9 and newer, overridable with 'HALF_ENABLE_CPP11_STATIC_ASSERT').
|
||||||
|
|
||||||
|
- Generalized constant expressions (enabled for VC++ 2015, gcc 4.6, clang 3.1
|
||||||
|
and newer, overridable with 'HALF_ENABLE_CPP11_CONSTEXPR').
|
||||||
|
|
||||||
|
- noexcept exception specifications (enabled for VC++ 2015, gcc 4.6, clang 3.0
|
||||||
|
and newer, overridable with 'HALF_ENABLE_CPP11_NOEXCEPT').
|
||||||
|
|
||||||
|
- User-defined literals for half-precision literals to work (enabled for
|
||||||
|
VC++ 2015, gcc 4.7, clang 3.1 and newer, overridable with
|
||||||
|
'HALF_ENABLE_CPP11_USER_LITERALS').
|
||||||
|
|
||||||
|
- Type traits and template meta-programming features from <type_traits>
|
||||||
|
(enabled for VC++ 2010, libstdc++ 4.3, libc++ and newer, overridable with
|
||||||
|
'HALF_ENABLE_CPP11_TYPE_TRAITS').
|
||||||
|
|
||||||
|
- Special integer types from <cstdint> (enabled for VC++ 2010, libstdc++ 4.3,
|
||||||
|
libc++ and newer, overridable with 'HALF_ENABLE_CPP11_CSTDINT').
|
||||||
|
|
||||||
|
- Certain C++11 single-precision mathematical functions from <cmath> for
|
||||||
|
an improved implementation of their half-precision counterparts to work
|
||||||
|
(enabled for VC++ 2013, libstdc++ 4.3, libc++ and newer, overridable with
|
||||||
|
'HALF_ENABLE_CPP11_CMATH').
|
||||||
|
|
||||||
|
- Hash functor 'std::hash' from <functional> (enabled for VC++ 2010,
|
||||||
|
libstdc++ 4.3, libc++ and newer, overridable with 'HALF_ENABLE_CPP11_HASH').
|
||||||
|
|
||||||
|
The library has been tested successfully with Visual C++ 2005-2015, gcc 4.4-4.8
|
||||||
|
and clang 3.1. Please contact me if you have any problems, suggestions or even
|
||||||
|
just success testing it on other platforms.
|
||||||
|
|
||||||
|
|
||||||
|
DOCUMENTATION
|
||||||
|
-------------
|
||||||
|
|
||||||
|
Here follow some general words about the usage of the library and its
|
||||||
|
implementation. For a complete documentation of its interface look at the
|
||||||
|
corresponding website http://half.sourceforge.net. You may also generate the
|
||||||
|
complete developer documentation from the library's only include file's doxygen
|
||||||
|
comments, but this is more relevant to developers rather than mere users (for
|
||||||
|
reasons described below).
|
||||||
|
|
||||||
|
BASIC USAGE
|
||||||
|
|
||||||
|
To make use of the library just include its only header file half.hpp, which
|
||||||
|
defines all half-precision functionality inside the 'half_float' namespace. The
|
||||||
|
actual 16-bit half-precision data type is represented by the 'half' type. This
|
||||||
|
type behaves like the builtin floating point types as much as possible,
|
||||||
|
supporting the usual arithmetic, comparison and streaming operators, which
|
||||||
|
makes its use pretty straight-forward:
|
||||||
|
|
||||||
|
using half_float::half;
|
||||||
|
half a(3.4), b(5);
|
||||||
|
half c = a * b;
|
||||||
|
c += 3;
|
||||||
|
if(c > a)
|
||||||
|
std::cout << c << std::endl;
|
||||||
|
|
||||||
|
Additionally the 'half_float' namespace also defines half-precision versions
|
||||||
|
for all mathematical functions of the C++ standard library, which can be used
|
||||||
|
directly through ADL:
|
||||||
|
|
||||||
|
half a(-3.14159);
|
||||||
|
half s = sin(abs(a));
|
||||||
|
long l = lround(s);
|
||||||
|
|
||||||
|
You may also specify explicit half-precision literals, since the library
|
||||||
|
provides a user-defined literal inside the 'half_float::literal' namespace,
|
||||||
|
which you just need to import (assuming support for C++11 user-defined literals):
|
||||||
|
|
||||||
|
using namespace half_float::literal;
|
||||||
|
half x = 1.0_h;
|
||||||
|
|
||||||
|
Furthermore the library provides proper specializations for
|
||||||
|
'std::numeric_limits', defining various implementation properties, and
|
||||||
|
'std::hash' for hashing half-precision numbers (assuming support for C++11
|
||||||
|
'std::hash'). Similar to the corresponding preprocessor symbols from <cmath>
|
||||||
|
the library also defines the 'HUGE_VALH' constant and maybe the 'FP_FAST_FMAH'
|
||||||
|
symbol.
|
||||||
|
|
||||||
|
CONVERSIONS AND ROUNDING
|
||||||
|
|
||||||
|
The half is explicitly constructible/convertible from a single-precision float
|
||||||
|
argument. Thus it is also explicitly constructible/convertible from any type
|
||||||
|
implicitly convertible to float, but constructing it from types like double or
|
||||||
|
int will involve the usual warnings arising when implicitly converting those to
|
||||||
|
float because of the lost precision. On the one hand those warnings are
|
||||||
|
intentional, because converting those types to half necessarily also reduces
|
||||||
|
precision. But on the other hand they are raised for explicit conversions from
|
||||||
|
those types, when the user knows what he is doing. So if those warnings keep
|
||||||
|
bugging you, then you won't get around first explicitly converting to float
|
||||||
|
before converting to half, or use the 'half_cast' described below. In addition
|
||||||
|
you can also directly assign float values to halfs.
|
||||||
|
|
||||||
|
In contrast to the float-to-half conversion, which reduces precision, the
|
||||||
|
conversion from half to float (and thus to any other type implicitly
|
||||||
|
convertible from float) is implicit, because all values representable with
|
||||||
|
half-precision are also representable with single-precision. This way the
|
||||||
|
half-to-float conversion behaves similar to the builtin float-to-double
|
||||||
|
conversion and all arithmetic expressions involving both half-precision and
|
||||||
|
single-precision arguments will be of single-precision type. This way you can
|
||||||
|
also directly use the mathematical functions of the C++ standard library,
|
||||||
|
though in this case you will invoke the single-precision versions which will
|
||||||
|
also return single-precision values, which is (even if maybe performing the
|
||||||
|
exact same computation, see below) not as conceptually clean when working in a
|
||||||
|
half-precision environment.
|
||||||
|
|
||||||
|
The default rounding mode for conversions from float to half uses truncation
|
||||||
|
(round toward zero, but mapping overflows to infinity) for rounding values not
|
||||||
|
representable exactly in half-precision. This is the fastest rounding possible
|
||||||
|
and is usually sufficient. But by redefining the 'HALF_ROUND_STYLE'
|
||||||
|
preprocessor symbol (before including half.hpp) this default can be overridden
|
||||||
|
with one of the other standard rounding modes using their respective constants
|
||||||
|
or the equivalent values of 'std::float_round_style' (it can even be
|
||||||
|
synchronized with the underlying single-precision implementation by defining it
|
||||||
|
to 'std::numeric_limits<float>::round_style'):
|
||||||
|
|
||||||
|
- 'std::round_indeterminate' or -1 for the fastest rounding (default).
|
||||||
|
|
||||||
|
- 'std::round_toward_zero' or 0 for rounding toward zero.
|
||||||
|
|
||||||
|
- 'std::round_to_nearest' or 1 for rounding to the nearest value.
|
||||||
|
|
||||||
|
- 'std::round_toward_infinity' or 2 for rounding toward positive infinity.
|
||||||
|
|
||||||
|
- 'std::round_toward_neg_infinity' or 3 for rounding toward negative infinity.
|
||||||
|
|
||||||
|
In addition to changing the overall default rounding mode one can also use the
|
||||||
|
'half_cast'. This converts between half and any built-in arithmetic type using
|
||||||
|
a configurable rounding mode (or the default rounding mode if none is
|
||||||
|
specified). In addition to a configurable rounding mode, 'half_cast' has
|
||||||
|
another big difference to a mere 'static_cast': Any conversions are performed
|
||||||
|
directly using the given rounding mode, without any intermediate conversion
|
||||||
|
to/from 'float'. This is especially relevant for conversions to integer types,
|
||||||
|
which don't necessarily truncate anymore. But also for conversions from
|
||||||
|
'double' or 'long double' this may produce more precise results than a
|
||||||
|
pre-conversion to 'float' using the single-precision implementation's current
|
||||||
|
rounding mode would.
|
||||||
|
|
||||||
|
half a = half_cast<half>(4.2);
|
||||||
|
half b = half_cast<half,std::numeric_limits<float>::round_style>(4.2f);
|
||||||
|
assert( half_cast<int, std::round_to_nearest>( 0.7_h ) == 1 );
|
||||||
|
assert( half_cast<half,std::round_toward_zero>( 4097 ) == 4096.0_h );
|
||||||
|
assert( half_cast<half,std::round_toward_infinity>( 4097 ) == 4100.0_h );
|
||||||
|
assert( half_cast<half,std::round_toward_infinity>( std::numeric_limits<double>::min() ) > 0.0_h );
|
||||||
|
|
||||||
|
When using round to nearest (either as default or through 'half_cast') ties are
|
||||||
|
by default resolved by rounding them away from zero (and thus equal to the
|
||||||
|
behaviour of the 'round' function). But by redefining the
|
||||||
|
'HALF_ROUND_TIES_TO_EVEN' preprocessor symbol to 1 (before including half.hpp)
|
||||||
|
this default can be changed to the slightly slower but less biased and more
|
||||||
|
IEEE-conformant behaviour of rounding half-way cases to the nearest even value.
|
||||||
|
|
||||||
|
#define HALF_ROUND_TIES_TO_EVEN 1
|
||||||
|
#include <half.hpp>
|
||||||
|
...
|
||||||
|
assert( half_cast<int,std::round_to_nearest>(3.5_h)
|
||||||
|
== half_cast<int,std::round_to_nearest>(4.5_h) );
|
||||||
|
|
||||||
|
IMPLEMENTATION
|
||||||
|
|
||||||
|
For performance reasons (and ease of implementation) many of the mathematical
|
||||||
|
functions provided by the library as well as all arithmetic operations are
|
||||||
|
actually carried out in single-precision under the hood, calling to the C++
|
||||||
|
standard library implementations of those functions whenever appropriate,
|
||||||
|
meaning the arguments are converted to floats and the result back to half. But
|
||||||
|
to reduce the conversion overhead as much as possible any temporary values
|
||||||
|
inside of lengthy expressions are kept in single-precision as long as possible,
|
||||||
|
while still maintaining a strong half-precision type to the outside world. Only
|
||||||
|
when finally assigning the value to a half or calling a function that works
|
||||||
|
directly on halfs is the actual conversion done (or never, when further
|
||||||
|
converting the result to float).
|
||||||
|
|
||||||
|
This approach has two implications. First of all you have to treat the
|
||||||
|
library's documentation at http://half.sourceforge.net as a simplified version,
|
||||||
|
describing the behaviour of the library as if implemented this way. The actual
|
||||||
|
argument and return types of functions and operators may involve other internal
|
||||||
|
types (feel free to generate the exact developer documentation from the Doxygen
|
||||||
|
comments in the library's header file if you really need to). But nevertheless
|
||||||
|
the behaviour is exactly like specified in the documentation. The other
|
||||||
|
implication is, that in the presence of rounding errors or over-/underflows
|
||||||
|
arithmetic expressions may produce different results when compared to
|
||||||
|
converting to half-precision after each individual operation:
|
||||||
|
|
||||||
|
half a = std::numeric_limits<half>::max() * 2.0_h / 2.0_h; // a = MAX
|
||||||
|
half b = half(std::numeric_limits<half>::max() * 2.0_h) / 2.0_h; // b = INF
|
||||||
|
assert( a != b );
|
||||||
|
|
||||||
|
But this should only be a problem in very few cases. One last word has to be
|
||||||
|
said when talking about performance. Even with its efforts in reducing
|
||||||
|
conversion overhead as much as possible, the software half-precision
|
||||||
|
implementation can most probably not beat the direct use of single-precision
|
||||||
|
computations. Usually using actual float values for all computations and
|
||||||
|
temporaries and using halfs only for storage is the recommended way. On the
|
||||||
|
one hand this somehow makes the provided mathematical functions obsolete
|
||||||
|
(especially in light of the implicit conversion from half to float), but
|
||||||
|
nevertheless the goal of this library was to provide a complete and
|
||||||
|
conceptually clean half-precision implementation, to which the standard
|
||||||
|
mathematical functions belong, even if usually not needed.
|
||||||
|
|
||||||
|
IEEE CONFORMANCE
|
||||||
|
|
||||||
|
The half type uses the standard IEEE representation with 1 sign bit, 5 exponent
|
||||||
|
bits and 10 mantissa bits (11 when counting the hidden bit). It supports all
|
||||||
|
types of special values, like subnormal values, infinity and NaNs. But there
|
||||||
|
are some limitations to the complete conformance to the IEEE 754 standard:
|
||||||
|
|
||||||
|
- The implementation does not differentiate between signalling and quiet
|
||||||
|
NaNs, this means operations on halfs are not specified to trap on
|
||||||
|
signalling NaNs (though they may, see last point).
|
||||||
|
|
||||||
|
- Though arithmetic operations are internally rounded to single-precision
|
||||||
|
using the underlying single-precision implementation's current rounding
|
||||||
|
mode, those values are then converted to half-precision using the default
|
||||||
|
half-precision rounding mode (changed by defining 'HALF_ROUND_STYLE'
|
||||||
|
accordingly). This mixture of rounding modes is also the reason why
|
||||||
|
'std::numeric_limits<half>::round_style' may actually return
|
||||||
|
'std::round_indeterminate' when half- and single-precision rounding modes
|
||||||
|
don't match.
|
||||||
|
|
||||||
|
- Because of internal truncation it may also be that certain single-precision
|
||||||
|
NaNs will be wrongly converted to half-precision infinity, though this is
|
||||||
|
very unlikely to happen, since most single-precision implementations don't
|
||||||
|
tend to only set the lowest bits of a NaN mantissa.
|
||||||
|
|
||||||
|
- The implementation does not provide any floating point exceptions, thus
|
||||||
|
arithmetic operations or mathematical functions are not specified to invoke
|
||||||
|
proper floating point exceptions. But due to many functions implemented in
|
||||||
|
single-precision, those may still invoke floating point exceptions of the
|
||||||
|
underlying single-precision implementation.
|
||||||
|
|
||||||
|
Some of those points could have been circumvented by controlling the floating
|
||||||
|
point environment using <cfenv> or implementing a similar exception mechanism.
|
||||||
|
But this would have required excessive runtime checks giving too high an impact
|
||||||
|
on performance for something that is rarely ever needed. If you really need to
|
||||||
|
rely on proper floating point exceptions, it is recommended to explicitly
|
||||||
|
perform computations using the built-in floating point types to be on the safe
|
||||||
|
side. In the same way, if you really need to rely on a particular rounding
|
||||||
|
behaviour, it is recommended to either use single-precision computations and
|
||||||
|
explicitly convert the result to half-precision using 'half_cast' and
|
||||||
|
specifying the desired rounding mode, or synchronize the default half-precision
|
||||||
|
rounding mode to the rounding mode of the single-precision implementation (most
|
||||||
|
likely 'HALF_ROUND_STYLE=1', 'HALF_ROUND_TIES_TO_EVEN=1'). But this is really
|
||||||
|
considered an expert-scenario that should be used only when necessary, since
|
||||||
|
actually working with half-precision usually comes with a certain
|
||||||
|
tolerance/ignorance of exactness considerations and proper rounding comes with
|
||||||
|
a certain performance cost.
|
||||||
|
|
||||||
|
|
||||||
|
CREDITS AND CONTACT
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
This library is developed by CHRISTIAN RAU and released under the MIT License
|
||||||
|
(see LICENSE.txt). If you have any questions or problems with it, feel free to
|
||||||
|
contact me at rauy@users.sourceforge.net.
|
||||||
|
|
||||||
|
Additional credit goes to JEROEN VAN DER ZIJP for his paper on "Fast Half Float
|
||||||
|
Conversions", whose algorithms have been used in the library for converting
|
||||||
|
between half-precision and single-precision values.
|
||||||
|
|
@ -0,0 +1,4 @@
|
||||||
|
#
|
||||||
|
# Copyright (c) 2012-2017 Christian Rau
|
||||||
|
# SPDX-License-Identifier: MIT
|
||||||
|
#
|
||||||
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue